In [None]:
# Spark could not be installed for the local PyCharm setup, so this notebook
# runs on Google Colab instead (as suggested in the lecture).
from google.colab import drive

# Mount Google Drive into the Colab filesystem at this path.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Install specific Java and Spark for Python.
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version
!pip install pyspark

Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 https://cloud.r-project.org/bin/linux/ubuntu bi

In [None]:
# DataFrames are created through a SparkSession, which is built at the end of this cell.
# The pyspark libraries used by the notebook are imported here.
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, PCA, OneHotEncoder, StandardScaler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Final-Question-3")
    .getOrCreate()
)

In [None]:
# Load the auto-mpg dataset from the mounted Google Drive.
mpg_df = spark.read.csv("/content/drive/MyDrive/auto-mpg.data.txt", header=True, inferSchema=True, sep=",")

# Data Preprocessing Part
print("Total number of rows in the data: ",mpg_df.count())
print("Total number of columns in the data: ",len(mpg_df.columns))

# The free-text name column is not used as a feature, so drop it.
mpg_df = mpg_df.drop("name")

# horsepower is read as a string, so cast it to float.
# Values that cannot be parsed as a number become null after the cast.
mpg_df = mpg_df.withColumn("horsepower", mpg_df['horsepower'].cast("float"))

mpg_df.printSchema()

# Check for unexpected categories in origin before one-hot encoding it (only 1, 2 and 3 are expected).
print("Total number of nonsense value in origin column:", mpg_df.filter((mpg_df["origin"]!=1) & (mpg_df["origin"]!=2) & (mpg_df["origin"]!=3)).count())

# summary() reports the non-null count and basic statistics for every column.
# A count below the total number of rows indicates missing values in that column
# (missing values are counted explicitly in the loop below).
mpg_df.summary().show()

# Count the null values in each column.
for i, column in enumerate(mpg_df.columns):
  print(f"For the {i+1} column, the total numbe of NAN values: {mpg_df.where(mpg_df[column].isNull()).count()}")

# Drop every row that contains a null in any column and report the row count before and after.
rows_before_drop = mpg_df.count()
print(rows_before_drop) # 398 in the recorded run
mpg_df = mpg_df.na.drop(how="any")
rows_after_drop = mpg_df.count()
print(rows_after_drop) # 392 in the recorded run
# Six rows contained a null (most likely unparseable horsepower entries that became
# null in the cast above). That is a small share of the data, so the rows are simply
# removed instead of being imputed.

# origin is the only categorical column; all other columns are treated as numeric.
# OneHotEncoder expects category indices, so origin is first mapped to indices with StringIndexer.
strIndexer = StringIndexer(inputCol='origin', outputCol='origin_indexed')
mpg_df = strIndexer.fit(mpg_df).transform(mpg_df)

# dropLast=False keeps one slot per origin category. With the default (dropLast=True)
# the last category is encoded as an all-zero vector, which yields only two slots for
# the three origins.
encoder = OneHotEncoder(inputCol='origin_indexed', outputCol='origin_encoded', dropLast=False)
encoderModel = encoder.fit(mpg_df)
mpg_df = encoderModel.transform(mpg_df)
mpg_df.show()

Total number of rows in the data:  398
Total number of columns in the data:  9
root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: float (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- model_year: integer (nullable = true)
 |-- origin: integer (nullable = true)

Total number of nonsense value in origin column: 0
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+
|summary|               mpg|         cylinders|      displacement|        horsepower|           weight|      acceleration|        model_year|            origin|
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+
|  count|               398|               398|       

In [None]:
# Rename the target to "label", the default label column name of Spark ML estimators and evaluators.
mpg_df = mpg_df.withColumnRenamed("mpg", "label")
# origin and origin_indexed are redundant now that origin_encoded exists.
mpg_df = mpg_df.drop("origin", "origin_indexed")
mpg_df.show()

# Assemble every non-label column into a single feature vector.
# The columns are selected by name so the result does not depend on the label being the first column.
feature_cols = [c for c in mpg_df.columns if c != "label"]
vec = VectorAssembler(inputCols=feature_cols, outputCol='features_old')
mpg_df = vec.transform(mpg_df).select('features_old', 'label')
mpg_df.show(truncate=False)

# There are only about 400 samples, which is a small set. Since k-fold cross validation
# is planned, the data is split 0.8 / 0.2.
# The split happens BEFORE the scaler and PCA are fitted so that no statistics of the
# test rows leak into the features the model is trained on.
seed = 12345
train_assembled, test_assembled = mpg_df.randomSplit([0.8, 0.2], seed=seed)

# Standardise the features first: PCA is driven by variance, so without scaling the
# column with the largest numeric range (weight) would dominate the principal components.
scaler = StandardScaler(inputCol="features_old", outputCol="features_scaled", withMean=True, withStd=True)
scalerModel = scaler.fit(train_assembled)

# Project onto the first featureNum principal components (8 is required by the question).
# The assembled vector holds the six numeric columns plus the one-hot origin vector; the
# width of that one-hot vector depends on how OneHotEncoder was configured upstream.
# It would be better to examine the explained variance before fixing k.
featureNum = 8
pca = PCA(inputCol="features_scaled", outputCol="features", k=featureNum)
pcaModel = pca.fit(scalerModel.transform(train_assembled))


def to_pca_features(df):
    """Apply the train-fitted scaler and PCA to df and keep only 'features' and 'label'."""
    return pcaModel.transform(scalerModel.transform(df)).select('features', 'label')


trainDF = to_pca_features(train_assembled)
testDF = to_pca_features(test_assembled)
# mpg_df keeps its previous final shape ('features', 'label') for any later cell that uses it.
mpg_df = to_pca_features(mpg_df)
mpg_df.show(truncate=False)

# RMSE evaluator comparing the 'prediction' column with the true 'label'.
# NOTE: the name shadows the builtin eval(); it is kept because later cells call eval.evaluate(...).
eval = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")

+-----+---------+------------+----------+------+------------+----------+--------------+
|label|cylinders|displacement|horsepower|weight|acceleration|model_year|origin_encoded|
+-----+---------+------------+----------+------+------------+----------+--------------+
| 18.0|        8|       307.0|     130.0|  3504|        12.0|        70| (2,[0],[1.0])|
| 15.0|        8|       350.0|     165.0|  3693|        11.5|        70| (2,[0],[1.0])|
| 18.0|        8|       318.0|     150.0|  3436|        11.0|        70| (2,[0],[1.0])|
| 16.0|        8|       304.0|     150.0|  3433|        12.0|        70| (2,[0],[1.0])|
| 17.0|        8|       302.0|     140.0|  3449|        10.5|        70| (2,[0],[1.0])|
| 15.0|        8|       429.0|     198.0|  4341|        10.0|        70| (2,[0],[1.0])|
| 14.0|        8|       454.0|     220.0|  4354|         9.0|        70| (2,[0],[1.0])|
| 14.0|        8|       440.0|     215.0|  4312|         8.5|        70| (2,[0],[1.0])|
| 14.0|        8|       455.0|  

In [None]:
# Baseline model: a decision-tree regressor reading the (default) 'features' and 'label' columns.
dtc = DecisionTreeRegressor(featuresCol="features", labelCol="label")

# Train on the training split and print the learned tree.
model = dtc.fit(trainDF)
print('we have found the function !!!!')
print(model.toDebugString)

# Predict on the held-out split and display the first five rows.
predictions = model.transform(testDF)
predictions.select("prediction", "label", "features").show(5)

# Score the predictions with the evaluator configured in the previous cell.
rmse = eval.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

we have found the function !!!!
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_52c9638da79f, depth=5, numNodes=59, numFeatures=8
  If (feature 0 <= -3028.4154293397532)
   If (feature 0 <= -3962.3302177023998)
    If (feature 3 <= -79.03174373144685)
     If (feature 1 <= 150.55370302183292)
      If (feature 0 <= -4357.471315307324)
       Predict: 16.5
      Else (feature 0 > -4357.471315307324)
       Predict: 15.857142857142858
     Else (feature 1 > 150.55370302183292)
      If (feature 0 <= -4357.471315307324)
       Predict: 16.9
      Else (feature 0 > -4357.471315307324)
       Predict: 17.5
    Else (feature 3 > -79.03174373144685)
     If (feature 2 <= -61.305549796403014)
      If (feature 0 <= -4677.36854739191)
       Predict: 9.0
      Else (feature 0 > -4677.36854739191)
       Predict: 10.5
     Else (feature 2 > -61.305549796403014)
      If (feature 5 <= 2.944982027666857)
       Predict: 13.209677419354838
      Else (feature 5 > 2.944982027666857)
       Pr