# Regression Model Tuning (Cross-Validation)

In [1]:
# findspark.init() is called before the first pyspark import (next cell) --
# presumably so the local Spark installation is discoverable; confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *

# Build (or reuse) the local Spark session for this notebook.
#
# * The session is bound to `spark`, the name the data-loading cell reads from
#   (`spark.read...`). It used to be bound only to `pyspark`, so a fresh
#   "Restart & Run All" failed with a NameError.
# * The executor memory key is spelled "spark.executor.memory"; the previous
#   "spark.executer.memory" is not a Spark property, so that setting had no effect.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("RegressionModelTuning")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Backward-compatible alias for any cell that still refers to the old name.
# `from pyspark... import ...` statements are unaffected by this binding.
pyspark = spark

sc = spark.sparkContext

In [3]:
# Load the life-expectancy CSV: first line is the header, column types are
# inferred by Spark, fields are comma separated.
csv_reader = spark.read.format("csv").options(
    header="True",
    inferSchema="True",
    sep=",",
)
life_df = csv_reader.load("data/LifeExpectancyData.csv")

In [4]:
# Preview the first 5 rows. The limit is applied in Spark so only those rows are
# collected to the driver, instead of converting the whole table to pandas first.
life_df.limit(5).toPandas()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [5]:
# Print the inferred schema (column name, type, nullability) to check what inferSchema produced.
life_df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life expectancy : double (nullable = true)
 |-- Adult Mortality: integer (nullable = true)
 |-- infant deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage expenditure: double (nullable = true)
 |-- Hepatitis B: integer (nullable = true)
 |-- Measles : integer (nullable = true)
 |--  BMI : double (nullable = true)
 |-- under-five deaths : integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total expenditure: double (nullable = true)
 |-- Diphtheria : integer (nullable = true)
 |--  HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |--  thinness  1-19 years: double (nullable = true)
 |--  thinness 5-9 years: double (nullable = true)
 |-- Income composition of resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [6]:
# Replacement column names, in the same positional order as the CSV schema.
# The target column ("Life expectancy ") is renamed to "label".
new_column_names = [
    "Country",
    "Year",
    "Status",
    "label",
    "AdultMortality",
    "InfantDeaths",
    "Alcohol",
    "PercentageExpenditure",
    "HepatitisB",
    "Measles",
    "BMI",
    "UnderFiveDeaths",
    "Polio",
    "TotalExpenditure",
    "Diphtheria",
    "HIV_AIDS",
    "GDP",
    "Population",
    "Thinness1-19years",
    "Thinness5-9years",
    "IncomeCompositionOfResources",
    "Schooling",
]

In [7]:
# Rename every column positionally to the names in new_column_names
# (the raw CSV headers carry stray spaces, e.g. "Life expectancy " and " BMI ").
life_df = life_df.toDF(*new_column_names)

In [8]:
# Preview the renamed columns. The limit is applied in Spark so only 5 rows are
# collected to the driver, instead of converting the whole table to pandas first.
life_df.limit(5).toPandas()

Unnamed: 0,Country,Year,Status,label,AdultMortality,InfantDeaths,Alcohol,PercentageExpenditure,HepatitisB,Measles,...,Polio,TotalExpenditure,Diphtheria,HIV_AIDS,GDP,Population,Thinness1-19years,Thinness5-9years,IncomeCompositionOfResources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


#### [Interpretation]: The non-null counts reported by `describe()` (section 1.3 below) differ between columns (2744, 2928, 2938, ...). Therefore we can say that some columns contain null values.

# 1. Data Understanding

### 1.1. Checking of NULL values

In [9]:
# Report, for every column of life_df, whether it contains at least one null.
#
# All null counts are computed in a single Spark aggregation
# (count(when(isNull)) per column) instead of one filter+count job per column.
null_counts = life_df.select(
    [count(when(col(column).isNull(), 1)).alias(column) for column in life_df.columns]
).first()

# count_for_null is the 1-based position printed in front of each column name.
for count_for_null, column in enumerate(life_df.columns, start=1):
    if null_counts[column] > 0:
        print(count_for_null, ".", column, "--> \033[1;91m there has null values \033[0m")
    else:
        print(count_for_null, ".",column,"-->\033[1;92m is clean \033[0m")

1 . Country -->[1;92m is clean [0m
2 . Year -->[1;92m is clean [0m
3 . Status -->[1;92m is clean [0m
4 . label --> [1;91m there has null values [0m
5 . AdultMortality --> [1;91m there has null values [0m
6 . InfantDeaths -->[1;92m is clean [0m
7 . Alcohol --> [1;91m there has null values [0m
8 . PercentageExpenditure -->[1;92m is clean [0m
9 . HepatitisB --> [1;91m there has null values [0m
10 . Measles -->[1;92m is clean [0m
11 . BMI --> [1;91m there has null values [0m
12 . UnderFiveDeaths -->[1;92m is clean [0m
13 . Polio --> [1;91m there has null values [0m
14 . TotalExpenditure --> [1;91m there has null values [0m
15 . Diphtheria --> [1;91m there has null values [0m
16 . HIV_AIDS -->[1;92m is clean [0m
17 . GDP --> [1;91m there has null values [0m
18 . Population --> [1;91m there has null values [0m
19 . Thinness1-19years --> [1;91m there has null values [0m
20 . Thinness5-9years --> [1;91m there has null values [0m
21 . IncomeCompositionOfR

### 1.2. Checking for weakly represented categories in the categorical variables

In [10]:
# Countries with the fewest rows. The 15-row limit is applied in Spark after the
# sort, so only those rows are collected instead of the full grouped result.
(
    life_df.groupBy(["Country"])
    .agg({"*": "count"})
    .sort(col("count(1)").asc())
    .limit(15)
    .toPandas()
)

Unnamed: 0,Country,count(1)
0,Palau,1
1,Dominica,1
2,San Marino,1
3,Marshall Islands,1
4,Tuvalu,1
5,Monaco,1
6,Nauru,1
7,Saint Kitts and Nevis,1
8,Cook Islands,1
9,Niue,1


### 1.3. Describing of dataset

In [11]:
# Summary statistics (count, mean, stddev, min, max) per column, shown as a pandas frame.
life_df.describe().toPandas().head()

Unnamed: 0,summary,Country,Year,Status,label,AdultMortality,InfantDeaths,Alcohol,PercentageExpenditure,HepatitisB,...,Polio,TotalExpenditure,Diphtheria,HIV_AIDS,GDP,Population,Thinness1-19years,Thinness5-9years,IncomeCompositionOfResources,Schooling
0,count,2938,2938.0,2938,2928.0,2928.0,2938.0,2744.0,2938.0,2385.0,...,2919.0,2712.0,2919.0,2938.0,2490.0,2286.0,2904.0,2904.0,2771.0,2775.0
1,mean,,2007.5187202178352,,69.22493169398912,164.79644808743168,30.303948264125257,4.6028607871720375,738.2512954533823,80.94046121593291,...,82.55018842069202,5.938189528023592,82.32408359027065,1.742103471749494,7483.158469138481,12753375.120052498,4.8397038567493205,4.870316804407711,0.6275510645976166,11.992792792792786
2,stddev,,4.613840940258099,,9.523867487824305,124.2920790034219,117.92650131339906,4.052412658755658,1987.914858016194,25.070015593018063,...,23.42804594946848,2.498319672155633,23.7169120685726,5.077784531086547,14270.16934151596,61012096.50842794,4.420194947144322,4.508882086983007,0.2109035551515931,3.358919721102356
3,min,Afghanistan,2000.0,Developed,36.3,1.0,0.0,0.01,0.0,1.0,...,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0
4,max,Zimbabwe,2015.0,Developing,89.0,723.0,1800.0,17.87,19479.91161,99.0,...,99.0,17.6,99.0,50.6,119172.7418,1293859294.0,27.7,28.6,0.948,20.7


# 2. Data Cleaning

### 2.1. Handling of the missing values
#### All numeric null values are filled using mean imputation (the means are computed from the rows that have no missing values)

In [12]:
# Percentage of non-null vs. null values per column, printed in ascending order
# of the non-null rate.
#
# The Spark frame is converted to pandas once (it was collected twice before),
# and the row count is taken with len() rather than isna().count().
life_pdf = life_df.toPandas()
percentage = (life_pdf.count() / len(life_pdf)) * 100
print("Percentage of \033[1mNON-NULL and NULL values\033[0m in each columns\n")
print("(NON-NULL rate %,     NULL rate %,    Column_Name)\n")

# Column names come from the frame itself so they always line up with `percentage`.
zipped = zip(percentage, 100 - percentage, life_pdf.columns)
sorted_percentage = sorted(zipped)
for rates in sorted_percentage:
    print(rates)

Percentage of [1mNON-NULL and NULL values[0m in each columns

(NON-NULL rate %,     NULL rate %,    Column_Name)

(77.80803267528931, 22.19196732471069, 'Population')
(81.17767188563649, 18.822328114363515, 'HepatitisB')
(84.75153165418652, 15.248468345813478, 'GDP')
(92.3076923076923, 7.692307692307693, 'TotalExpenditure')
(93.39686861810755, 6.603131381892453, 'Alcohol')
(94.31586113002042, 5.684138869979577, 'IncomeCompositionOfResources')
(94.45200816882233, 5.547991831177669, 'Schooling')
(98.8427501701838, 1.1572498298161946, 'BMI')
(98.8427501701838, 1.1572498298161946, 'Thinness1-19years')
(98.8427501701838, 1.1572498298161946, 'Thinness5-9years')
(99.35330156569094, 0.6466984343090587, 'Diphtheria')
(99.35330156569094, 0.6466984343090587, 'Polio')
(99.65963240299523, 0.3403675970047715, 'AdultMortality')
(99.65963240299523, 0.3403675970047715, 'label')
(100.0, 0.0, 'Country')
(100.0, 0.0, 'HIV_AIDS')
(100.0, 0.0, 'InfantDeaths')
(100.0, 0.0, 'Measles')
(100.0, 0.0, 'Percenta

### Remove string and year columns for calculating mean and filling null values

In [13]:
# Keep only the numeric columns and only the rows without any missing value;
# this complete-case frame is the basis for the column means used for imputation.
non_numeric_columns = ["Country", "Status", "Year"]
removed_df = life_df.drop(*non_numeric_columns).dropna()
removed_df.count()

1649

### Fill NULL values with the column means calculated from removed_df

In [14]:
# Mean-impute every numeric column of life_df with the column mean taken from
# removed_df (the rows that had no missing values).
#
# All means are computed in one Spark aggregation and applied with one
# dict-based na.fill, instead of one aggregation job and one fill per column.
#
# NOTE(review): "label" (the regression target) is imputed as well, and the
# imputation runs before the train/test split -- confirm this is intended.
numeric_columns = [
    x for x in life_df.columns if x not in ("Country", "Status", "Year")
]
mean_by_column = removed_df.agg(
    *[mean(x).alias(x) for x in numeric_columns]
).first().asDict()

print("(Column ---> Average value)\n")
for x in numeric_columns:
    mean_value = mean_by_column[x]
    print(x,"---> ", mean_value)

# A dict fill replaces nulls per column with that column's own mean.
imputed_df = life_df.na.fill(mean_by_column)

(Column ---> Average value)

label --->  69.30230442692543
AdultMortality --->  168.2152819890843
InfantDeaths --->  32.55306246209824
Alcohol --->  4.533195876288676
PercentageExpenditure --->  698.973558049698
HepatitisB --->  79.21770770163735
Measles --->  2224.4942389326866
BMI --->  38.1286234081261
UnderFiveDeaths --->  44.22013341419042
Polio --->  83.56458459672528
TotalExpenditure --->  5.955924802910845
Diphtheria --->  84.155245603396
HIV_AIDS --->  1.9838690115221154
GDP --->  5566.031886817592
Population --->  14653625.889484541
Thinness1-19years --->  4.850636749545183
Thinness5-9years --->  4.907762280169795
IncomeCompositionOfResources --->  0.6315512431776829
Schooling --->  12.119890842935117


In [15]:
# Summary statistics after imputation; compare the "count" row with the one from life_df.describe().
imputed_df.describe().toPandas().head()

Unnamed: 0,summary,Country,Year,Status,label,AdultMortality,InfantDeaths,Alcohol,PercentageExpenditure,HepatitisB,...,Polio,TotalExpenditure,Diphtheria,HIV_AIDS,GDP,Population,Thinness1-19years,Thinness5-9years,IncomeCompositionOfResources,Schooling
0,count,2938,2938.0,2938,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,...,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0
1,mean,,2007.5187202178352,,69.22519504570099,164.80735194009532,30.303948264125257,4.598260721579346,738.2512954533823,80.57522123893806,...,82.55309734513274,5.939553779937997,82.3349217154527,1.742103471749494,7190.826029084047,13175078.14989256,4.839830377632588,4.87075014211225,0.6277784403031543,11.99984418223232
2,stddev,,4.613840940258099,,9.507641133372625,124.08044190934694,117.92650131339906,3.916325972492945,1987.914858016194,22.59959138679089,...,23.35217064037839,2.4002785013518584,23.6404548509705,5.077784531086547,13154.872365914543,53821257.23701614,4.394535567499202,4.482709506353584,0.2048218354893411,3.2645109829668746
3,min,Afghanistan,2000.0,Developed,36.3,1.0,0.0,0.01,0.0,1.0,...,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0
4,max,Zimbabwe,2015.0,Developing,89.0,723.0,1800.0,17.87,19479.91161,99.0,...,99.0,17.6,99.0,50.6,119172.7418,1293859294.0,27.7,28.6,0.948,20.7


### Checking for NULL values (are there still any null values?)

In [16]:
# Report, for every column of imputed_df, whether it still contains a null.
#
# All null counts are computed in a single Spark aggregation
# (count(when(isNull)) per column) instead of one filter+count job per column.
null_counts = imputed_df.select(
    [count(when(col(column).isNull(), 1)).alias(column) for column in imputed_df.columns]
).first()

# count_for_null is the 1-based position printed in front of each column name.
for count_for_null, column in enumerate(imputed_df.columns, start=1):
    if null_counts[column] > 0:
        print(count_for_null, ".", column, "--> \033[1;91m there has null values \033[0m")
    else:
        print(count_for_null, ".",column,"-->\033[1;92m is clean \033[0m")

1 . Country -->[1;92m is clean [0m
2 . Year -->[1;92m is clean [0m
3 . Status -->[1;92m is clean [0m
4 . label -->[1;92m is clean [0m
5 . AdultMortality -->[1;92m is clean [0m
6 . InfantDeaths -->[1;92m is clean [0m
7 . Alcohol -->[1;92m is clean [0m
8 . PercentageExpenditure -->[1;92m is clean [0m
9 . HepatitisB -->[1;92m is clean [0m
10 . Measles -->[1;92m is clean [0m
11 . BMI -->[1;92m is clean [0m
12 . UnderFiveDeaths -->[1;92m is clean [0m
13 . Polio -->[1;92m is clean [0m
14 . TotalExpenditure -->[1;92m is clean [0m
15 . Diphtheria -->[1;92m is clean [0m
16 . HIV_AIDS -->[1;92m is clean [0m
17 . GDP -->[1;92m is clean [0m
18 . Population -->[1;92m is clean [0m
19 . Thinness1-19years -->[1;92m is clean [0m
20 . Thinness5-9years -->[1;92m is clean [0m
21 . IncomeCompositionOfResources -->[1;92m is clean [0m
22 . Schooling -->[1;92m is clean [0m


# 3. Data Preparation

### 3.1. StringIndexer (Transforming categorical to numeric)

In [17]:
from pyspark.ml.feature import StringIndexer

In [18]:
# Maps the string column "Status" to a numeric index column "Status_Index".
status_index = StringIndexer(inputCol="Status", outputCol="Status_Index")

### 3.2. OneHotEncoderEstimator (One-hot encoding of the indexed categorical values)

In [19]:
from pyspark.ml.feature import OneHotEncoderEstimator

In [20]:
# One-hot encodes "Status_Index" (produced by status_index) into "Status_Encoded".
encoder = OneHotEncoderEstimator(
    inputCols=["Status_Index"],
    outputCols=["Status_Encoded"],
)

### 3.3. VectorAssembler (Vectorizing attributes)

In [21]:
# Columns fed to the VectorAssembler.
#
# Earlier full candidate list (kept for reference, not used):
#   before_modelling = ["Year", "AdultMortality", "InfantDeaths", "Alcohol",
#                       "PercentageExpenditure", "HepatitisB", "Measles", "BMI",
#                       "UnderFiveDeaths", "Polio", "TotalExpenditure", "Diphtheria",
#                       "HIV_AIDS", "GDP", "Population", "Thinness1-19years",
#                       "Thinness5-9years", "IncomeCompositionOfResources",
#                       "Schooling", "Status_Encoded"]
# Compared with that list, the selection below leaves out: Year,
# PercentageExpenditure, TotalExpenditure, Population and Thinness5-9years.
attributes = [
    "AdultMortality",
    "InfantDeaths",
    "Alcohol",
    "HepatitisB",
    "Measles",
    "BMI",
    "UnderFiveDeaths",
    "Polio",
    "Diphtheria",
    "HIV_AIDS",
    "GDP",
    "Thinness1-19years",
    "IncomeCompositionOfResources",
    "Schooling",
    "Status_Encoded",
]

In [22]:
from pyspark.ml.feature import VectorAssembler

In [23]:
# Packs the selected attribute columns into one vector column, "vectorized_features".
vector_assember = VectorAssembler(
    inputCols=attributes,
    outputCol="vectorized_features",
)

### 3.4. Normalization (Standardization using StandardScaler)

In [24]:
from pyspark.ml.feature import StandardScaler

In [25]:
# Scales "vectorized_features" into a new column named "features".
# NOTE(review): in the cells visible here this scaler is never added to the
# Pipeline stages, and the regression reads "vectorized_features" directly --
# confirm whether it is meant to be used.
scaler = StandardScaler(
    inputCol="vectorized_features",
    outputCol="features",
)

### 3.5. Splitting dataset into Train-Test

In [26]:
# 80/20 train/test split of the imputed data, with a fixed seed.
split_weights = [0.8, 0.2]
train_df, test_df = imputed_df.randomSplit(split_weights, seed=142)

#### cache() keeps a dataset in memory when it has to be read many times, so it does not need to be re-read from disk every time.

In [27]:
# Mark both splits as cached; they are read repeatedly by the cross-validation below.
# The DataFrame returned by the last call is what the cell displays.
train_df.cache()
test_df.cache()

DataFrame[Country: string, Year: int, Status: string, label: double, AdultMortality: int, InfantDeaths: int, Alcohol: double, PercentageExpenditure: double, HepatitisB: int, Measles: int, BMI: double, UnderFiveDeaths: int, Polio: int, TotalExpenditure: double, Diphtheria: int, HIV_AIDS: double, GDP: double, Population: double, Thinness1-19years: double, Thinness5-9years: double, IncomeCompositionOfResources: double, Schooling: double]

### 3.6. Defining the Linear Regression Model

In [28]:
from pyspark.ml.regression import LinearRegression

# Linear regression on the assembled feature vector, predicting "label".
linear_regression = LinearRegression(
    featuresCol="vectorized_features",
    labelCol="label",
)

### 3.7. Using of Pipeline

In [29]:
from pyspark.ml.pipeline import Pipeline

In [30]:
# Stage order: index Status -> one-hot encode -> assemble features -> regress.
pipeline_stages = [
    status_index,
    encoder,
    vector_assember,
    linear_regression,
]
pipeline_object = Pipeline(stages=pipeline_stages)

# 4. Model Tuning and Training
An important task in Machine Learning is model selection, or using data to find the best model or parameters for a given task which is called model tuning.

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

### paramGrid

In [32]:
# Hyper-parameter grid for the linear regression stage.
#
# Two parameters were removed from the grid because they do not change the
# fitted model here and only multiplied the number of fits by 4 (192 -> 48):
#   * epsilon          - only used by the huber loss; linear_regression keeps
#                        the default squaredError loss.
#   * aggregationDepth - a treeAggregate performance knob, not a model parameter.
# Both stay at their defaults (1.35 and 2).
param_grid = (
    ParamGridBuilder()
    .addGrid(linear_regression.elasticNetParam, [0.0, 0.03])
    .addGrid(linear_regression.maxIter, [10, 25])
    .addGrid(linear_regression.regParam, [0.0, 0.02])
    .addGrid(linear_regression.solver, ["auto", "normal", "l-bfgs"])
    .addGrid(linear_regression.tol, [1.0E-6, 1.0E-4])
    .build()
)

### Cross-Validation
CrossValidator begins by splitting the dataset into a set of folds which are used as separate training and test datasets. E.g., with k=3 folds, CrossValidator will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. 

In [33]:
# 3-fold cross-validation over param_grid, fitting 2 models in parallel.
#
# * seed: fixes the fold assignment so that "Restart & Run All" selects the
#   same best model every time (no seed was set before). 142 matches the seed
#   used for the train/test split.
# * The evaluator settings are spelled out; they are the RegressionEvaluator
#   defaults (RMSE between "label" and "prediction").
cv = CrossValidator(
    estimator=pipeline_object,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="rmse",
    ),
    numFolds=3,
    parallelism=2,
    seed=142,
)

### Training of train set
It takes about 5-6 minutes (note: depending on laptop performance)

In [34]:
# Fit the cross-validator on the training split only; every parameter map in
# param_grid is fitted on each fold, which makes this the slow cell of the notebook.
cv_model = cv.fit(train_df)

### Testing of original and predicted values

In [35]:
# Show actual vs. predicted values for a few held-out rows. The limit is applied
# in Spark so only 5 rows are collected instead of all test predictions.
cv_model.transform(test_df).select("label", "prediction").limit(5).toPandas()

Unnamed: 0,label,prediction
0,56.2,61.575387
1,57.3,58.737147
2,58.1,60.198686
3,58.6,60.401875
4,72.6,73.357768


# 5. The Best Model Evaluation
Here we evaluate the best model and compare its values with the previous prediction (14th example)

In [36]:
# The model selected by cross-validation (a fitted pipeline, see type() below).
best_model = cv_model.bestModel

In [37]:
# Sanity check: the best model is the whole fitted pipeline, not just the regression stage.
type(best_model)

pyspark.ml.pipeline.PipelineModel

In [38]:
# The last pipeline stage is the fitted linear regression (the pipeline ends with linear_regression).
lr_model = best_model.stages[-1]

In [39]:
# Fitted coefficients; presumably in the order of the VectorAssembler inputs (`attributes`) -- TODO confirm.
lr_model.coefficients

DenseVector([-0.0206, 0.0948, 0.0753, -0.0158, -0.0, 0.0404, -0.0708, 0.0306, 0.0449, -0.497, 0.0, -0.0817, 5.0373, 0.6306, -1.7942])

In [40]:
# Fitted intercept of the selected linear regression model.
lr_model.intercept

57.58539600422563

In [41]:
# R^2 from the model's own summary.
# NOTE(review): `summary` presumably reports metrics on the data the model was
# fitted on, not on test_df -- confirm before quoting this as a test score.
lr_model.summary.r2

0.8220615768517484

In [42]:
# RMSE from the model's own summary.
# NOTE(review): `summary` presumably reports metrics on the data the model was
# fitted on, not on test_df -- confirm before quoting this as a test score.
lr_model.summary.rootMeanSquaredError

4.010559781735385

In [43]:
# List every parameter of the selected model (one entry per line) with its default and current value.
lr_model.explainParams().split("\n")

['aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2, current: 2)',
 'elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0, current: 0.0)',
 'epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. (default: 1.35, current: 1.35)',
 'featuresCol: features column name (default: features, current: vectorized_features)',
 'fitIntercept: whether to fit an intercept term (default: True)',
 'labelCol: label column name (default: label, current: label)',
 'loss: The loss function to be optimized. Supported options: squaredError, huber. (Default squaredError) (default: squaredError)',
 'maxIter: maximum number of iterations (>= 0) (default: 100, current: 25)',
 'predictionCol: prediction column name (default: prediction)',
 'regParam: regularization parameter (>= 0) (default: 0.0, current: 0.0)',
 'solver: The solver algorithm for optimization.