# Regression and Classification with Pyspark ML

## Linear Regression and Random Forest/GBT Classification with Pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField,StringType,IntegerType,StructType, DoubleType, FloatType
from pyspark.sql.functions import *

# Explicit schema for the panel CSV. Each (name, type) pair below becomes a
# nullable StructField, in the order listed.
data_schema = [
    StructField(column, spark_type, True)
    for column, spark_type in (
        ("_c0", IntegerType()),
        ("province", StringType()),
        ("specific", DoubleType()),
        ("general", DoubleType()),
        ("year", IntegerType()),
        ("gdp", FloatType()),
        ("fdi", FloatType()),
        ("rnr", DoubleType()),
        ("rr", FloatType()),
        ("i", FloatType()),
        ("fr", IntegerType()),
        ("reg", StringType()),
        ("it", IntegerType()),
    )
]

final_struc = StructType(fields=data_schema)

# Reuse the active Spark session when the runtime already provides one,
# otherwise create it, so `spark` is always bound before the read below.
spark = SparkSession.builder.getOrCreate()

# Read the CSV with the explicit schema instead of inferring types; the
# first line of the file is treated as a header row.
file_location = "/FileStore/tables/df_panel_fix.csv"
df = (
    spark.read.format("CSV")
    .schema(final_struc)
    .option("header", True)
    .load(file_location)
)

#df.printSchema()

# Preview the loaded rows.
df.show()

In [3]:
# Number of rows recorded for each province.
province_counts = df.groupBy('province').count()
province_counts.show()

## Imputation of mean values to prepare the data

In [4]:
# Replace nulls in `general` with the mean of that column.
general_mean = df.select(mean(df['general'])).first()[0]
df = df.na.fill(general_mean, ["general"])

In [5]:
# Replace nulls in `specific` with the mean of that column.
specific_mean = df.select(mean(df['specific'])).first()[0]
df = df.na.fill(specific_mean, ["specific"])

In [6]:
# Replace nulls in `rr` with the mean of that column.
rr_mean = df.select(mean(df['rr'])).first()[0]
df = df.na.fill(rr_mean, ["rr"])

In [7]:
# Replace nulls in `fr` with the mean of that column.
# NOTE(review): `fr` is declared IntegerType in the schema while the mean is
# generally fractional -- confirm how the fill value is cast for this column.
fr_mean = df.select(mean(df['fr'])).first()[0]
df = df.na.fill(fr_mean, ["fr"])

In [8]:
# Replace nulls in `rnr` with the mean of that column.
rnr_mean = df.select(mean(df['rnr'])).first()[0]
df = df.na.fill(rnr_mean, ["rnr"])

In [9]:
# Replace nulls in `i` with the mean of that column.
i_mean = df.select(mean(df['i'])).first()[0]
df = df.na.fill(i_mean, ["i"])

## Creating binary target feature from extant column for classification 

In [10]:
from pyspark.sql.functions import *
# Binary target: 1 when `specific` is at or above the cutoff, otherwise 0.
# NOTE(review): the cutoff is a hard-coded constant, presumably a statistic
# computed from `specific` beforehand -- confirm where it comes from.
specific_cutoff = 583470.7303370787
at_or_above_cutoff = col('specific') >= specific_cutoff
df = df.withColumn('specific_classification', when(at_or_above_cutoff, 1).otherwise(0))

## Using StringIndexer for categorical encoding of string type columns

In [11]:
from pyspark.ml.feature import StringIndexer

In [12]:
# Encode the string column `province` as a numeric `provinceIndex` column.
province_indexer = StringIndexer(inputCol="province", outputCol="provinceIndex")
province_indexer_model = province_indexer.fit(df)
df = province_indexer_model.transform(df)

In [13]:
# Encode the string column `reg` as a numeric `regionIndex` column.
region_indexer = StringIndexer(inputCol="reg", outputCol="regionIndex")
region_indexer_model = region_indexer.fit(df)
df = region_indexer_model.transform(df)

In [14]:
# Preview the DataFrame, which now carries the provinceIndex and regionIndex columns.
df.show()

## Using VectorAssembler to prepare features for machine learning

In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [16]:
# Display the current column names (shown as the cell's output in a notebook).
df.columns

In [17]:
# Columns packed into the single "features" vector used by the models.
# Left out on purpose: 'specific' (it is the regression label), plus
# 'rnr', 'rr', 'i' and 'fr', which the author had commented out
# (reason not recorded).
feature_columns = [
    'provinceIndex',
    'general',
    'year',
    'gdp',
    'fdi',
    'regionIndex',
    'it',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [18]:
# Add the assembled "features" vector column to the DataFrame.
output = assembler.transform(df)

In [19]:
# Keep only what the regression needs: the feature vector and the `specific` label.
final_data = output.select("features", "specific")

## Splitting data into train and test

In [20]:
# 70/30 train/test split for the regression. The seed is fixed so that
# re-running the notebook on the same input yields the same split and
# therefore comparable metrics; without it every run samples differently.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

## Regression with Pyspark ML

In [21]:
from pyspark.ml.regression import LinearRegression
# Linear regression predicting `specific` from the assembled feature vector.
# featuresCol is spelled out here; "features" is also the estimator's default.
lr = LinearRegression(featuresCol='features', labelCol='specific')

## Fitting the linear regression model to the training data

In [22]:
# Fit the linear regression on the training split.
lrModel = lr.fit(train_data)

## Coefficients and Intercept of the linear regression model

In [23]:
# Report the fitted model's per-feature coefficients and its intercept.
fitted_coefficients = lrModel.coefficients
fitted_intercept = lrModel.intercept
print("Coefficients: {} Intercept: {}".format(fitted_coefficients, fitted_intercept))

## Evaluating trained linear regression model on the test data

In [24]:
# Evaluate the fitted model on the test split; the returned summary exposes
# the error metrics (rootMeanSquaredError, meanSquaredError, r2) printed below.
test_results = lrModel.evaluate(test_data)

## Metrics of trained linear regression model on the test data (RMSE, MSE, R2)

In [25]:
# Print each test-set metric on its own line, in the order RMSE, MSE, R2.
for metric_template, metric_value in (
    ("RMSE: {}", test_results.rootMeanSquaredError),
    ("MSE: {}", test_results.meanSquaredError),
    ("R2: {}", test_results.r2),
):
    print(metric_template.format(metric_value))

## Looking at correlations with corr

In [26]:
from pyspark.sql.functions import corr

In [27]:
# Correlation between the regression target `specific` and `gdp`.
specific_gdp_corr = df.select(corr('specific', 'gdp'))
specific_gdp_corr.show()

## Classification with Pyspark ML

In [28]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

## DecisionTreeClassifier, RandomForestClassifier and GBTClassifier

In [29]:
# All three classifiers read the same feature vector and predict the same
# binary label, so the column settings are shared.
classifier_columns = {
    'labelCol': 'specific_classification',
    'featuresCol': 'features',
}
dtc = DecisionTreeClassifier(**classifier_columns)
rfc = RandomForestClassifier(**classifier_columns)
gbt = GBTClassifier(**classifier_columns)

## Selecting features and binary target

In [30]:
# Keep the feature vector together with the binary label. This rebinds
# final_data / train_data / test_data, replacing the regression split.
final_data = output.select("features", "specific_classification")

# 70/30 train/test split. The seed is fixed so that re-running the notebook
# on the same input yields the same split and comparable accuracy figures;
# without it every run samples differently.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

## Fitting the Classifiers to the Training Data

In [31]:
# Train each classifier on the training split (random forest, then GBT,
# then decision tree -- the generator is consumed in that order).
rfc_model, gbt_model, dtc_model = (
    estimator.fit(train_data) for estimator in (rfc, gbt, dtc)
)

## Classifier predictions on test data

In [32]:
# Apply each fitted classifier to the test split to obtain its predictions.
dtc_predictions, rfc_predictions, gbt_predictions = (
    fitted_model.transform(test_data)
    for fitted_model in (dtc_model, rfc_model, gbt_model)
)

## Evaluating Classifiers using pyspark.ml.evaluation and MulticlassClassificationEvaluator

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Classifier Accuracy

In [34]:
# Accuracy scorer shared by all classifiers: compares the "prediction"
# column against the "specific_classification" label column.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="specific_classification",
    predictionCol="prediction",
    metricName="accuracy",
)

## Classifier Accuracy Metrics

In [35]:
# Accuracy of each classifier's predictions on the test split.
dtc_acc, rfc_acc, gbt_acc = (
    acc_evaluator.evaluate(predictions)
    for predictions in (dtc_predictions, rfc_predictions, gbt_predictions)
)

In [36]:
# Accuracy report: one line per classifier (as a percentage with two
# decimals), framed above and below by 80-character rules.
rule = '-'*80
print(rule)
for report_template, accuracy in (
    ('Decision tree accuracy: {0:2.2f}%', dtc_acc),
    ('Random forest ensemble accuracy: {0:2.2f}%', rfc_acc),
    ('GBT accuracy: {0:2.2f}%', gbt_acc),
):
    print(report_template.format(accuracy*100))
    print(rule)

## Classification Correlation with Corr

In [37]:
# Correlation between the binary label and `fdi`.
label_fdi_corr = df.select(corr('specific_classification', 'fdi'))
label_fdi_corr.show()

In [38]:
# Correlation between the binary label and `gdp`.
label_gdp_corr = df.select(corr('specific_classification', 'gdp'))
label_gdp_corr.show()

## Footnotes

This post includes code adapted from the [Spark and Python for Big Data Udemy course](https://udemy.com/course/spark-and-python-for-big-data-with-pyspark) and the [Spark and Python for Big Data notebooks](https://github.com/SuperJohn/spark-and-python-for-big-data-with-pyspark).

