# Santander Customer Satisfaction
## Project using Pyspark 

In [1]:
import findspark
findspark.init()

## Imports of frameworks and functions 

In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col
from pyspark.sql.types import NumericType,IntegerType
import numpy as np 

In [3]:
#Creating a SparkContex - It is an entry point to Spark and Pyspark before create a SparkSession ,
# SparkContext is the first step to use RDD and connect to Spark Cluster

sc = SparkContext(appName= "Santander")

In [4]:
#SparkSession is a entry point to Spark to underlying Spark functionality in order to programmatically
#Create RDD , DataFrame and Datasets
sps = SparkSession.builder.master("local").getOrCreate()

In [5]:
#Loading the dataset in a RDD format
rdd = sc.textFile('dados/train.csv')

In [7]:
rdd.take(5)

['ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var41_ult1,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,imp_op_var39_ult1,imp_sal_var16_ult1,ind_var1_0,ind_var1,ind_var2_0,ind_var2,ind_var5_0,ind_var5,ind_var6_0,ind_var6,ind_var8_0,ind_var8,ind_var12_0,ind_var12,ind_var13_0,ind_var13_corto_0,ind_var13_corto,ind_var13_largo_0,ind_var13_largo,ind_var13_medio_0,ind_var13_medio,ind_var13,ind_var14_0,ind_var14,ind_var17_0,ind_var17,ind_var18_0,ind_var18,ind_var19,ind_var20_0,ind_var20,ind_var24_0,ind_var24,ind_var25_cte,ind_var26_0,ind_var26_cte,ind_var26,ind_var25_0,ind_var25,ind_var27_0,ind_var28_0,ind_var28,ind_var27,ind_var29_0,ind_var29,ind_var30_0,ind_var30,ind_var31_0,ind_var31,ind_var32_cte,ind_var32_0,ind_var32,ind_var33_0,ind_var3

In [8]:
# Let´s split the datas by comma in this way We can manipulate the data easily.
rdd = rdd.map(lambda x: x.split(","))

In [9]:
schema = rdd.first()

In [10]:
# Selecting the header to deal just with numbers and to be possible create a dataframe with the names of columns.
rdd_1 = rdd.filter(lambda x: schema !=x)

In [11]:
#Creating a dataframe Spark
data_frame = sps.createDataFrame(rdd_1,schema)

In [12]:
data_frame.show()

+---+----+-----+------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------+-----------------------+-----------------------+-----------------+------------------+----------+--------+----------+--------+----------+--------+----------+--------+----------+--------+-----------+---------+-----------+-----------------+---------------+-----------------+---------------+-----------------+---------------+---------+-----------+---------+-----------+---------+-----------+---------+---------+-----------+---------+-----------+---------+-------------+-----------+-------------+---------+-----------+---------+-----------+-----------+---------+---------+-----------+---------+-----------+---------+-----------+---------+-------------+-----------+---------+-----------+--------

In [13]:
# Transforming to Pandas 
data_frame.toPandas().head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195,195,0,0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,117310.979016494,0


In [14]:
# Analising these  datas we can note that there are a lot columns , then we need analise that, some columns just have one value
# This type of columns is not usefull for apply machine learnig and we can delete these columns.
unique_variable = data_frame.toPandas().nunique()

In [15]:
columns_unique  = tuple(unique_variable[unique_variable==1].index)

In [16]:
columns_unique

('ind_var2_0',
 'ind_var2',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var41',
 'ind_var46_0',
 'ind_var46',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var41',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var41',
 'saldo_var46',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb_var13_hace3',
 'imp_reemb_var33_hace3',
 'imp_trasp_var17_out_hace3',
 'imp_trasp_var33_out_hace3',
 'num_var2_0_ult1',
 'num_var2_ult1',
 'num_reemb_var13_hace3',
 'num_reemb_var33_hace3',
 'num_trasp_var17_out_hace3',
 'num_trasp_var33_out_hace3',
 'saldo_var2_ult1',
 'saldo_medio_var13_medio_hace3')

In [17]:
# Deleting the columns with unique values

df = data_frame.drop(*columns_unique)

In [18]:
df.toPandas().shape

(76020, 337)

In [19]:
# For work with mllib(Machine Learning in Pyspark) we need transform our RDD or DataFrame in two objects one is the label(Target)
# other is a Vector dense with the variables that are usefull to build the prediction.
# Let´s build a fuction to help us transform the datas.

def var(row):
    obj = (row['TARGET'],Vectors.dense(row[1:336]))
    return obj

In [20]:
# Transforming dataframe in RDD to manipule easily.

rdd_2 = df.rdd

In [21]:
rdd_2.take(5)

[Row(ID='1', var3='2', var15='23', imp_ent_var16_ult1='0', imp_op_var39_comer_ult1='0', imp_op_var39_comer_ult3='0', imp_op_var40_comer_ult1='0', imp_op_var40_comer_ult3='0', imp_op_var40_efect_ult1='0', imp_op_var40_efect_ult3='0', imp_op_var40_ult1='0', imp_op_var41_comer_ult1='0', imp_op_var41_comer_ult3='0', imp_op_var41_efect_ult1='0', imp_op_var41_efect_ult3='0', imp_op_var41_ult1='0', imp_op_var39_efect_ult1='0', imp_op_var39_efect_ult3='0', imp_op_var39_ult1='0', imp_sal_var16_ult1='0', ind_var1_0='0', ind_var1='0', ind_var5_0='1', ind_var5='0', ind_var6_0='0', ind_var6='0', ind_var8_0='0', ind_var8='0', ind_var12_0='0', ind_var12='0', ind_var13_0='0', ind_var13_corto_0='0', ind_var13_corto='0', ind_var13_largo_0='0', ind_var13_largo='0', ind_var13_medio_0='0', ind_var13_medio='0', ind_var13='0', ind_var14_0='0', ind_var14='0', ind_var17_0='0', ind_var17='0', ind_var18_0='0', ind_var18='0', ind_var19='0', ind_var20_0='0', ind_var20='0', ind_var24_0='0', ind_var24='0', ind_var25

In [22]:
# Applying our fuction using map
vector_dense = rdd_2.map(var)

In [23]:
vector_dense.collect()

[('0',
  DenseVector([2.0, 23.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 3.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 99.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [24]:
type(vector_dense)

pyspark.rdd.PipelinedRDD

In [32]:
# Creating again a dataframe with Label(Target) and Features.
df = sps.createDataFrame(vector_dense,['label','features'])

In [33]:
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[2.0,23.0,0.0,0.0...|
|    0|[2.0,34.0,0.0,0.0...|
|    0|[2.0,23.0,0.0,0.0...|
|    0|[2.0,37.0,0.0,195...|
|    0|[2.0,39.0,0.0,0.0...|
|    0|[2.0,23.0,0.0,0.0...|
|    0|[2.0,27.0,0.0,0.0...|
|    0|[2.0,26.0,0.0,0.0...|
|    0|[2.0,45.0,0.0,0.0...|
|    0|[2.0,25.0,0.0,0.0...|
|    0|[2.0,42.0,0.0,0.0...|
|    0|[2.0,26.0,0.0,0.0...|
|    0|[2.0,51.0,0.0,0.0...|
|    0|[2.0,43.0,0.0,0.0...|
|    0|[2.0,33.0,600.0,1...|
|    0|[2.0,30.0,0.0,0.0...|
|    0|[2.0,44.0,0.0,0.0...|
|    0|[2.0,36.0,0.0,55....|
|    0|[229.0,55.0,0.0,0...|
|    0|[2.0,28.0,0.0,0.0...|
+-----+--------------------+
only showing top 20 rows



# Importing functions to work with machine learnig 

In [34]:
from pyspark.ml.classification import DecisionTreeClassifier,LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer

In [35]:
# Using StandardScaler to standardize the datas.

scaler = StandardScaler(inputCol = 'features',outputCol = 'scaled',withStd = True)

In [36]:
model = scaler.fit(df)

In [37]:
df_scaled = model.transform(df)

In [38]:
df_scaled.show()

+-----+--------------------+--------------------+
|label|            features|              scaled|
+-----+--------------------+--------------------+
|    0|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,34.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,37.0,0.0,195...|[5.12380885240551...|
|    0|[2.0,39.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,27.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,26.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,45.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,25.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,42.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,26.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,51.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,43.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,33.0,600.0,1...|[5.12380885240551...|
|    0|[2.0,30.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,44.0,0.0,0.0...|[5.12380885240551...|


In [39]:
# We group the column Label to count how much variables are 0 and 1 , we can note that the dataset are unbalanced 
# It can interfere in our machhine learning algorithm so we need find a way to equalize the quantite of variables with 0 and 1 . 
df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0|73012|
|    1| 3008|
+-----+-----+



In [45]:
df_label1 = df_scaled.filter('label=1')
df_label1.show()

+-----+--------------------+--------------------+
|label|            features|              scaled|
+-----+--------------------+--------------------+
|    1|[2.0,66.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,45.0,0.0,26....|[5.12380885240551...|
|    1|[2.0,42.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,31.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,62.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,24.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,57.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,32.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,28.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,30.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,44.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,36.0,0.0,13....|[5.12380885240551...|
|    1|[2.0,38.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,36.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,34.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,51.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,27.0,0.0,0.0...|[5.12380885240551...|


 # Treatment of data
 
### We can note that the Target variable is unbalanced , then let´s balance the dataset We will apply a filter and after a sample to select around 2x the variable with target 1 .

In [46]:
df_label2 = df_scaled.filter('label=0')
df_label2.show()

+-----+--------------------+--------------------+
|label|            features|              scaled|
+-----+--------------------+--------------------+
|    0|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,34.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,37.0,0.0,195...|[5.12380885240551...|
|    0|[2.0,39.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,27.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,26.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,45.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,25.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,42.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,26.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,51.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,43.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,33.0,600.0,1...|[5.12380885240551...|
|    0|[2.0,30.0,0.0,0.0...|[5.12380885240551...|
|    0|[2.0,44.0,0.0,0.0...|[5.12380885240551...|


In [47]:
df_train = df_label1.union(df_label2)

In [48]:
df_train.toPandas().shape

(76020, 3)

In [49]:
df_train.schema

StructType([StructField('label', StringType(), True), StructField('features', VectorUDT(), True), StructField('scaled', VectorUDT(), True)])

 Let´s spliting the datas in train and test 

In [50]:
(training_data,test_data) = df_train.randomSplit([0.5,0.5])

In [51]:
training_data.show()

+-----+--------------------+--------------------+
|label|            features|              scaled|
+-----+--------------------+--------------------+
|    1|[1.0,43.0,0.0,0.0...|[2.56190442620275...|
|    1|[1.0,58.0,0.0,0.0...|[2.56190442620275...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|


 Label variable is string type them we need change the data type of this variables let´s change for a integer type 

In [52]:
training_data.schema

StructType([StructField('label', StringType(), True), StructField('features', VectorUDT(), True), StructField('scaled', VectorUDT(), True)])

In [53]:
training_data = training_data.withColumn("label",col("label").cast(IntegerType()))

In [54]:
training_data = training_data.drop("features")

In [55]:
training_data = training_data.withColumnRenamed("scaled","features")

In [56]:
training_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[2.56190442620275...|
|    1|[2.56190442620275...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
+-----+--------------------+
only showing top 20 rows



String Indexer is a requirement of algorithm of machine learning 

In [57]:
stringIndex = StringIndexer(inputCol = "label",outputCol = "label_indexed")

In [58]:
si_model = stringIndex.fit(training_data)

In [59]:
obj_final = si_model.transform(training_data)
obj_final.collect()

[Row(label=1, features=DenseVector([0.0, 3.3188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.9867, 2.1167, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14.9209, 2.26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0857, 0.0, 0.0, 3.0694, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0994, 4.5698, 2.0951, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.2444, 1.8262, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0459, 0.0, 0.0, 0.0, 2.6335, 0.0, 0.0, 2.7142, 0.0, 3.1776, 2.0031, 0.0, 0.0, 0.0, 0.0457, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0092, 0.0, 0.0211, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [60]:
# Creating a model of Decision Tree Classifier

dt_1 = DecisionTreeClassifier(labelCol ="label_indexed",featuresCol ='features')

In [61]:
model_1 = dt_1.fit(obj_final)

In [62]:
prev_1 = model_1.transform(obj_final)

In [63]:
prev_1.show()

+-----+--------------------+-------------+--------------+--------------------+----------+
|label|            features|label_indexed| rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------+--------------------+----------+
|    1|[2.56190442620275...|          1.0|[9638.0,196.0]|[0.98006914785438...|       0.0|
|    1|[2.56190442620275...|          1.0|[4832.0,285.0]|[0.94430330271643...|       0.0|
|    1|[5.12380885240551...|          1.0|  [733.0,41.0]|[0.94702842377260...|       0.0|
|    1|[5.12380885240551...|          1.0|  [733.0,41.0]|[0.94702842377260...|       0.0|
|    1|[5.12380885240551...|          1.0| [4643.0,77.0]|[0.98368644067796...|       0.0|
|    1|[5.12380885240551...|          1.0|  [733.0,41.0]|[0.94702842377260...|       0.0|
|    1|[5.12380885240551...|          1.0|  [733.0,41.0]|[0.94702842377260...|       0.0|
|    1|[5.12380885240551...|          1.0|  [733.0,41.0]|[0.94702842377260...|       0.0|
|    1|[5.

In [65]:
test_data.show()

+-----+--------------------+--------------------+
|label|            features|              scaled|
+-----+--------------------+--------------------+
|    1|[0.0,30.0,0.0,0.0...|[0.0,2.3154426613...|
|    1|[1.0,39.0,0.0,0.0...|[2.56190442620275...|
|    1|[1.0,49.0,0.0,0.0...|[2.56190442620275...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|
|    1|[2.0,23.0,0.0,0.0...|[5.12380885240551...|


In [67]:
test = test_data.select("label","scaled")

In [68]:
test = test.withColumnRenamed("scaled","features")
test.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[0.0,2.3154426613...|
|    1|[2.56190442620275...|
|    1|[2.56190442620275...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
|    1|[5.12380885240551...|
+-----+--------------------+
only showing top 20 rows



In [69]:
test = test.withColumn("label",col("label").cast(IntegerType()))

In [70]:
prev_test = model_1.transform(test)

In [74]:
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "label",metricName = "accuracy")

In [75]:
# Evaluating the performance of Decision Tree
evaluator.evaluate(prev_test)

0.960266645587964

In [73]:
# Confusion Matrix Decission Tree
prev_test.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1501|
|    1|       1.0|    4|
|    0|       0.0|36441|
|    0|       1.0|    7|
+-----+----------+-----+



In [77]:
# Aplying Logistic regression 
lr = LogisticRegression(labelCol = "label_indexed", featuresCol = "features")

In [78]:
model_2 = lr.fit(obj_final)

In [79]:
prev_2 = model_2.transform(test)

In [80]:
prev_2.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    1|[0.0,2.3154426613...|[2.20642051342941...|[0.90082459526723...|       0.0|
|    1|[2.56190442620275...|[2.30017105640482...|[0.90889120481171...|       0.0|
|    1|[2.56190442620275...|[1.12488809574013...|[0.75489428193556...|       0.0|
|    1|[5.12380885240551...|[2.54212581141327...|[0.92704273536224...|       0.0|
|    1|[5.12380885240551...|[2.59566984837731...|[0.93058237799388...|       0.0|
|    1|[5.12380885240551...|[2.67446453913220...|[0.93550293348877...|       0.0|
|    1|[5.12380885240551...|[2.67836627288669...|[0.93573795351080...|       0.0|
|    1|[5.12380885240551...|[2.68340456340082...|[0.93604025389005...|       0.0|
|    1|[5.12380885240551...|[2.69780923363810...|[0.93689724799646...|       0.0|
|    1|[5.123808

In [81]:
# Evaluating Logistic Regression 
evaluator.evaluate(prev_2)

0.9592917555924433

In [82]:
# Confusion Matriz of Logistic Regression
prev_2.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1497|
|    1|       1.0|    8|
|    0|       0.0|36400|
|    0|       1.0|   48|
+-----+----------+-----+

