In [2]:
# Create (or reuse) the PySpark session that the rest of the notebook works with.
# findspark.init() locates the local Spark installation and puts pyspark on sys.path,
# so it has to run before `import pyspark` below -- keep this ordering.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-active session if one exists, otherwise it builds
# a new one with the application name "Classification".
spark = SparkSession.builder.appName("Classification").getOrCreate()
# Bare last expression: the notebook displays the session object as the cell output.
spark

In [3]:
# Importing some important functions
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler

In [37]:
# Load the toddler autism screening CSV into a Spark DataFrame.
path = "../Datasets/"
csv_file = path + "Toddler Autism dataset July 2018.csv"

# header=True takes the column names from the first row;
# inferSchema=True lets Spark work out the column types from the data.
df = spark.read.csv(csv_file, header=True, inferSchema=True)

# Preview the first six rows as a pandas frame for nicer notebook rendering.
df.limit(6).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes
5,6,1,1,0,0,1,1,1,1,1,1,21,8,m,black,no,no,family member,Yes


In [6]:
# Show the inferred schema.
# printSchema() writes the schema tree to stdout itself and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the cell output.
df.printSchema()

root
 |-- Case_No: integer (nullable = true)
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- Jaundice: string (nullable = true)
 |-- Family_mem_with_ASD: string (nullable = true)
 |-- Who completed the test: string (nullable = true)
 |-- Class/ASD Traits : string (nullable = true)

None


### Checking whether the dataset is balanced between the two classes of the dependent variable

In [38]:
# Count the rows in each class of the target column (non-null values only).
class_column = df["Class/ASD Traits "]
class_counts = df.groupBy("Class/ASD Traits ").agg(count(class_column).alias("Class_Count"))
class_counts.show()

+-----------------+-----------+
|Class/ASD Traits |Class_Count|
+-----------------+-----------+
|               No|        326|
|              Yes|        728|
+-----------------+-----------+



In [None]:
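# The classes are not evenly split (326 "No" vs 728 "Yes", roughly 31% / 69%), but the
# imbalance is mild, so the data is treated as balanced enough here.
# We would consider the dataset unbalanced only if one of the classes had about 10 cases
# or made up less than 1% of the whole dataset.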

### Format Data

MLlib requires all input columns to be assembled into a single feature vector. We also need to rename the dependent variable to `label`, since that is the column name MLlib estimators expect by default.

In [39]:
# Candidate feature columns: everything except the first column (Case_No, a row
# identifier) and the last column (the dependent variable).
# NOTE(review): in the rows shown above, Qchat-10-Score equals the sum of A1..A10 and the
# class looks derivable from it -- confirm against the dataset description; if so,
# keeping it as a feature leaks the label into the model.
input_columns = df.columns[1:-1]

# Name of the dependent variable (the CSV header carries a trailing space).
dependent_var = 'Class/ASD Traits '

In [40]:
# MLlib estimators expect a numeric target column called "label" whose class indices
# start at zero, so the Yes/No target has to be re-encoded.

# Step 1: copy the target into a helper column that is explicitly typed as string.
renamed = df.withColumn('label_str', df[dependent_var].cast("string"))

# Step 2: map each distinct string to a numeric index stored in "label".
# StringIndexer orders by descending frequency by default, so the most common class
# ("Yes") becomes 0.0 and "No" becomes 1.0.
indexer = StringIndexer(inputCol="label_str", outputCol="label")

# fit() learns the string -> index mapping; transform() appends the "label" column.
label_indexer_model = indexer.fit(renamed)
indexed = label_indexer_model.transform(renamed)

In [41]:
# Preview the first four rows to confirm the new label_str / label columns were added.
indexed.limit(4).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str,label
0,1,0,0,0,0,0,0,1,1,0,...,28,3,f,middle eastern,yes,no,family member,No,No,1.0
1,2,1,1,0,0,0,1,1,0,0,...,36,4,m,White European,yes,no,family member,Yes,Yes,0.0
2,3,1,0,0,0,0,0,1,1,0,...,36,4,m,middle eastern,yes,no,family member,Yes,Yes,0.0
3,4,1,1,1,1,1,1,1,1,1,...,24,10,m,Hispanic,no,no,family member,Yes,Yes,0.0


#### Converting all input data into numeric

In [42]:
# Convert every input column to a numeric representation.
#
# numeric_inputs collects the columns that are already numeric.
# string_inputs collects the names of the new "<column>_num" columns produced by
# StringIndexer for the string-typed inputs.
numeric_inputs = []
string_inputs = []

for column in input_columns:
    # Test the type with isinstance rather than comparing str(dataType) to "StringType":
    # the string form of a DataType differs between PySpark versions ("StringType" vs
    # "StringType()"), and a mismatch would silently route every string column into
    # numeric_inputs.
    if isinstance(indexed.schema[column].dataType, StringType):
        new_col_name = column + "_num"

        # Re-running this cell would otherwise fail because the output column already
        # exists; in that case the earlier result is reused as-is.
        if new_col_name not in indexed.columns:
            # fit() learns the category -> index mapping, transform() appends the column.
            indexer = StringIndexer(inputCol=column, outputCol=new_col_name)
            indexed = indexer.fit(indexed).transform(indexed)

        string_inputs.append(new_col_name)
    else:
        # Already numeric: use the column unchanged.
        numeric_inputs.append(column)

In [43]:
# Show which columns were index-encoded (string_inputs) and which were already numeric
# (numeric_inputs); together they form the model's feature set.
print(string_inputs)
print(numeric_inputs)

['Sex_num', 'Ethnicity_num', 'Jaundice_num', 'Family_mem_with_ASD_num', 'Who completed the test_num']
['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons', 'Qchat-10-Score']


In [44]:
# Show the schema including the newly added label and *_num columns.
# printSchema() prints the tree itself and returns None, so it is called directly
# instead of being wrapped in print() (which would add a stray "None" line).
indexed.printSchema()

root
 |-- Case_No: integer (nullable = true)
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- Jaundice: string (nullable = true)
 |-- Family_mem_with_ASD: string (nullable = true)
 |-- Who completed the test: string (nullable = true)
 |-- Class/ASD Traits : string (nullable = true)
 |-- label_str: string (nullable = true)
 |-- label: double (nullable = false)
 |-- Sex_num: double (nullable = false)
 |-- Ethnicity_num: double (nullable = false)
 |-- Jaundice_num: double (nullable = false)
 |-- Family_mem_with_ASD_num: double

### Treating for skewness and Outliers

In [62]:
# Treat skewness and outliers in the numeric input columns.
#   * Values are floored at the 1st percentile and capped at the 99th percentile.
#   * Right-skewed columns (skewness > 1) are additionally transformed with log(x + 1).
#   * Left-skewed columns (skewness < -1) are additionally transformed with exp(x).
# NOTE: this cell overwrites columns of `indexed` in place, so re-running it on an already
# treated frame would apply the transformation a second time.

# d maps each numeric column name to its [1st percentile, 99th percentile] bounds.
d = {}

# The loop variable is deliberately not called `col`: that name would overwrite the
# pyspark.sql.functions.col function brought in by the star import.
for column_name in numeric_inputs:
    # relativeError=0.0 computes exact quantiles. A loose tolerance such as 0.25 allows the
    # returned rank to be off by a quarter of the rows, which makes 1% / 99% bounds
    # meaningless; the dataset is small enough for the exact computation.
    d[column_name] = indexed.approxQuantile(column_name, [0.01, 0.99], 0.0)

for column_name in numeric_inputs:
    # agg(...).collect() returns a single Row; its first field is the skewness value.
    skew = indexed.agg(skewness(indexed[column_name])).collect()[0][0]

    # Skewness can come back as null (e.g. a column without usable values); comparing
    # None with a number would raise a TypeError, so such columns are left untouched.
    if skew is None:
        continue

    if skew > 1:  # right skew: floor, cap and log(x + 1)
        indexed = indexed.withColumn(column_name, \
        log(when(indexed[column_name] < d[column_name][0], d[column_name][0]) \
           .when(indexed[column_name] > d[column_name][1], d[column_name][1]) \
           .otherwise(indexed[column_name]) + 1).alias(column_name))
        print(column_name+" has been treated for positive (right) skewness. (skew=)", skew, ")")
    elif skew < -1:  # left skew: floor, cap and exp(x)
        indexed = indexed.withColumn(column_name, \
        exp(when(indexed[column_name] < d[column_name][0], d[column_name][0]) \
           .when(indexed[column_name] > d[column_name][1], d[column_name][1]) \
           .otherwise(indexed[column_name])).alias(column_name))
        print(column_name+" has been treated for negative (left )skewness, (skew=)", skew,")")

There is no issue of skewness in the dataset

### Check for the negative values in the dataset.

We only need to check the original numeric columns, since the indexed columns (the new numeric columns created by `StringIndexer`) cannot contain negative values.


In [70]:
# Check the numeric input columns for negative values (Naive Bayes cannot handle them).
# The check runs on `indexed` -- the frame that is actually vectorised and modelled, and
# which may have been modified by the skewness treatment -- rather than on the raw `df`.
# Note: `min` here is pyspark.sql.functions.min (star import), not the Python builtin.

# One-row frame holding the minimum of every numeric input column
minimums = indexed.select([min(c).alias(c) for c in numeric_inputs])
# Pack the per-column minimums into a single array column called "mins"
min_array = minimums.select(array(numeric_inputs).alias("mins"))
# Reduce the array to the global minimum and bring it back as a Python value
df_minimum = min_array.select(array_min(min_array.mins)).collect()
df_minimum = df_minimum[0][0]

# The minimum is None when there are no non-null values to compare; treat that as
# "no negatives found" instead of failing on a None < 0 comparison.
if df_minimum is not None and df_minimum < 0:
    print("WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contain negative values")
else:
    print("No negative values were found in your dataframe")

No negative values were found in your dataframe


In [72]:
# MLlib works on a single vector column, and the scaler used in the next step needs one
# too, so all inputs are packed into a "features" vector here.

# Final feature order: the original numeric columns first, then the index-encoded ones.
features_list = [*numeric_inputs, *string_inputs]

# VectorAssembler concatenates the listed columns into one vector column.
assembler = VectorAssembler(inputCols=features_list, outputCol='features')

# Apply it and keep only what the models need: the feature vector and the label.
assembled = assembler.transform(indexed)
output = assembled.select('features', 'label')

In [76]:
# Display the first 20 assembled rows without truncating the vector column.
output.show(n=20, truncate=False)

+-------------------------------------------------------------------------+-----+
|features                                                                 |label|
+-------------------------------------------------------------------------+-----+
|(17,[6,7,9,10,11,12,13,14],[1.0,1.0,1.0,28.0,3.0,1.0,2.0,1.0])           |1.0  |
|(17,[0,1,5,6,10,11,14],[1.0,1.0,1.0,1.0,36.0,4.0,1.0])                   |0.0  |
|(17,[0,6,7,9,10,11,13,14],[1.0,1.0,1.0,1.0,36.0,4.0,2.0,1.0])            |0.0  |
|[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,24.0,10.0,0.0,5.0,0.0,0.0,0.0]  |0.0  |
|[1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,9.0,1.0,0.0,0.0,1.0,0.0]   |0.0  |
|[1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0,8.0,0.0,4.0,0.0,0.0,0.0]   |0.0  |
|(17,[0,3,4,5,8,10,11,13,14],[1.0,1.0,1.0,1.0,1.0,33.0,5.0,1.0,1.0])      |0.0  |
|(17,[1,4,6,7,8,9,10,11,13,14],[1.0,1.0,1.0,1.0,1.0,1.0,33.0,6.0,1.0,1.0])|0.0  |
|(17,[6,9,10,11,13],[1.0,1.0,36.0,2.0,1.0])                               |1.0  |
|[1.0,1.0,1.0,0.

In [80]:
# Rescale every feature to the range [0, 1000] with a min-max scaler.
# Besides putting the features on a common scale, this removes any negative values.
scalar = MinMaxScaler(inputCol="features", outputCol="scaledFeatures", min=0, max=1000)
range_low, range_high = scalar.getMin(), scalar.getMax()
print("Features scaled to range: [%f, %f]" % (range_low, range_high))

# fit() computes the per-feature min/max statistics and returns a MinMaxScalerModel
scalerModel = scalar.fit(output)
# transform() appends the rescaled vector as "scaledFeatures"
scaled_data = scalerModel.transform(output)

# Keep the scaled vector and the label, then give the vector back the default column
# name "features" that MLlib estimators look for.
final_data = (
    scaled_data
    .select('scaledFeatures', 'label')
    .withColumnRenamed("scaledFeatures", "features")
)
final_data.show()

Features scaled to range: [0.000000, 1000.000000]
+--------------------+-----+--------------------+
|            features|label|      scaledFeatures|
+--------------------+-----+--------------------+
|(17,[6,7,9,10,11,...|  1.0|[0.0,0.0,0.0,0.0,...|
|(17,[0,1,5,6,10,1...|  0.0|[1000.0,1000.0,0....|
|(17,[0,6,7,9,10,1...|  0.0|[1000.0,0.0,0.0,0...|
|[1.0,1.0,1.0,1.0,...|  0.0|[1000.0,1000.0,10...|
|[1.0,1.0,0.0,1.0,...|  0.0|[1000.0,1000.0,0....|
+--------------------+-----+--------------------+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,0....|  0.0|
|[1000.0,0.0,0.0,0...|  0.0|
|[1000.0,1000.0,10...|  0.0|
|[1000.0,1000.0,0....|  0.0|
|[1000.0,1000.0,0....|  0.0|
|[1000.0,0.0,0.0,1...|  0.0|
|[0.0,1000.0,0.0,0...|  0.0|
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,10...|  0.0|
|[1000.0,0.0,0.0,1...|  0.0|
|[1000.0,1000.0,10...|  0.0|
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,10

In [81]:
# Display the first 20 rows of the model-ready frame (scaled feature vector + label).
final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,0....|  0.0|
|[1000.0,0.0,0.0,0...|  0.0|
|[1000.0,1000.0,10...|  0.0|
|[1000.0,1000.0,0....|  0.0|
|[1000.0,1000.0,0....|  0.0|
|[1000.0,0.0,0.0,1...|  0.0|
|[0.0,1000.0,0.0,0...|  0.0|
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,10...|  0.0|
|[1000.0,0.0,0.0,1...|  0.0|
|[1000.0,1000.0,10...|  0.0|
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,10...|  0.0|
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,10...|  0.0|
|[0.0,0.0,0.0,0.0,...|  1.0|
|[1000.0,1000.0,10...|  0.0|
|[1000.0,0.0,0.0,0...|  1.0|
|[1000.0,1000.0,10...|  0.0|
+--------------------+-----+
only showing top 20 rows



#### Split data into Training and Test Datasets

In [82]:
# Split the data randomly into roughly 70% training and 30% test rows.
# A fixed seed makes the split -- and therefore every metric computed below --
# reproducible across kernel restarts; without it each run draws a different split.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [84]:
# Report how many rows ended up in each side of the split.
train_rows = train.count()
test_rows = test.count()
print("total number of records in training dataset: ", train_rows)
print("total number of records in testing dataset: ", test_rows)

total number of records in training dataset:  741
total number of records in testing dataset:  313


In [85]:
# Importing the dependencies
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql.functions import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [86]:
# Evaluation objects shared by the models below.

# Area under the ROC curve for the binary problem. The evaluator must read the model's
# continuous scores ("rawPrediction"), not the hard 0/1 "prediction" column: with hard
# predictions the ROC curve collapses to a single threshold and the AUC is not a real
# ranking metric.
Bin_evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')

# Plain accuracy. This evaluator is aimed at multiclass problems but works for the
# binary case as well; it reads the default "prediction" and "label" columns.
MC_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

In [91]:
# Logistic regression baseline with default hyperparameters.
classifier = LogisticRegression()

# Train on the training split
fitModel = classifier.fit(train)

# Score the held-out test split; adds rawPrediction / probability / prediction columns
predictionAndLabels = fitModel.transform(test)
predictionAndLabels.show(n=50)

# Binary-classification metric
auc = Bin_evaluator.evaluate(predictionAndLabels)
print("AUC: ",auc)

# Accuracy, expressed as a percentage
accuracy = 100 * MC_evaluator.evaluate(predictionAndLabels)
print("Accuracy: {0:.2f}".format(accuracy),"%")
print(" ")

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,0.0,0.0,...|  1.0|[-123.64407953386...|[2.00474222782246...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-123.77377908621...|[1.76088396439985...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-123.33564583937...|[2.72903804410540...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-123.33564583937...|[2.72903804410540...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-123.32819268224...|[2.74945398054480...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-123.43548930532...|[2.46972230293454...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-122.26320731887...|[7.97560655480689...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-122.83847243109...|[4.48672299726888...|       1.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[-121.71777744895...|[1.37607119638720...|       1.0|
|[0.0,0.0,0.0,0.

In [93]:
# Report the fitted logistic regression parameters: the intercept vector and the
# coefficient matrix (one row of weights, one weight per feature).
intercept_text = str(fitModel.interceptVector)
coefficient_text = str(fitModel.coefficientMatrix)
print("Intercept: " + intercept_text)
print("Coefficients: \n" + coefficient_text)

Intercept: [123.77857371205951]
Coefficients: 
DenseMatrix([[-0.02735478, -0.02672313, -0.02633224, -0.02777293, -0.02653522,
              -0.02699613, -0.02681948, -0.02591789, -0.0269732 , -0.02650024,
              -0.00135316, -0.08128871,  0.00026419, -0.00134494, -0.00092982,
              -0.00029817, -0.0017696 ]])


In [107]:
# Pair every coefficient with the name of its predictor and show the result as a
# DataFrame, which is easier to read than the raw coefficient matrix.

# The coefficient matrix has a single row for a binary model; turn it into an array.
coeff_array = fitModel.coefficientMatrix.toArray()

# Plain Python floats, in the same order as features_list
coeff_score = [float(weight) for weight in coeff_array[0]]

# One (feature, coeff) row per predictor
feature_coeff_pairs = list(zip(features_list, coeff_score))
result = spark.createDataFrame(feature_coeff_pairs, schema=['feature', 'coeff'])
result.show()

+--------------------+--------------------+
|             feature|               coeff|
+--------------------+--------------------+
|                  A1|-0.02735477980626631|
|                  A2|-0.02672312961569...|
|                  A3|-0.02633224231177893|
|                  A4|-0.02777292594537...|
|                  A5|-0.02653521991148...|
|                  A6|-0.02699613317549738|
|                  A7|-0.02681947765624...|
|                  A8|-0.02591789414214...|
|                  A9|-0.02697320473547904|
|                 A10|-0.02650023763011...|
|            Age_Mons|-0.00135315912357...|
|      Qchat-10-Score|-0.08128871161811413|
|             Sex_num|2.641937305502643...|
|       Ethnicity_num|-0.00134494178197...|
|        Jaundice_num|-9.29819173799127...|
|Family_mem_with_A...|-2.98172431423883...|
|Who completed the...|-0.00176959932155...|
+--------------------+--------------------+



In [108]:
# Show the coefficient table again without truncating long feature names or values.
result.show(n=20, truncate=False)

+--------------------------+----------------------+
|feature                   |coeff                 |
+--------------------------+----------------------+
|A1                        |-0.02735477980626631  |
|A2                        |-0.026723129615697533 |
|A3                        |-0.02633224231177893  |
|A4                        |-0.027772925945378862 |
|A5                        |-0.026535219911480435 |
|A6                        |-0.02699613317549738  |
|A7                        |-0.026819477656247075 |
|A8                        |-0.025917894142145046 |
|A9                        |-0.02697320473547904  |
|A10                       |-0.026500237630119588 |
|Age_Mons                  |-0.0013531591235770545|
|Qchat-10-Score            |-0.08128871161811413  |
|Sex_num                   |2.6419373055026435E-4 |
|Ethnicity_num             |-0.0013449417819735722|
|Jaundice_num              |-9.298191737991273E-4 |
|Family_mem_with_ASD_num   |-2.981724314238838E-4 |
|Who complet

### Checking the Accuracy of the model with Cross Validation

In [111]:
# Cross-validated logistic regression with a small hyperparameter search.

# Estimator to tune
classifier = LogisticRegression()

# Hyperparameter grid: try three values for the maximum number of iterations
paramGrid = ParamGridBuilder().addGrid(classifier.maxIter, [10, 15, 20]).build()

# 4-fold cross validation, selecting the parameter set with the best accuracy.
# The seed fixes the fold assignment so the selected model is reproducible between runs.
crossval = CrossValidator(estimator=classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=MC_evaluator,
                          numFolds=4,
                          seed=42)

# Fit: trains one model per (fold, parameter set) and refits the best parameter set
fitModel1 = crossval.fit(train)

BestModel = fitModel1.bestModel
print("Intercept: " + str(BestModel.interceptVector))
print("Coefficients: \n" + str(BestModel.coefficientMatrix))

print(BestModel)

# The fitted CrossValidatorModel delegates transform() to its best model, so it can be
# applied to the test data directly.
predictions = fitModel1.transform(test)

# Accuracy on the held-out test split, as a percentage
accuracy = (MC_evaluator.evaluate(predictions))*100
print(accuracy)

Intercept: [34.330061127051316]
Coefficients: 
DenseMatrix([[-7.98130008e-03, -7.82444281e-03, -6.89856891e-03,
              -6.80761477e-03, -6.91467163e-03, -7.53457683e-03,
              -7.58818825e-03, -7.47400471e-03, -7.22033755e-03,
              -7.46764240e-03,  4.76900044e-04, -2.23545278e-02,
              -1.30667153e-04, -1.11644272e-03,  1.09032197e-05,
              -1.76942754e-04, -2.44335744e-03]])
LogisticRegressionModel: uid = LogisticRegression_9d4be74c11ef, numClasses = 2, numFeatures = 17
100.0
