In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Reuse the active SparkSession if there is one; otherwise start a new one.
ss = SparkSession.builder.getOrCreate()
# SparkContext of that session; used below to read the raw text file as an RDD.
sc = ss.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/21 17:34:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
22/02/21 17:34:48 WARN Utils: Service 'SparkUI' could

In [2]:
def toDoubleSafe(v):
    """Convert ``v`` to ``float`` when possible, otherwise return ``str(v)``.

    Args:
        v: A raw field value, e.g. one token of a split text line.

    Returns:
        ``float(v)`` if the conversion succeeds, else ``str(v)``.
    """
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Only a failed conversion means "not numeric". A bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
        return str(v)

## Create an RDD

In [3]:
# Load the raw census file (at least 4 partitions), split every line on ", "
# and turn each field into a float when it parses as one (a string otherwise).
census_raw = (
    sc.textFile("../Data/adult.raw", 4)
    .map(lambda line: line.split(", "))
    .map(lambda fields: [toDoubleSafe(field) for field in fields])
)

## Convert the RDD to DataFrame.


In [4]:
from pyspark.sql.types import *
# Schema of one census record: (column name, Spark type) pairs in field order.
# Every column is declared nullable.
adultschema = StructType([
    StructField(col_name, col_type(), True)
    for col_name, col_type in [
        ("age", DoubleType),
        ("workclass", StringType),
        ("fnlwgt", DoubleType),
        ("education", StringType),
        ("marital_status", StringType),
        ("occupation", StringType),
        ("relationship", StringType),
        ("race", StringType),
        ("sex", StringType),
        ("capital_gain", DoubleType),
        ("capital_loss", DoubleType),
        ("hours_per_week", DoubleType),
        ("native_country", StringType),
        ("income", StringType),
    ]
])


In [5]:
# Attach the explicit schema to the list-valued RDD rows to obtain a DataFrame.
dfraw = ss.createDataFrame(data=census_raw, schema=adultschema)

In [6]:
# Preview the first 10 rows to sanity-check the parsed columns.
dfraw.show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+----+----------------+--------+---------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| age|       workclass|  fnlwgt|education|      marital_status|       occupation| relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+----------------+--------+---------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|39.0|       State-gov| 77516.0|Bachelors|       Never-married|     Adm-clerical|Not-in-family|White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0|Self-emp-not-inc| 83311.0|Bachelors|  Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|         Private|215646.0|  HS-grad|            Divorced|Handlers-cleaners|Not-in-family|White|  Male|         0.0|         0.0|      

                                                                                

In [7]:
# Frequency table (most frequent value first) for each column inspected for
# missing-value placeholders.
for column_name in ("workclass", "occupation", "native_country"):
    dfraw.groupBy(dfraw[column_name]).count().orderBy("count", ascending=False).show()

+----------------+-----+
|       workclass|count|
+----------------+-----+
|         Private|33906|
|Self-emp-not-inc| 3862|
|       Local-gov| 3136|
|               ?| 2799|
|       State-gov| 1981|
|    Self-emp-inc| 1695|
|     Federal-gov| 1432|
|     Without-pay|   21|
|    Never-worked|   10|
+----------------+-----+

+-----------------+-----+
|       occupation|count|
+-----------------+-----+
|   Prof-specialty| 6172|
|     Craft-repair| 6112|
|  Exec-managerial| 6086|
|     Adm-clerical| 5611|
|            Sales| 5504|
|    Other-service| 4923|
|Machine-op-inspct| 3022|
|                ?| 2809|
| Transport-moving| 2355|
|Handlers-cleaners| 2072|
|  Farming-fishing| 1490|
|     Tech-support| 1446|
|  Protective-serv|  983|
|  Priv-house-serv|  242|
|     Armed-Forces|   15|
+-----------------+-----+

+------------------+-----+
|    native_country|count|
+------------------+-----+
|     United-States|43832|
|            Mexico|  951|
|                 ?|  857|
|       Philippin

## Clean the data. 

### Missing data imputation.


In [8]:
# Missing-data imputation: replace the "?" placeholder in each column with that
# column's most common value (see the frequency tables above).
dfrawrp = dfraw.replace("?", "Private", subset=["workclass"])
dfrawrpl = dfrawrp.replace("?", "Prof-specialty", subset=["occupation"])
dfrawnona = dfrawrpl.replace("?", "United-States", subset=["native_country"])

In [9]:
# Preview the imputed DataFrame.
dfrawnona.show()

+----+----------------+--------+------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
| age|       workclass|  fnlwgt|   education|      marital_status|       occupation| relationship|              race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+----------------+--------+------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|39.0|       State-gov| 77516.0|   Bachelors|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0|Self-emp-not-inc| 83311.0|   Bachelors|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|         Private|215646.0|     HS-grad|            Div

### Convert strings to categorical values

In [10]:
# converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each column in ``cols`` with its ``StringIndexer`` encoding.

    For every column a ``StringIndexer`` is fitted on the current DataFrame, the
    indices are written to a temporary ``<name>-num`` column, the original
    column is dropped and the temporary column is renamed back to ``<name>``.

    Args:
        df: Input DataFrame; it is returned unchanged when ``cols`` is empty.
        cols: Names of the columns to index.

    Returns:
        A DataFrame in which every listed column holds the indexed values.
    """
    result = df
    for name in cols:
        indexed_name = name + "-num"
        model = StringIndexer(inputCol=name, outputCol=indexed_name).fit(result)
        # Swap the original column for its indexed counterpart, keeping the name.
        result = (
            model.transform(result)
            .drop(name)
            .withColumnRenamed(indexed_name, name)
        )
    return result

# Index every string-typed column, including the "income" target.
dfnumeric = indexStringColumns(
    df=dfrawnona,
    cols=[
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native_country",
        "income",
    ],
)

In [11]:
# Preview the indexed DataFrame; the indexed columns now appear after the numeric ones.
dfnumeric.show()

+----+--------+------------+------------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|workclass|education|marital_status|occupation|relationship|race|sex|native_country|income|
+----+--------+------------+------------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|      3.0|      2.0|           1.0|       3.0|         1.0| 0.0|0.0|           0.0|   0.0|
|50.0| 83311.0|         0.0|         0.0|          13.0|      1.0|      2.0|           0.0|       2.0|         0.0| 0.0|0.0|           0.0|   0.0|
|38.0|215646.0|         0.0|         0.0|          40.0|      0.0|      0.0|           2.0|       8.0|         1.0| 0.0|0.0|           0.0|   0.0|
|53.0|234721.0|         0.0|         0.0|          40.0|      0.0|      5.0|           0.0|       8.0|         0.0| 1.

In [12]:
from pyspark.ml.feature import OneHotEncoder

def oneHotEncodeColumns(df, cols):
    """Replace each column in ``cols`` with its one-hot encoded vector.

    For every column a ``OneHotEncoder`` is fitted on the current DataFrame, the
    encoded vector is written to a temporary ``<name>-onehot`` column, the
    original column is dropped and the temporary column is renamed to ``<name>``.

    Args:
        df: Input DataFrame; it is returned unchanged when ``cols`` is empty.
        cols: Names of the columns to encode.

    Returns:
        A DataFrame in which every listed column holds the encoded vector.
    """
    encoded = df
    for name in cols:
        onehot_name = name + "-onehot"
        # dropLast=False: do not drop the last category from the encoded vector
        # (the encoder's default is to drop it).
        encoder = OneHotEncoder(inputCol=name, outputCol=onehot_name, dropLast=False)
        encoded = encoder.fit(encoded).transform(encoded).drop(name)
        encoded = encoded.withColumnRenamed(onehot_name, name)
    return encoded

# One-hot encode the multi-valued categorical columns; "sex" and "income" are
# left as plain indices.
dfhot = oneHotEncodeColumns(
    df=dfnumeric,
    cols=[
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "native_country",
    ],
)

In [13]:
from pyspark.ml.feature import OneHotEncoder

def oneHotEncodeColumns(df, cols):
    """One-hot encode every column in ``cols`` and keep the original names.

    NOTE(review): a function with this name and the same logic is already
    defined in an earlier cell; this definition silently shadows it. Consider
    keeping only one of the two cells.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to encode.

    Returns:
        A DataFrame in which each listed column is replaced by its encoded vector.
    """
    def _encode_one(frame, column):
        # Encode into a temporary "<column>-onehot" column, then swap it in for
        # the original column under the original name.
        tmp_name = column + "-onehot"
        fitted = OneHotEncoder(inputCol=column, outputCol=tmp_name, dropLast=False).fit(frame)
        return fitted.transform(frame).drop(column).withColumnRenamed(tmp_name, column)

    newdf = df
    for c in cols:
        newdf = _encode_one(newdf, c)
    return newdf

# NOTE(review): this repeats the call made in the previous cell and rebuilds
# dfhot from the same input.
dfhot = oneHotEncodeColumns(
    df=dfnumeric,
    cols=[
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "native_country",
    ],
)

In [14]:
# Preview the encoded DataFrame; encoded columns print as sparse vectors
# in the form (size, [indices], [values]).
dfhot.show()

+----+--------+------------+------------+--------------+---+------+-------------+---------------+--------------+--------------+-------------+-------------+---------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|sex|income|    workclass|      education|marital_status|    occupation| relationship|         race| native_country|
+----+--------+------------+------------+--------------+---+------+-------------+---------------+--------------+--------------+-------------+-------------+---------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|0.0|   0.0|(8,[3],[1.0])| (16,[2],[1.0])| (7,[1],[1.0])|(14,[3],[1.0])|(6,[1],[1.0])|(5,[0],[1.0])| (41,[0],[1.0])|
|50.0| 83311.0|         0.0|         0.0|          13.0|0.0|   0.0|(8,[1],[1.0])| (16,[2],[1.0])| (7,[0],[1.0])|(14,[2],[1.0])|(6,[0],[1.0])|(5,[0],[1.0])| (41,[0],[1.0])|
|38.0|215646.0|         0.0|         0.0|          40.0|0.0|   0.0|(8,[0],[1.0])| (16,[0],[1.0])| (7,[2],[1.0])|(14,[8],[1.0])|(6,[1],[1.0])

### Create a feature vector

In [15]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
# Columns that feed the model: the numeric columns, the indexed "sex" column and
# the one-hot encoded categorical columns.
input_cols = [
    "age", "capital_gain", "capital_loss", "fnlwgt", "hours_per_week", "sex",
    "workclass", "education", "marital_status", "occupation", "relationship",
    "native_country", "race",
]

# VectorAssembler takes a list of column names (inputCols) and an output column
# name (outputCol) and assembles the input values into one vector per row.
va = VectorAssembler(inputCols=input_cols, outputCol="features")

# lpoints: labeled data - the feature vector plus "income" renamed to "label".
lpoints = (
    va.transform(dfhot)
    .select("features", "income")
    .withColumnRenamed("income", "label")
)

In [16]:
# Inspect the first 5 labeled rows as Row objects (shows the full sparse vectors).
lpoints.rdd.take(5)

[Row(features=SparseVector(103, {0: 39.0, 1: 2174.0, 3: 77516.0, 4: 40.0, 9: 1.0, 16: 1.0, 31: 1.0, 40: 1.0, 52: 1.0, 57: 1.0, 98: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 50.0, 3: 83311.0, 4: 13.0, 7: 1.0, 16: 1.0, 30: 1.0, 39: 1.0, 51: 1.0, 57: 1.0, 98: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 38.0, 3: 215646.0, 4: 40.0, 6: 1.0, 14: 1.0, 32: 1.0, 45: 1.0, 52: 1.0, 57: 1.0, 98: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 53.0, 3: 234721.0, 4: 40.0, 6: 1.0, 19: 1.0, 30: 1.0, 45: 1.0, 51: 1.0, 57: 1.0, 99: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 28.0, 3: 338409.0, 4: 40.0, 5: 1.0, 6: 1.0, 16: 1.0, 30: 1.0, 37: 1.0, 55: 1.0, 65: 1.0, 99: 1.0}), label=0.0)]

In [17]:
# Tabular preview of the features/label pairs.
lpoints.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(103,[0,1,3,4,9,1...|  0.0|
|(103,[0,3,4,7,16,...|  0.0|
|(103,[0,3,4,6,14,...|  0.0|
|(103,[0,3,4,6,19,...|  0.0|
|(103,[0,3,4,5,6,1...|  0.0|
|(103,[0,3,4,5,6,1...|  0.0|
|(103,[0,3,4,5,6,2...|  0.0|
|(103,[0,3,4,7,14,...|  1.0|
|(103,[0,1,3,4,5,6...|  1.0|
|(103,[0,1,3,4,6,1...|  1.0|
|(103,[0,3,4,6,15,...|  1.0|
|(103,[0,3,4,9,16,...|  1.0|
|(103,[0,3,4,5,6,1...|  0.0|
|(103,[0,3,4,6,20,...|  0.0|
|(103,[0,3,4,6,18,...|  1.0|
|(103,[0,3,4,6,22,...|  0.0|
|(103,[0,3,4,7,14,...|  0.0|
|(103,[0,3,4,6,14,...|  0.0|
|(103,[0,3,4,6,19,...|  0.0|
|(103,[0,3,4,5,7,1...|  1.0|
+--------------------+-----+
only showing top 20 rows



22/02/21 18:08:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


## Divide the dataset into training and validation sets.

In [18]:
# Divide the dataset into training (80%) and validation (20%) sets.
# A fixed seed keeps the split - and therefore every metric computed from it -
# repeatable across runs on the same input; without it each run trains and
# evaluates on a different random partition of the rows.
splits = lpoints.randomSplit([0.8, 0.2], seed=42)

# cache(): the training algorithm is iterative, so both sets are read many times.
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

In [19]:
# Persist both splits as tables. The default save mode raises an error when the
# table already exists, so re-running this cell would fail; "overwrite" replaces
# the existing table and makes the cell safe to re-run.
adulttrain.write.mode("overwrite").saveAsTable("adulttrain")
adultvalid.write.mode("overwrite").saveAsTable("adultvalid")

                                                                                

## Train the model.

In [20]:
#Train the model.
from pyspark.ml.classification import LogisticRegression
# Configure the estimator via setParams(); this is equivalent to passing the
# same keyword arguments to the LogisticRegression constructor.
lr = LogisticRegression().setParams(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)

## Interpret the model parameters.

In [21]:
# Show the fitted coefficients (one per feature-vector slot) and the intercept,
# each on its own line.
print(lrmodel.coefficients, lrmodel.intercept, sep="\n")

[0.01923574239021005,0.0001412567713545233,0.0005345433982858283,7.468996514495918e-07,0.025937936240791812,-0.4942121560351809,0.015230941503173693,-0.39235185799399974,0.050246149266874984,-0.09913678466767609,0.3006796826509167,0.5757486475400272,-0.38914282912494746,-1.3504782019945032,-0.33962026692403735,0.005919455581736684,0.723813221553179,1.1204994073462593,0.12355486451726463,-0.8995662810287836,0.23275573355121296,-1.0325258742212615,-1.549961163631644,1.5718562188994802,-1.3002055368750027,-0.8746893673892991,1.545352014351228,-1.1506287865386222,-1.272168102768705,-1.7005915138297092,0.8422591261883235,-0.6830361781906382,-0.341580760658498,-0.31947648285758207,-0.21921820395342098,-0.19952351634838605,0.9005756168583787,0.23339170738464454,0.02775257030717321,0.6782545005058548,-0.10202757701724546,0.16978654840038618,-0.7228713518993242,-0.2898393223564408,-0.11428995654021633,-0.5219879799218845,-0.8499175722858514,0.4311572738851774,0.275252091515546,-0.90935561958767

In [22]:
# Score the held-out validation set with the fitted model.
# transform() appends the rawPrediction, probability and prediction columns.
validpredicts = lrmodel.transform(adultvalid)
validpredicts.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(103,[0,1,3,4,5,6...|  0.0|[0.97669000865399...|[0.72645094822246...|       0.0|
|(103,[0,1,3,4,5,6...|  1.0|[-1.0044813089338...|[0.26806125539336...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[4.08329268568583...|[0.98342739344473...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[1.14195102952191...|[0.75803767026185...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[1.82526226538037...|[0.86119635988910...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[3.76264666426236...|[0.97730483402034...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[3.37956311766956...|[0.96705969104759...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[-2.2430535163278...|[0.09595034088688...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[0.07207379428718...|[0.51801065268758...|       0.0|
|(103,[0,1,3,4,5

## Output
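rawPrediction : a vector of two raw scores (log-odds) - the first for the sample not belonging to the category (making > 50,000), the second for it belonging. The two values are negatives of each other.

probability : a vector of two probabilities - the first that the sample is not in the category, the second that it is. The two values sum to 1.

prediction : the predicted class label (0.0 or 1.0), i.e. the class with the higher probability.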

In [23]:
# Look at a bounded sample only: collect() would pull every validation row to the
# driver and flood the cell output.
validpredicts.select("rawPrediction").take(20)

[Row(rawPrediction=DenseVector([0.9767, -0.9767])),
 Row(rawPrediction=DenseVector([-1.0045, 1.0045])),
 Row(rawPrediction=DenseVector([4.0833, -4.0833])),
 Row(rawPrediction=DenseVector([1.142, -1.142])),
 Row(rawPrediction=DenseVector([1.8253, -1.8253])),
 Row(rawPrediction=DenseVector([3.7626, -3.7626])),
 Row(rawPrediction=DenseVector([3.3796, -3.3796])),
 Row(rawPrediction=DenseVector([-2.2431, 2.2431])),
 Row(rawPrediction=DenseVector([0.0721, -0.0721])),
 Row(rawPrediction=DenseVector([2.3297, -2.3297])),
 Row(rawPrediction=DenseVector([1.0571, -1.0571])),
 Row(rawPrediction=DenseVector([-2.1568, 2.1568])),
 Row(rawPrediction=DenseVector([-2.3964, 2.3964])),
 Row(rawPrediction=DenseVector([2.2462, -2.2462])),
 Row(rawPrediction=DenseVector([0.0075, -0.0075])),
 Row(rawPrediction=DenseVector([1.7858, -1.7858])),
 Row(rawPrediction=DenseVector([-1.6849, 1.6849])),
 Row(rawPrediction=DenseVector([1.9546, -1.9546])),
 Row(rawPrediction=DenseVector([-0.7803, 0.7803])),
 Row(rawPredic

In [24]:
# Look at a bounded sample only: collect() would pull every validation row to the
# driver and flood the cell output.
validpredicts.select("probability").take(20)

[Row(probability=DenseVector([0.7265, 0.2735])),
 Row(probability=DenseVector([0.2681, 0.7319])),
 Row(probability=DenseVector([0.9834, 0.0166])),
 Row(probability=DenseVector([0.758, 0.242])),
 Row(probability=DenseVector([0.8612, 0.1388])),
 Row(probability=DenseVector([0.9773, 0.0227])),
 Row(probability=DenseVector([0.9671, 0.0329])),
 Row(probability=DenseVector([0.096, 0.904])),
 Row(probability=DenseVector([0.518, 0.482])),
 Row(probability=DenseVector([0.9113, 0.0887])),
 Row(probability=DenseVector([0.7421, 0.2579])),
 Row(probability=DenseVector([0.1037, 0.8963])),
 Row(probability=DenseVector([0.0835, 0.9165])),
 Row(probability=DenseVector([0.9043, 0.0957])),
 Row(probability=DenseVector([0.5019, 0.4981])),
 Row(probability=DenseVector([0.8564, 0.1436])),
 Row(probability=DenseVector([0.1565, 0.8435])),
 Row(probability=DenseVector([0.8759, 0.1241])),
 Row(probability=DenseVector([0.3143, 0.6857])),
 Row(probability=DenseVector([0.042, 0.958])),
 Row(probability=DenseVector

## Evaluate the model.

In [25]:
#Evaluate the model. default metric : Area Under ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# A freshly constructed evaluator reports its default metric.
bceval = BinaryClassificationEvaluator()
print(f"{bceval.getMetricName()}:{bceval.evaluate(validpredicts)}")

areaUnderROC:0.903142090070932


In [26]:
# Evaluate the model with the area under the precision-recall curve.
# This changes the metric on the existing bceval object, so bceval keeps
# reporting areaUnderPR wherever it is used afterwards.
bceval.setMetricName("areaUnderPR")
print(f"{bceval.getMetricName()}:{bceval.evaluate(validpredicts)}")

areaUnderPR:0.7517159927507255


### n-fold validation and the results.

In [27]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
# 5-fold cross-validation over the regularisation strength of lr.
# A fixed seed makes the assignment of rows to folds repeatable between runs.
# NOTE(review): bceval had its metric set to "areaUnderPR" in an earlier cell, so
# when the cells run in order the best model is selected by area under PR rather
# than area under ROC - confirm this is intended.
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5).setSeed(42)
# ParamGridBuilder() builds every combination of the listed parameter values.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [1000])
    .addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5])
    .build()
)
# setEstimatorParamMaps() takes the grid produced by ParamGridBuilder().
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)

In [28]:
# Report the winning model: coefficients, intercept and the maxIter / regParam
# values it was trained with, one per line.
print(
    cvmodel.bestModel.coefficients,
    cvmodel.bestModel.intercept,
    cvmodel.bestModel.getMaxIter(),
    cvmodel.bestModel.getRegParam(),
    sep="\n",
)

[0.021297828181081977,0.00030969138219662316,0.0006279913775355038,8.71338567253578e-07,0.029247767203774457,-0.6704817325732532,0.03729019931096098,-0.44290929046384764,0.04684543784365653,-0.10345676112335166,0.2760027309211615,0.6261367209482775,-0.40258521239372413,-4.422631687924233,-0.3437745602873477,0.044413153225726906,0.8176557305020619,1.2487957289276075,0.14521278064306828,-1.092869229400446,0.2926509515045579,-1.2015736124674894,-1.873480141094347,1.7698681961279554,-1.559549701719106,-1.0567545841113481,1.7350617936426145,-1.3561887264725383,-1.5508679774625609,-5.540979974815715,1.579661547268367,-1.1527011578102129,-0.7753508117089638,-0.7275241153214389,-0.595137356147819,-0.6313421049709882,1.7253262298262089,0.23314790411842942,0.05945280388506215,0.7288671816344141,-0.0842512545274601,0.19347874553404923,-0.8381334558551228,-0.27699346479048775,-0.08530649097444577,-0.54866646382122,-0.9782075845677468,0.483243263503285,0.34608401945444794,-1.5395364761052934,0.4565

In [29]:
# Score the cross-validated best model on the validation set with a fresh
# evaluator, i.e. with its default metric (area under ROC).
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))

0.9059650301194826

In [30]:
# Same evaluation of the best model, measured by area under the precision-recall curve.
BinaryClassificationEvaluator().setMetricName("areaUnderPR").evaluate(cvmodel.bestModel.transform(adultvalid))

0.7635582438122058

In [31]:
# NOTE(review): areaUnderROC is already the evaluator's default metric, so this
# repeats the earlier default-metric evaluation with the metric named explicitly.
BinaryClassificationEvaluator().setMetricName("areaUnderROC").evaluate(cvmodel.bestModel.transform(adultvalid))

0.9059737611609396

In [32]:
# Shut down the Spark session; cells that use ss or sc cannot run after this.
ss.stop()