In [9]:
from pyspark.sql import SparkSession
import pandas as pd 
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *
import math
ss = SparkSession.builder.getOrCreate()

In [10]:
ss.sql("DROP TABLE IF EXISTS test")

DataFrame[]

In [11]:
# Schema applied when reading the iris CSV: four nullable double measurements
# followed by the nullable string class label, in this column order.
irisSchema = StructType(
    [StructField(measurement, DoubleType(), True)
     for measurement in ("sepal_length", "sepal_width", "petal_length", "petal_width")]
    + [StructField("class", StringType(), True)]
)


In [12]:
iris = ss.read.csv('../Data/iris.csv', schema = irisSchema, header=False)

In [13]:
iris = iris.select('sepal_width','petal_width')

In [14]:
iris.show(5)

+-----------+-----------+
|sepal_width|petal_width|
+-----------+-----------+
|        3.5|        0.2|
|        3.0|        0.2|
|        3.2|        0.2|
|        3.1|        0.2|
|        3.6|        0.2|
+-----------+-----------+
only showing top 5 rows



# Split training and test set.

In [16]:
# Hold out roughly 10% of the rows for testing. randomSplit samples at random,
# so a fixed seed is supplied to make the split -- and the statistics computed
# from `train` -- repeatable when the notebook is re-run.
train, test = iris.randomSplit([0.9, 0.1], seed=42)
# `train` is aggregated several times (covariance, variance, means): cache it.
train.cache()
# test.write.saveAsTable("test")

DataFrame[sepal_width: double, petal_width: double]

# Calculate covariance and variance

In [17]:
covariance = train.cov('sepal_width', 'petal_width')
covariance

-0.11103140830800402

In [18]:
# Sample variance of the predictor column. The aggregate is named inside a SQL
# expression string instead of calling the star-imported `variance` function:
# this cell binds the Python name `variance` to a float, so calling
# variance(...) here would raise TypeError the second time the cell is run.
# The variable name is kept because later cells read `variance`.
variance = train.selectExpr("variance(sepal_width)").first()[0]
# variance

In [19]:
variance

0.19166058763931168

# Estimate coefficients and apply the equation
# where petal_width = coeff_0 * sepal_width + coeff_1 (x = sepal_width, y = petal_width)
# coeff_0 = covariance(x, y) / variance(x)
# coeff_1 = mean(y) - coeff_0 * mean(x)

In [20]:
coeff_0 = covariance/variance

In [21]:
# Intercept of the fitted line: mean(petal_width) - coeff_0 * mean(sepal_width).
# Both means are taken in one aggregation so `train` is scanned once instead of
# launching a separate job per column.
mean_petal_width, mean_sepal_width = train.select(mean("petal_width"), mean("sepal_width")).first()
coeff_1 = mean_petal_width - coeff_0 * mean_sepal_width


In [22]:
# test_output = ss.sql("SELECT sepal_width, petal_width, sepal_width * {0} + {1} AS prediction FROM test".format(coeff_0, coeff_1))

In [14]:
# test_output.show()

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

In [25]:
def toDoubleSafe(v):
    """Convert ``v`` to a float when possible, otherwise to a string.

    Args:
        v: The value to convert; anything accepted by ``float()`` or ``str()``.

    Returns:
        ``float(v)`` if that conversion succeeds, else ``str(v)`` when
        ``float`` rejects the value with ``ValueError``. Any other exception
        raised by ``float`` propagates to the caller.
    """
    try:
        as_float = float(v)
    except ValueError:
        # Not parseable as a number: hand the value back as text.
        return str(v)
    return as_float

# Create an RDD

In [28]:
# Load the census file as an RDD of text lines (4 is passed as textFile's
# second argument) and parse each line in a single map: split it on commas,
# then convert every field to a float where possible, else keep it as a string.
census_raw = sc.textFile("../Data/adult.raw", 4).map(
    lambda line: [toDoubleSafe(field) for field in line.split(",")]
)
# census_raw.take(5)

# Create DataFrame 

In [31]:
from pyspark.sql.types import *
# create adult schema 

# Schema of the census DataFrame: every (column name, Spark type) pair below
# becomes one nullable field, in the listed order.
adultschema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in [
        ("age", DoubleType),
        ("workclass", StringType),
        ("fnlwgt", DoubleType),
        ("education", StringType),
        ("marital_status", StringType),
        ("occupation", StringType),
        ("relationship", StringType),
        ("race", StringType),
        ("sex", StringType),
        ("capital_gain", DoubleType),
        ("capital_loss", DoubleType),
        ("hours_per_week", DoubleType),
        ("native_country", StringType),
        ("income", StringType),
    ]
])

In [32]:
dfraw = ss.createDataFrame(census_raw, adultschema)

In [33]:
dfraw.show(5)

+----+-----------------+--------+----------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
| age|        workclass|  fnlwgt| education|     marital_status|        occupation|  relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+-----------------+--------+----------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|39.0|        State-gov| 77516.0| Bachelors|      Never-married|      Adm-clerical| Not-in-family| White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0| Self-emp-not-inc| 83311.0| Bachelors| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|          Private|215646.0|   HS-grad|           Divorced| Handlers-cleaners| Not-in-family| White|   Male|  

# Calculate the count values of the columns with missing values to get the most common ones 

In [21]:
# For each column inspected for missing values, list its five most frequent
# values, most common first.
for column_name in ("workclass", "occupation", "native_country"):
    dfraw.groupBy(column_name).count().orderBy("count", ascending=False).show(5)


+-----------------+-----+
|        workclass|count|
+-----------------+-----+
|          Private|33906|
| Self-emp-not-inc| 3862|
|        Local-gov| 3136|
|                ?| 2799|
|        State-gov| 1981|
+-----------------+-----+
only showing top 5 rows

+----------------+-----+
|      occupation|count|
+----------------+-----+
|  Prof-specialty| 6172|
|    Craft-repair| 6112|
| Exec-managerial| 6086|
|    Adm-clerical| 5611|
|           Sales| 5504|
+----------------+-----+
only showing top 5 rows

+--------------+-----+
|native_country|count|
+--------------+-----+
| United-States|43832|
|        Mexico|  951|
|             ?|  857|
|   Philippines|  295|
|       Germany|  206|
+--------------+-----+
only showing top 5 rows



# clean the data

## Missing data imputation

In [22]:
#  replace the ? --> dfraw.replace('value_toreplace','replacing value', 'column_name' )
dfraw_1 = dfraw.replace('?', 'Private', 'workclass')


In [23]:
# Check whether any "?" is left in workclass: an empty table means every
# placeholder has already been replaced.
dfraw_1.filter(dfraw_1["workclass"] == "?").show()

+---+---------+------+---------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|marital_status|occupation|relationship|race|sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+---------+------+---------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
+---+---------+------+---------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+



In [24]:
# Replace the remaining "?" placeholders, one column per call:
# DataFrame.replace(value_to_replace, replacement, column_name).
occupation_filled = dfraw_1.replace('?', 'Prof-specialty', 'occupation')
dfrawnona = occupation_filled.replace('?', 'United-States', 'native_country')

In [25]:
# Check whether any "?" is left in occupation: an empty table means it was replaced.
dfrawnona.filter(dfrawnona["occupation"] == "?").show(5)

+---+---------+------+---------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|marital_status|occupation|relationship|race|sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+---------+------+---------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
+---+---------+------+---------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+



In [26]:
dfrawnona.show(5)

+----+-----------------+--------+----------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
| age|        workclass|  fnlwgt| education|     marital_status|        occupation|  relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+-----------------+--------+----------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|39.0|        State-gov| 77516.0| Bachelors|      Never-married|      Adm-clerical| Not-in-family| White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0| Self-emp-not-inc| 83311.0| Bachelors| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|          Private|215646.0|   HS-grad|           Divorced| Handlers-cleaners| Not-in-family| White|   Male|  

# Convert string columns to numeric values
## Logistic regression works on numeric values, so the string columns have to be converted to numeric category indices

In [27]:
# check the string values 
dfrawnona.printSchema() # We have education, workclass, occupation, relationship, race, sex, native_country, income

root
 |-- age: double (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [28]:
# Convert strings to categorical numeric values
from pyspark.ml.feature import StringIndexer # import StringIndexer

def indexStringColumn(df, cols):
    """Replace each listed column with its StringIndexer-encoded version.

    For every name in ``cols`` a StringIndexer is fitted on the current frame
    and writes its result to a temporary ``<name>-num`` column; the original
    column is then dropped and the temporary column renamed back to ``<name>``.

    Args:
        df: DataFrame holding the columns to convert.
        cols: Iterable of column names to index.

    Returns:
        The DataFrame with the listed columns replaced by their indexed
        values. When ``cols`` is empty, ``df`` itself is returned.
    """
    indexed = df
    for name in cols:
        temp_name = name + '-num'
        # StringIndexer is an estimator: fit it first, then apply the model.
        indexer_model = StringIndexer(inputCol=name, outputCol=temp_name).fit(indexed)
        indexed = (
            indexer_model.transform(indexed)
            .drop(name)
            .withColumnRenamed(temp_name, name)
        )
    return indexed
# Index every string column of the cleaned frame, the income column included.
dfnumeric = indexStringColumn(
    dfrawnona,
    [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native_country",
        "income",
    ],
)
        


In [29]:
dfnumeric.show(5)

+----+--------+------------+------------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|workclass|education|marital_status|occupation|relationship|race|sex|native_country|income|
+----+--------+------------+------------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|      4.0|      2.0|           1.0|       3.0|         1.0| 0.0|0.0|           0.0|   0.0|
|50.0| 83311.0|         0.0|         0.0|          13.0|      1.0|      2.0|           0.0|       2.0|         0.0| 0.0|0.0|           0.0|   0.0|
|38.0|215646.0|         0.0|         0.0|          40.0|      0.0|      0.0|           2.0|       9.0|         1.0| 0.0|0.0|           0.0|   0.0|
|53.0|234721.0|         0.0|         0.0|          40.0|      0.0|      5.0|           0.0|       9.0|         0.0| 1.

In [30]:
# check if all the values are changed to numeric values 
dfnumeric.printSchema()

root
 |-- age: double (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- workclass: double (nullable = false)
 |-- education: double (nullable = false)
 |-- marital_status: double (nullable = false)
 |-- occupation: double (nullable = false)
 |-- relationship: double (nullable = false)
 |-- race: double (nullable = false)
 |-- sex: double (nullable = false)
 |-- native_country: double (nullable = false)
 |-- income: double (nullable = false)



In [31]:
# OneHotEncoder
# Expand each column into as many columns as it has distinct values; for every row exactly one of them is 1 and the others are 0

In [32]:
from pyspark.ml.feature import OneHotEncoder

def oneHotEncodeColumns(df, cols):
    """Replace each listed column with its one-hot encoded vector.

    For every name in ``cols`` a OneHotEncoder is fitted on the current frame
    and writes its result to a temporary ``<name>-onehot`` column; the
    original column is then dropped and the temporary column renamed back to
    ``<name>``.

    Args:
        df: DataFrame holding the columns to encode.
        cols: Iterable of column names to encode.

    Returns:
        The DataFrame with the listed columns replaced by their encoded
        vectors. When ``cols`` is empty, ``df`` itself is returned.
    """
    encoded = df
    for name in cols:
        temp_name = name + "-onehot"
        # dropLast=False: do not drop the last category from the encoded
        # vector (the encoder's default is to drop it).
        encoder = OneHotEncoder(inputCol=name, outputCol=temp_name, dropLast=False)
        encoder_model = encoder.fit(encoded)
        encoded = (
            encoder_model.transform(encoded)
            .drop(name)
            .withColumnRenamed(temp_name, name)
        )
    return encoded

# One-hot encode the listed indexed columns of dfnumeric.
dfhot = oneHotEncodeColumns(
    dfnumeric,
    [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "native_country",
    ],
)


In [33]:
dfhot.select('education','native_country','marital_status' ).show(5)

+--------------+--------------+--------------+
|     education|native_country|marital_status|
+--------------+--------------+--------------+
|(16,[2],[1.0])|(42,[0],[1.0])| (7,[1],[1.0])|
|(16,[2],[1.0])|(42,[0],[1.0])| (7,[0],[1.0])|
|(16,[0],[1.0])|(42,[0],[1.0])| (7,[2],[1.0])|
|(16,[5],[1.0])|(42,[0],[1.0])| (7,[0],[1.0])|
|(16,[2],[1.0])|(42,[9],[1.0])| (7,[0],[1.0])|
+--------------+--------------+--------------+
only showing top 5 rows



In [34]:
dfhot.show(5)

+----+--------+------------+------------+--------------+---+------+-------------+--------------+--------------+--------------+-------------+-------------+--------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|sex|income|    workclass|     education|marital_status|    occupation| relationship|         race|native_country|
+----+--------+------------+------------+--------------+---+------+-------------+--------------+--------------+--------------+-------------+-------------+--------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|0.0|   0.0|(9,[4],[1.0])|(16,[2],[1.0])| (7,[1],[1.0])|(15,[3],[1.0])|(6,[1],[1.0])|(5,[0],[1.0])|(42,[0],[1.0])|
|50.0| 83311.0|         0.0|         0.0|          13.0|0.0|   0.0|(9,[1],[1.0])|(16,[2],[1.0])| (7,[0],[1.0])|(15,[2],[1.0])|(6,[0],[1.0])|(5,[0],[1.0])|(42,[0],[1.0])|
|38.0|215646.0|         0.0|         0.0|          40.0|0.0|   0.0|(9,[0],[1.0])|(16,[0],[1.0])| (7,[2],[1.0])|(15,[9],[1.0])|(6,[1],[1.0])|(5,[0],[1.

# Create a feature Vector through VectorAssembler

Merge all the new vectors and the original columns into a single vector. 
◦Useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees.  
◦ML algorithms work with two columns called features and label by default.

In [35]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
# Columns handed to the assembler, in the order they are listed here.
input_cols = [
    "age", "capital_gain", "capital_loss", "fnlwgt", "hours_per_week", "sex",
    "workclass", "education", "marital_status", "occupation", "relationship",
    "native_country", "race",
]

# VectorAssembler takes a list of column names (inputCols) and an output column
# name (outputCol), and transforms a DataFrame by assembling the inputCols
# values into one single vector stored in outputCol.
va = VectorAssembler(inputCols=input_cols, outputCol="features")
assembled = va.transform(dfhot)
# lpoints - labeled data: the feature vector plus income renamed to "label".
lpoints = assembled.select("features", "income").withColumnRenamed("income", "label")

In [36]:
lpoints.rdd.take(5)

[Row(features=SparseVector(106, {0: 39.0, 1: 2174.0, 3: 77516.0, 4: 40.0, 10: 1.0, 17: 1.0, 32: 1.0, 41: 1.0, 54: 1.0, 59: 1.0, 101: 1.0}), label=0.0),
 Row(features=SparseVector(106, {0: 50.0, 3: 83311.0, 4: 13.0, 7: 1.0, 17: 1.0, 31: 1.0, 40: 1.0, 53: 1.0, 59: 1.0, 101: 1.0}), label=0.0),
 Row(features=SparseVector(106, {0: 38.0, 3: 215646.0, 4: 40.0, 6: 1.0, 15: 1.0, 33: 1.0, 47: 1.0, 54: 1.0, 59: 1.0, 101: 1.0}), label=0.0),
 Row(features=SparseVector(106, {0: 53.0, 3: 234721.0, 4: 40.0, 6: 1.0, 20: 1.0, 31: 1.0, 47: 1.0, 53: 1.0, 59: 1.0, 102: 1.0}), label=0.0),
 Row(features=SparseVector(106, {0: 28.0, 3: 338409.0, 4: 40.0, 5: 1.0, 6: 1.0, 17: 1.0, 31: 1.0, 38: 1.0, 57: 1.0, 68: 1.0, 102: 1.0}), label=0.0)]

In [37]:
lpoints.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(106,[0,1,3,4,10,...|  0.0|
|(106,[0,3,4,7,17,...|  0.0|
|(106,[0,3,4,6,15,...|  0.0|
|(106,[0,3,4,6,20,...|  0.0|
|(106,[0,3,4,5,6,1...|  0.0|
+--------------------+-----+
only showing top 5 rows



# Divide the dataset into training and validation sets.

In [38]:
lpoints.count()

48842

In [39]:
# Split the labeled points roughly 80/20 into training and validation sets.
# randomSplit samples at random, so a fixed seed is supplied to make the split
# repeatable when the notebook is re-run.
splits = lpoints.randomSplit([0.8, 0.2], seed=42)

# cache(): the algorithm is iterative, and the training and validation sets
# are going to be reused many times.
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

In [48]:
# Persist both splits as tables. The writer's default save mode raises an error
# when the table already exists, which makes this cell fail on a second run;
# overwrite mode keeps it re-runnable.
adulttrain.write.mode("overwrite").saveAsTable("adulttrain")
adultvalid.write.mode("overwrite").saveAsTable("adultvalid")

# Train the model.

In [41]:
#Train the model.
from pyspark.ml.classification import LogisticRegression
# Configure the estimator, then fit it on the training split; fit() returns
# the trained model.
lr = LogisticRegression(
    maxIter=1000,
    regParam=0.01,
    fitIntercept=True,
)
lrmodel = lr.fit(adulttrain)

#The above lines are same as..
#lr = LogisticRegression()
#lrmodel = lr.setParams(regParam=0.01, maxIter=1000, fitIntercept=True).fit(adulttrain)

# Interpret the model parameters.

In [47]:

#Interpret the model parameters
# print(lrmodel.coefficients)
print(lrmodel.intercept)

-4.402248748487637


In [43]:
#Evaluate the model using the validation dataset.
#First, transform the validation set with the fitted model.
validpredicts = lrmodel.transform(adultvalid)
validpredicts.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(106,[0,1,3,4,5,6...|  0.0|[-0.4272191290476...|[0.39479057301260...|       1.0|
|(106,[0,1,3,4,5,6...|  0.0|[-0.3200456820689...|[0.42066461482132...|       1.0|
|(106,[0,1,3,4,5,6...|  0.0|[0.29589415750928...|[0.57343850402302...|       0.0|
|(106,[0,1,3,4,5,6...|  0.0|[3.14568216857700...|[0.95873824969205...|       0.0|
|(106,[0,1,3,4,5,6...|  0.0|[2.96539417553716...|[0.95098603757053...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



# Evaluate Our Model 

In [46]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Score the validation predictions with the evaluator's configured metric and
# print it as "<metric name>:<value>".
bceval = BinaryClassificationEvaluator()
metric_name = bceval.getMetricName()
metric_value = bceval.evaluate(validpredicts)
print(metric_name + ":" + str(metric_value))

areaUnderROC:0.8967703011571972
