In [0]:
#ML Lib and Spark

In [0]:
## Linear Regression

In [0]:
from pyspark.sql import SparkSession

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [0]:
training = spark.read.format("libsvm").load("/FileStore/tables/sample_linear_regression_data.txt")

In [0]:
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [0]:
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')


In [0]:
# Fit the model
lrModel = lr.fit(training)

In [0]:
# Print the coefficients and intercept for linear regression
print("Coefficients: {}".format(str(lrModel.coefficients))) # For each feature...
print('\n')
print("Intercept:{}".format(str(lrModel.intercept)))

Coefficients: [0.0073350710225801715,0.8313757584337543,-0.8095307954684084,2.441191686884721,0.5191713795290003,1.1534591903547016,-0.2989124112808717,-0.5128514186201779,-0.619712827067017,0.6956151804322931]


Intercept:0.14228558260358093


In [0]:
# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary

In [0]:
trainingSummary.meanAbsoluteError

Out[15]: 8.145215527783876

In [0]:
trainingSummary.residuals.show()
print("RMSE: {}".format(trainingSummary.rootMeanSquaredError))
print("r2: {}".format(trainingSummary.r2))

+-------------------+
|          residuals|
+-------------------+
|-11.011130022096554|
| 0.9236590911176538|
|-4.5957401897776675|
|  -20.4201774575836|
|-10.339160314788181|
|-5.9552091439610555|
|-10.726906349283922|
|  2.122807193191233|
|  4.077122222293811|
|-17.316168071241652|
| -4.593044343959059|
|  6.380476690746936|
| 11.320566035059846|
|-20.721971774534094|
| -2.736692773777401|
| -16.66886934252847|
|  8.242186378876315|
|-1.3723486332690233|
|-0.7060332131264666|
|-1.1591135969994064|
+-------------------+
only showing top 20 rows

RMSE: 10.16309157133015
r2: 0.027839179518600154


In [0]:
## Train and Test 

all_data =training
train_data,test_data = all_data.randomSplit([0.7,0.3])

In [0]:
unlabeled_data = test_data.select('features')

In [0]:
## Train on the Trainig data 
correct_model = lr.fit(train_data)

In [0]:
## test the result on un-seen data
test_results = correct_model.evaluate(test_data)

In [0]:
test_results.residuals.show()
print("RMSE: {}".format(test_results.rootMeanSquaredError))

+-------------------+
|          residuals|
+-------------------+
|-26.285913799585526|
|-20.765355211476624|
|-21.202898348980654|
|   -26.902343792194|
| -17.13037813185135|
|-19.558046902962957|
|-19.647860609302217|
| -19.60820312921036|
|-14.629032854020487|
|-16.661607478565585|
|-14.805713566770892|
|-13.732866471836163|
|-11.842710096869224|
|-12.543299458357382|
| -18.17607177242256|
|-13.149704543167338|
|-14.155188487236765|
| -11.10925960461663|
|-10.489889440249684|
|-12.926963191853392|
+-------------------+
only showing top 20 rows

RMSE: 11.268057811848049


In [0]:
test_results.r2adj

Out[23]: -0.16193832319267942

In [0]:
##
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
data = spark.read.csv("/FileStore/tables/Ecommerce_Customers.csv",inferSchema=True,header=True)

In [0]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [0]:
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [0]:
## Setting Up DataFrame for Machine Learning 


In [0]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns
# ("label","features")

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
data.columns

Out[9]: ['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [0]:
assembler = VectorAssembler(
    inputCols=["Avg Session Length", "Time on App", 
               "Time on Website",'Length of Membership'],
    outputCol="features")

In [0]:
output = assembler.transform(data)

In [0]:
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [0]:
output.select("features").show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [0]:
output.head(1)

Out[16]: [Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [0]:
final_data = output.select("features",'Yearly Amount Spent')

In [0]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [0]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [0]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                159|
|   mean| 504.09510685050554|
| stddev|  80.30004323604713|
|    min|  282.4712457199145|
|    max|  725.5848140556806|
+-------+-------------------+



In [0]:
# Create a Linear Regression Model object
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [0]:
# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data,)

In [0]:
# Print the coefficients and intercept for linear regression
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,lrModel.intercept))

Coefficients: [26.16924521868724,38.75396299813981,0.7487475407397122,61.84820849538005] Intercept: -1079.4824596492251


In [0]:
test_results = lrModel.evaluate(test_data)

In [0]:
# Interesting results....
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -2.695266829890045|
|-2.4585274502712195|
|  8.205416898672865|
| 0.6945064051791405|
| 19.526949533056325|
|  5.245199425582882|
| 4.9423392030358855|
|-3.0906456286352295|
|  3.704705748847516|
| -6.964985686519071|
| -5.242880339397743|
|   4.50708801856382|
| 19.253290402703215|
| -6.904307299124525|
| -4.955310251048502|
| 3.5157741177186494|
|  -4.13035196676816|
|-10.401759114420202|
|-0.8427509788284056|
|-1.5092546146863697|
+-------------------+
only showing top 20 rows



In [0]:
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))
print("R2: {}".format(test_results.r2))

RMSE: 10.093379555775577
MSE: 101.8763108569484
R2: 0.9841005650923851


In [0]:
## Logistic Regression

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('logregconsult').getOrCreate()

In [0]:
data = spark.read.csv('/FileStore/tables/customer_churn.csv',inferSchema=True , header=True)

In [0]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [0]:
data.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [0]:
## we format for ML LIB 
from pyspark.ml.feature import VectorAssembler

In [0]:
 # we choose as in linear reg the column we want to use
assembler = VectorAssembler(inputCols=['Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites'],outputCol='features')

In [0]:
output = assembler.transform(data)

In [0]:
final_data = output.select('features','churn')

In [0]:
train_churn,test_churn = final_data.randomSplit([0.7,0.3])

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
lr_churn = LogisticRegression(labelCol='churn')

In [0]:
fitted_churn_model = lr_churn.fit(train_churn)

In [0]:
predictions = fitted_churn_model.transform(test_churn)

In [0]:
training_sum = fitted_churn_model.summary

In [0]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                616|                616|
|   mean|0.16883116883116883|              0.125|
| stddev| 0.3749071612809894|0.33098768183794275|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [0]:
# Evaluation 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [0]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.18484049076904...|[0.98500367927964...|       0.0|
|[26.0,8787.39,1.0...|    1|[0.48982581561550...|[0.62006539809503...|       0.0|
|[28.0,8670.98,0.0...|    0|[7.13943492458497...|[0.99920742848010...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.32233603621495...|[0.78957009952153...|       0.0|
|[29.0,5900.78,1.0...|    0|[3.57891031147356...|[0.97285151737690...|       0.0|
|[30.0,8677.28,1.0...|    0|[3.75452055437244...|[0.97712389532874...|       0.0|
|[30.0,10960.52,1....|    0|[2.18671983045756...|[0.89905059219601...|       0.0|
|[31.0,5387.75,0.0...|    0|[2.36613286885554...|[0.91420804033189...|       0.0|
|[31.0,8688.21,0.0...|    0|[6.08012283453184...|[0.99771732691463...|       0.0|
|[31.0,10182.6,1

In [0]:
## AUC 
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')
auc = churn_eval.evaluate(pred_and_labels.predictions)
auc
                                           

Out[66]: 0.7959444647424186

In [0]:
## Predict on unseen data 

In [0]:
final_lr_model = lr_churn.fit(final_data)

In [0]:
final_results = final_lr_model.transform(test_churn)

In [0]:
final_results.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.62604102846073...|[0.99030152709041...|       0.0|
|[26.0,8787.39,1.0...|    1|[0.58379147553953...|[0.64193935739595...|       0.0|
|[28.0,8670.98,0.0...|    0|[7.81669975802044...|[0.99959721298165...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.48332795477865...|[0.81507472246572...|       0.0|
|[29.0,5900.78,1.0...|    0|[4.0675596878253,...|[0.98316901793628...|       0.0|
|[30.0,8677.28,1.0...|    0|[4.12525140140769...|[0.98409754309066...|       0.0|
|[30.0,10960.52,1....|    0|[2.38099181212439...|[0.91536630244322...|       0.0|
|[31.0,5387.75,0.0...|    0|[2.51766723717827...|[0.92537111537160...|       0.0|
|[31.0,8688.21,0.0...|    0|[6.65711730521018...|[0.99871680385537...|       0.0|
|[31.0,10182.6,1

In [0]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))


Test Error = 0.0774648
