# Spark Logistic Regression

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('mylogreg').getOrCreate()

CPATH = "/home/bm/spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/"

data = spark.read.format('libsvm').load(os.path.join(CPATH,'Logistic_Regression/sample_libsvm_data.txt'))
titanic = spark.read.csv(os.path.join(CPATH,'Logistic_Regression/titanic.csv'),inferSchema=True,header=True)
churn = spark.read.csv(os.path.join(CPATH,'Logistic_Regression/customer_churn.csv'),inferSchema=True,header=True)
newbs = spark.read.csv(os.path.join(CPATH,'Logistic_Regression/new_customers.csv'),inferSchema=True,header=True)

In [2]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [3]:
my_log_reg_model = LogisticRegression()

In [4]:
fitted_logreg = my_log_reg_model.fit(data)

In [5]:
log_summary = fitted_logreg.summary

In [6]:
log_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [7]:
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[19.8534775947478...|[0.99999999761359...|       0.0|
|  1.0|(692,[158,159,160...|[-20.377398194908...|[1.41321555111048...|       1.0|
|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865126969...|       1.0|
|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170264...|       1.0|
|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200596...|       1.0|
|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|
|  1.0|(692,[158,159,160...|[-20.337256674833...|[1.47109814695572...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102604...|       1.0|
|  0.0|(692,[154,155,156...|[19.2708803215612...|[0.99999999572670...|       0.0|
|  0.0|(692,[127

In [8]:
train, test = data.randomSplit([0.7,0.3])

In [9]:
final_model = LogisticRegression()

In [10]:
fit_final = final_model.fit(train)

In [11]:
prediction_and_labels = fit_final.evaluate(test)

In [12]:
prediction_and_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[122,123,124...|[19.1921741025337...|[0.99999999537677...|       0.0|
|  0.0|(692,[123,124,125...|[17.3792087790571...|[0.99999997166615...|       0.0|
|  0.0|(692,[124,125,126...|[46.9997749871722...|[1.0,3.8748694259...|       0.0|
|  0.0|(692,[124,125,126...|[17.1383214567568...|[0.99999996394864...|       0.0|
|  0.0|(692,[126,127,128...|[16.7460035505054...|[0.99999994662928...|       0.0|
|  0.0|(692,[126,127,128...|[25.8969551149472...|[0.99999999999433...|       0.0|
|  0.0|(692,[127,128,129...|[21.6762712291015...|[0.99999999961441...|       0.0|
|  0.0|(692,[129,130,131...|[14.6904797754720...|[0.99999958312532...|       0.0|
|  0.0|(692,[153,154,155...|[27.5232630287795...|[0.99999999999888...|       0.0|
|  0.0|(692,[153

In [13]:
my_eval = BinaryClassificationEvaluator()

In [14]:
my_final_roc = my_eval.evaluate(prediction_and_labels.predictions)

In [15]:
my_final_roc

1.0

### Titanic

In [16]:
titanic.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [17]:
titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [18]:
titanic.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [19]:
my_cols = titanic.select([ 'Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'])

In [20]:
my_final_data = my_cols.na.drop()

In [21]:
gender_indexer = StringIndexer(inputCol="Sex",outputCol="SexIndex")
gender_encoder = OneHotEncoder(inputCol="SexIndex",outputCol="SexVec")

In [22]:
embark_indexer = StringIndexer(inputCol="Embarked",outputCol="EmbarkIndex")
embark_encoder = OneHotEncoder(inputCol="EmbarkIndex",outputCol="EmbarkVec")

In [23]:
assembler = VectorAssembler(inputCols=["Pclass","SexVec","EmbarkVec","Age","SibSp","Parch","Fare"],
                           outputCol="features")

In [24]:
log_reg_titanic = LogisticRegression(featuresCol="features",labelCol="Survived")

In [25]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, 
                            embark_encoder, assembler, log_reg_titanic])

In [26]:
train, test = my_final_data.randomSplit([0.7,0.3])

In [27]:
fit_model = pipeline.fit(train)

In [28]:
results = fit_model.transform(test)

In [29]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="Survived")

In [30]:
results.select("Survived","Prediction").show()

+--------+----------+
|Survived|Prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [31]:
AUC = my_eval.evaluate(results)

In [32]:
AUC

0.7612297496318114

### Customer Churn

In [33]:
churn.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [34]:
churn.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [35]:
churn.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [36]:
churn.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [37]:
newbs.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [38]:
assembler = VectorAssembler(inputCols=['Age','Total_Purchase','Account_Manager','Years','Num_Sites'],
                           outputCol="features")

In [39]:
output = assembler.transform(churn)

In [40]:
final_data = output.select('features','churn')

In [41]:
train, test = final_data.randomSplit([0.7,0.3])

In [42]:
lr_churn = LogisticRegression(labelCol='churn')

In [43]:
fitted_churn_model = lr_churn.fit(train)

In [44]:
training_sum = fitted_churn_model.summary

In [45]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                614|                614|
|   mean|0.16123778501628663|0.12866449511400652|
| stddev| 0.3680499719674389|0.33510122374678997|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [46]:
pred_and_labels = fitted_churn_model.evaluate(test)

In [47]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[0.44341008647343...|[0.60907128544157...|       0.0|
|[27.0,8628.8,1.0,...|    0|[5.30838771862706...|[0.99507447991572...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.34579118969599...|[0.79344068737502...|       0.0|
|[28.0,11128.95,1....|    0|[4.02218003898786...|[0.98240138979851...|       0.0|
|[29.0,11274.46,1....|    0|[4.36492834553542...|[0.98744408924557...|       0.0|
|[29.0,12711.15,0....|    0|[5.32395502997329...|[0.99515019428770...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.50977318465689...|[0.97096457038382...|       0.0|
|[30.0,8677.28,1.0...|    0|[3.97510197664810...|[0.98156870500842...|       0.0|
|[30.0,10744.14,1....|    1|[1.53642403710269...|[0.82294428401206...|       0.0|
|[30.0,12788.37,

In [48]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='churn')

In [49]:
auc = churn_eval.evaluate(pred_and_labels.predictions)

In [50]:
auc

0.7455569461827284

### predict on new data

In [51]:
final_lr_model = lr_churn.fit(final_data)

In [52]:
test_new_customers = assembler.transform(newbs)

In [53]:
final_results = final_lr_model.transform(test_new_customers)

In [54]:
final_results.select('Company','prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+



In [55]:
test_new_customers.describe().show()

+-------+-------------+------------------+-----------------+------------------+-----------------+------------------+--------------------+----------------+
|summary|        Names|               Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|         Company|
+-------+-------------+------------------+-----------------+------------------+-----------------+------------------+--------------------+----------------+
|  count|            6|                 6|                6|                 6|                6|                 6|                   6|               6|
|   mean|         null|35.166666666666664|7607.156666666667|0.8333333333333334|6.808333333333334|12.333333333333334|                null|            null|
| stddev|         null| 15.71517313511584|4346.008232825459| 0.408248290463863|3.708737880555414|3.3862466931200785|                null|            null|
|    min|Andrew Mccall|              22.0|            100.0|          