# Spark Logistic Regression

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('mylogreg').getOrCreate()

CPATH = "/home/bm/spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/"

data = spark.read.format('libsvm').load(os.path.join(CPATH,'Logistic_Regression/sample_libsvm_data.txt'))
titanic = spark.read.csv(os.path.join(CPATH,'Logistic_Regression/titanic.csv'),inferSchema=True,header=True)

In [2]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [3]:
my_log_reg_model = LogisticRegression()

In [4]:
fitted_logreg = my_log_reg_model.fit(data)

In [5]:
log_summary = fitted_logreg.summary

In [6]:
log_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [7]:
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[19.8534775947478...|[0.99999999761359...|       0.0|
|  1.0|(692,[158,159,160...|[-20.377398194908...|[1.41321555111048...|       1.0|
|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865126969...|       1.0|
|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170264...|       1.0|
|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200596...|       1.0|
|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|
|  1.0|(692,[158,159,160...|[-20.337256674833...|[1.47109814695572...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102604...|       1.0|
|  0.0|(692,[154,155,156...|[19.2708803215612...|[0.99999999572670...|       0.0|
|  0.0|(692,[127

In [8]:
train, test = data.randomSplit([0.7,0.3])

In [9]:
final_model = LogisticRegression()

In [10]:
fit_final = final_model.fit(train)

In [11]:
prediction_and_labels = fit_final.evaluate(test)

In [12]:
prediction_and_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[24.1362385764146...|[0.99999999996705...|       0.0|
|  0.0|(692,[123,124,125...|[33.9401550256967...|[0.99999999999999...|       0.0|
|  0.0|(692,[123,124,125...|[23.0124765105226...|[0.99999999989865...|       0.0|
|  0.0|(692,[124,125,126...|[33.7636057250169...|[0.99999999999999...|       0.0|
|  0.0|(692,[126,127,128...|[20.3264401898202...|[0.99999999851290...|       0.0|
|  0.0|(692,[126,127,128...|[33.8269860949887...|[0.99999999999999...|       0.0|
|  0.0|(692,[126,127,128...|[23.0589464158829...|[0.99999999990325...|       0.0|
|  0.0|(692,[126,127,128...|[31.5352925439851...|[0.99999999999997...|       0.0|
|  0.0|(692,[126,127,128...|[22.9443411046592...|[0.99999999989150...|       0.0|
|  0.0|(692,[127

In [13]:
my_eval = BinaryClassificationEvaluator()

In [14]:
my_final_roc = my_eval.evaluate(prediction_and_labels.predictions)

In [15]:
my_final_roc

1.0

### Titanic

In [16]:
titanic.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [17]:
titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [18]:
titanic.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [19]:
my_cols = titanic.select([ 'Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'])

In [20]:
my_final_data = my_cols.na.drop()

In [21]:
gender_indexer = StringIndexer(inputCol="Sex",outputCol="SexIndex")
gender_encoder = OneHotEncoder(inputCol="SexIndex",outputCol="SexVec")

In [22]:
embark_indexer = StringIndexer(inputCol="Embarked",outputCol="EmbarkIndex")
embark_encoder = OneHotEncoder(inputCol="EmbarkIndex",outputCol="EmbarkVec")

In [23]:
assembler = VectorAssembler(inputCols=["Pclass","SexVec","EmbarkVec","Age","SibSp","Parch","Fare"],
                           outputCol="features")

In [24]:
log_reg_titanic = LogisticRegression(featuresCol="features",labelCol="Survived")

In [25]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, 
                            embark_encoder, assembler, log_reg_titanic])

In [26]:
train, test = my_final_data.randomSplit([0.7,0.3])

In [27]:
fit_model = pipeline.fit(train)

In [28]:
results = fit_model.transform(test)

In [29]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="Survived")

In [30]:
results.select("Survived","Prediction").show()

+--------+----------+
|Survived|Prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [31]:
AUC = my_eval.evaluate(results)

In [32]:
AUC

0.7947080291970803