Run on DataBricks

In [2]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [6]:
# original Dataset not available & no GitHUB repo in study material, so use Titanic dataset
# file_location = r'/FileStore/tables/data.csv'
file_location = 'data.csv'
spark = SparkSession.builder.appName('Logistic_regression_practice').getOrCreate()
df = spark.read.csv(file_location, inferSchema=True, sep=',', header=True)
df.show(10, False)

+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                               |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                            |male  |22.0|1    |0    |A/5 21171       |7.25   |null |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                             |female|26.0|0    |0    |STON/O2. 3101282|7.925  |null |S       |
|4          |1       |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)       |female|35.0|1    |0    |113803          |53

In [7]:
print((df.count(), len(df.columns)))

(891, 12)


In [8]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [12]:

df.groupBy('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [14]:
df.groupBy('Embarked').mean().show()

+--------+------------------+-------------------+------------------+------------------+-------------------+-------------------+------------------+
|Embarked|  avg(PassengerId)|      avg(Survived)|       avg(Pclass)|          avg(Age)|         avg(SibSp)|         avg(Parch)|         avg(Fare)|
+--------+------------------+-------------------+------------------+------------------+-------------------+-------------------+------------------+
|       Q| 417.8961038961039|0.38961038961038963| 2.909090909090909|28.089285714285715|0.42857142857142855|0.16883116883116883|13.276029870129872|
|    null|             446.0|                1.0|               1.0|              50.0|                0.0|                0.0|              80.0|
|       C|445.35714285714283| 0.5535714285714286|1.8869047619047619| 30.81476923076923| 0.3869047619047619| 0.3630952380952381| 59.95414404761905|
|       S|  449.527950310559|0.33695652173913043|2.3509316770186337| 29.44539711191336| 0.5714285714285714|0.413043478

Index the Columns SEX & EMBARKED

In [16]:

si_sex = StringIndexer(inputCol='Sex', outputCol='sex_index')
df = si_sex.fit(df).transform(df)

si_embarked = StringIndexer(inputCol='Embarked', outputCol='embarked_index')
df = si_embarked.fit(df).transform(df)

df.show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+--------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|sex_index|embarked_index|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+--------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|      0.0|           0.0|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|      1.0|           1.0|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|      1.0|           0.0|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|      1.0|           0.0|
|          5|       

OneHotEncode the Embarked & Sex index columns.

In [18]:
encoder = OneHotEncoder(inputCols=['sex_index', 'embarked_index'], outputCols=['sex_vec', 'embarked_vec'])
df = encoder.fit(df).transform(df)
df.show(10, False)

+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+--------------+-------------+-------------+
|PassengerId|Survived|Pclass|Name                                               |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|sex_index|embarked_index|sex_vec      |embarked_vec |
+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+---------+--------------+-------------+-------------+
|1          |0       |3     |Braund, Mr. Owen Harris                            |male  |22.0|1    |0    |A/5 21171       |7.25   |null |S       |0.0      |0.0           |(1,[0],[1.0])|(2,[0],[1.0])|
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |1.0      |1.0           |(1,[],[])    |(2,[1],[1.0])|
|3   

In [21]:
df.groupBy('sex_vec').count().orderBy('count', ascending=False).show(5, False)

+-------------+-----+
|sex_vec      |count|
+-------------+-----+
|(1,[0],[1.0])|577  |
|(1,[],[])    |314  |
+-------------+-----+



Use VectorAssembler to group features

In [44]:

assembler = VectorAssembler(inputCols=['sex_vec', 'embarked_vec', 'Pclass', 'SibSp'], outputCol='features')
df = assembler.transform(df)
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+---------+--------------+-------------+-------------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|sex_index|embarked_index|      sex_vec| embarked_vec|            features|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+---------+--------------+-------------+-------------+--------------------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|PC 17599|71.2833|  C85|       C|      1.0|           1.0|    (1,[],[])|(2,[1],[1.0])|[0.0,0.0,1.0,1.0,...|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|  113803|   53.1| C123|       S|      1.0|           0.0|    (1,[],[])|(2,[0],[1.0])|[0.0,1.0,0.0,1.0,...|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|   17463|51.8625|  E4

In [53]:
model_df = df.select(['features', 'Survived'])
train_df, test_df = model_df.randomSplit([0.75, 0.25])
model_df.columns

['features', 'Survived']

In [None]:

log_reg = LogisticRegression(labelCol='Survived').fit(train_df)
train_results = log_reg.evaluate(train_df).predictions
train_results.printSchema()

Using a Pipeline would be more efficient, so to summarize the steps

In [None]:

# ... read data
# ...split to train & test
# create the stages
stage_1 = StringIndexer(inputCol= 'Sex', outputCol='Sex_index')
stage_2 = StringIndexer(inputCol= 'Embarked',outputCol='Embarked_index')
stage_3 = OneHotEncoder(inputCols=[stage_1.getOutputCol(),stage_2.getOutputCol()], outputCols= ['Sex_vec', 'Embarked_vec'])
stage_4 = VectorAssembler(inputCols=['Pclass', 'Age', 'SibSp','Sex_vec','Embarked_vec'], outputCol='features')
stage_5 = LogisticRegression(featuresCol='features', labelCol='Survived')

# initialize the Pipeline
log_reg_pipeline = Pipeline(stages= [stage_1, stage_2, stage_3,stage_4, stage_5])

# fit & transform
model = log_reg_pipeline.fit(train_df)
train_df = model.transform(train_df)
test_df=model.transform(test_df)