In [1]:
# Load the HR dataset; the first line holds column names and column types are inferred.
hr_data = spark.read.csv(
    path='HR_comma_sep.csv',
    header=True,
    inferSchema=True,
)

In [2]:
# Peek at the first row to sanity-check the parsed columns and values.
hr_data.head()

Row(satisfaction_level=0.38, last_evaluation=0.53, number_project=2, average_montly_hours=157, time_spend_company=3, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low')

In [3]:
# Print the column names and the types produced by schema inference.
hr_data.printSchema()

root
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- sales: string (nullable = true)
 |-- salary: string (nullable = true)



In [4]:
# List the distinct values of the `sales` column
# (despite its name, it holds department names).
hr_data.select('sales').distinct().collect()

[Row(sales=u'management'),
 Row(sales=u'product_mng'),
 Row(sales=u'marketing'),
 Row(sales=u'sales'),
 Row(sales=u'hr'),
 Row(sales=u'accounting'),
 Row(sales=u'support'),
 Row(sales=u'IT'),
 Row(sales=u'technical'),
 Row(sales=u'RandD')]

In [5]:
# The `sales` column actually stores the department, so rename it to `dept`.
hr_data = hr_data.withColumnRenamed(existing='sales', new='dept')

In [6]:
# Confirm the rename: the schema should now list `dept` instead of `sales`.
hr_data.printSchema()

root
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: string (nullable = true)



In [7]:
# Get the distinct values of the `salary` column.

# Alternative bracket-indexing form (same query as used for `sales` earlier):
#hr_data[['salary']].distinct().collect()

hr_data.select('salary').distinct().collect()

[Row(salary=u'low'), Row(salary=u'high'), Row(salary=u'medium')]

In [8]:
# Summary statistics (count, mean, stddev, min, max) for a few columns of
# interest. Passing the column names to describe() computes statistics only for
# those columns instead of for every column (including the string ones) and
# then discarding most of them; the resulting table still starts with the
# `summary` column followed by the requested columns in this order.
hr_data.describe('satisfaction_level', 'left', 'number_project').show()

+-------+-------------------+-------------------+------------------+
|summary| satisfaction_level|               left|    number_project|
+-------+-------------------+-------------------+------------------+
|  count|              14999|              14999|             14999|
|   mean| 0.6128335222348166| 0.2380825388359224|  3.80305353690246|
| stddev|0.24863065106114257|0.42592409938029885|1.2325923553183513|
|    min|               0.09|                  0|                 2|
|    max|                1.0|                  1|                 7|
+-------+-------------------+-------------------+------------------+



### Featurization - Convert string data to numbers
* `dept` and `salary` hold categorical (string) information
* They need to be converted to numbers before they can be used as model features

In [9]:
import pyspark.ml.feature as ft

# StringIndexer converts a string column into a numeric index column.
# Input columns are `dept` and `salary`; the outputs are `dept_en` and `salary_en`.
# NOTE(review): these indices are category codes, not ordinal quantities;
# consider one-hot encoding them before a linear model - TODO confirm intent.
transformer_dept = ft.StringIndexer(
    inputCol='dept',
    outputCol='dept_en',
)
transformer_salary = ft.StringIndexer(
    inputCol='salary',
    outputCol='salary_en',
)

In [11]:
# Convert numerical data into vector
# VectorAssembler for creating vector

In [13]:
# List the column names available on the DataFrame (after the `dept` rename).
hr_data.columns

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'dept',
 'salary']

In [17]:
# Assemble the numeric columns plus the two indexed categorical columns into a
# single `features` vector column. The label column `left` is deliberately not
# part of the inputs.
featurescreator = ft.VectorAssembler(
    outputCol='features',
    inputCols=[
        'satisfaction_level',
        'last_evaluation',
        'number_project',
        'average_montly_hours',
        'time_spend_company',
        'Work_accident',
        'promotion_last_5years',
        'dept_en',
        'salary_en',
    ],
)

In [18]:
import pyspark.ml.classification as cl

In [19]:
# Estimator: logistic regression that predicts the `left` label from the
# assembled `features` vector (`featuresCol` is spelled out; it is the default).
logistic = cl.LogisticRegression(
    featuresCol='features',
    labelCol='left',
    maxIter=10,
    regParam=0.01,
)

In [20]:
from pyspark.ml import Pipeline

In [21]:
# Chain the stages in execution order: the two string indexers, the vector
# assembler (three transformers), and finally the logistic-regression estimator.
pipeline = Pipeline(
    stages=[transformer_dept, transformer_salary, featurescreator, logistic],
)

In [22]:
# 70/30 split into training and test sets.
# The seed fixes the random assignment of rows to each split.
hr_data_train, hr_data_test = hr_data.randomSplit(weights=[0.7, 0.3], seed=100)

In [24]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=hr_data_train)

In [25]:
# Run the fitted pipeline over the held-out test split to get predictions.
test_out = model.transform(dataset=hr_data_test)

In [26]:
# Display the scored DataFrame's repr: column names and types only, no rows.
test_out

DataFrame[satisfaction_level: double, last_evaluation: double, number_project: int, average_montly_hours: int, time_spend_company: int, Work_accident: int, left: int, promotion_last_5years: int, dept: string, salary: string, dept_en: double, salary_en: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [37]:
# Look at the predicted class probabilities for the first five test rows.
test_out.select('probability').take(5)

[Row(probability=DenseVector([0.3824, 0.6176])),
 Row(probability=DenseVector([0.327, 0.673])),
 Row(probability=DenseVector([0.3909, 0.6091])),
 Row(probability=DenseVector([0.359, 0.641])),
 Row(probability=DenseVector([0.3142, 0.6858]))]

In [31]:
#Evaluation utilities to measure model quality (the binary evaluator's default metric is area under ROC, not accuracy)
import pyspark.ml.evaluation as ev

In [33]:
# Binary-classification evaluator for the `left` label, scored from the
# `probability` vector column. `metricName` is spelled out (it is the default)
# so it is clear the reported number is area under the ROC curve, not accuracy.
evaluator = ev.BinaryClassificationEvaluator(
    labelCol='left',
    rawPredictionCol='probability',
    metricName='areaUnderROC',
)

In [34]:
# Score the test-set predictions with the evaluator; with its default metric
# this value is the area under the ROC curve.
evaluator.evaluate(test_out)

0.822542214798108

In [45]:
# Look at the predicted class labels for the first five test rows.
test_out.select('prediction').take(5)

[Row(prediction=1.0),
 Row(prediction=1.0),
 Row(prediction=1.0),
 Row(prediction=1.0),
 Row(prediction=1.0)]

In [48]:
# Show the original input columns (without the derived pipeline columns)
# for the first five scored test rows.
test_out.select(hr_data.columns).show(5)

+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+
|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years|       dept|salary|
+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+
|              0.09|           0.77|             5|                 275|                 4|            0|   1|                    0|product_mng|medium|
|              0.09|           0.77|             6|                 244|                 4|            0|   1|                    0|product_mng|   low|
|              0.09|           0.77|             6|                 256|                 5|            0|   1|                    0|    support|medium|
|              0.09|           0.77|             6|                 282|                

In [51]:
# Pearson correlation between satisfaction level and the `left` label
# ('pearson' is the default method, written out here for clarity).
hr_data.corr('satisfaction_level', 'left', method='pearson')

-0.3883749834241161