# Spark Logistic Regression

## predict which customers at risk and assigned them an account manager

In [8]:
#imports
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('marketing').getOrCreate()

#data
df = spark.read.csv('../Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/customer_churn.csv',inferSchema=True,header=True)
print(df.columns, '\n')
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Age','Total_Purchase','Years','Num_Sites'], outputCol='features')
output = assembler.transform(df)

#split
model_df = output.select(['features','Churn'])
train, test = model_df.randomSplit([0.7,0.3])

#model
from pyspark.ml.classification import LogisticRegression
log_r = LogisticRegression(labelCol='Churn')
model = log_r.fit(train)
results = model.evaluate(test)

#evaluation
print('y_true vs y_hat statistics:')
results.predictions.describe().show()
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Churn')
auc = evaluator.evaluate(results.predictions)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
accuracy = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Churn',metricName='accuracy')
f1 = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Churn',metricName='f1')
acc = accuracy.evaluate(results.predictions)
f1_score = f1.evaluate(results.predictions)
print('area under curve: ',auc)
print('accuracy: ',acc)
print('f1 score: ',f1_score, '\n')

#prediction
model = log_r.fit(model_df)
df_new = spark.read.csv('../Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv',header=True,inferSchema=True)
output_new = assembler.transform(df_new)
results_new = model.transform(output_new)
print('account manager assignment table:')
results_new.select(['Company','prediction']).show()

['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn'] 

y_true vs y_hat statistics:
+-------+-------------------+-------------------+
|summary|              Churn|         prediction|
+-------+-------------------+-------------------+
|  count|                267|                267|
|   mean|0.20224719101123595|0.14606741573033707|
| stddev| 0.4024298924858095|0.35383702753467533|
|    min|                  0|                0.0|
|    max|                  1|                1.0|
+-------+-------------------+-------------------+

area under curve:  0.7682576943140323
accuracy:  0.8838951310861424
f1 score:  0.8765064078065683 

account manager assignment table:
+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|   