In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Predicting Fire Dept Calls")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)

In [None]:
# Load the raw call records with Spark's built-in CSV reader instead of the
# legacy external 'com.databricks.spark.csv' package name.
# header=True takes column names from the first line; inferSchema=True asks
# Spark to infer column types from the data rather than reading all strings.
df = spark.read.csv(
    'Fire_Department_Calls_for_Service.csv',
    header=True,
    inferSchema=True,
)


In [None]:
# Display the first two rows of the loaded data.
df.show(2)

In [None]:
# List the distinct values found in the 'Call Type Group' column.
df.select('Call Type Group').distinct().show()

In [None]:
# Show the number of rows for each 'Call Type Group' value.
df.groupBy('Call Type Group').count().show()

In [None]:
# Keep the per-group row counts as a DataFrame (columns: 'Call Type Group', 'count').
df2 = df.groupBy('Call Type Group').count()

In [None]:
# Bring the grouped counts into pandas, ordered from the largest count down.
graphDF = df2.toPandas().sort_values('count', ascending=False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Bar chart: one bar per call type group, bar height = number of calls.
graphDF.plot.bar(x='Call Type Group', y='count')
plt.title('Call Type Group by Count')
plt.show()

In [None]:
# Rows per 'Call Type', most frequent first; print up to 100 of them.
(
    df.groupBy('Call Type')
    .count()
    .orderBy('count', ascending=False)
    .show(100)
)

In [None]:
from pyspark.sql import functions as F

# Preview frame: each call's 'Call Type' next to a 0/1 flag that is 1 when the
# call type contains the substring "Fire" and 0 otherwise.
# The flag is named with alias() rather than renaming Spark's auto-generated
# "CASE WHEN ..." column name afterwards. That generated text is not stable
# across Spark versions, and withColumnRenamed silently does nothing when the
# name does not match, which would leave no 'Fire Indicator' column for the
# groupBy in the next cell.
fireIndicator = df.select(
    df["Call Type"],
    F.when(df["Call Type"].like("%Fire%"), 1).otherwise(0).alias('Fire Indicator'),
)
fireIndicator.show()

In [None]:
# Show how many rows fall under each 'Fire Indicator' value.
fireIndicator.groupBy('Fire Indicator').count().show()

In [None]:
# Attach the 0/1 fire flag to the main frame: 1 when 'Call Type' contains "Fire".
df = df.withColumn(
    "fireIndicator",
    F.when(F.col("Call Type").like("%Fire%"), 1).otherwise(0),
)

In [None]:
# Print column names and types, now including the added fireIndicator column.
df.printSchema()

In [None]:
# Show 'Call Type' beside its fireIndicator flag for the first 20 rows.
df.select('Call Type', 'fireIndicator').show(20)

In [None]:
# Narrow the frame to the target flag plus the columns used as model inputs.
df = df.select([
    'fireIndicator',
    'Zipcode of Incident',
    'Battalion',
    'Station Area',
    'Box',
    'Number of Alarms',
    'Unit sequence in call dispatch',
    'Neighborhooods - Analysis Boundaries',
    'Fire Prevention District',
    'Supervisor District',
    'final priority',
])
df.show()

In [None]:
# Print a label, then display the total number of rows in df.
print('Total Rows')
df.count()

In [None]:
# Display the number of rows that remain after dropna() removes rows with nulls.
print('Rows without Null values')
df.dropna().count()

In [None]:
# Rows removed by dropna() = total rows minus rows that survive dropna().
# Both counts are recomputed here, so this triggers two more Spark jobs.
print('Row with Null Values')
df.count()-df.dropna().count()

In [None]:
# Keep only the rows that survive dropna() (called with its default arguments).
df = df.dropna()

In [None]:
# Count rows per fireIndicator value, largest count first.
df.groupBy('fireIndicator').count().orderBy('count', ascending = False).show()

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# One StringIndexer per column to be indexed; each pair below is
# (input column, name of the indexed output column).
(
    Neighborhoods_indexer,
    zip_indexer,
    batallion_indexer,
    stationarea_indexer,
    box_indexer,
    fireDistrict_indexer,
    supervisorDistrict_indexer,
) = (
    StringIndexer(inputCol=source_col, outputCol=indexed_col)
    for source_col, indexed_col in [
        ('Neighborhooods - Analysis Boundaries', 'Neighbors'),
        ('Zipcode of Incident', 'Zip'),
        ('Battalion', 'Battalion_'),
        ('Station Area', 'StationArea'),
        ('Box', 'Box_'),
        ('Fire Prevention District', 'FireDistrict'),
        ('Supervisor District', 'SupervisorDistrict'),
    ]
)

In [None]:
# Fit every indexer on df, in the same order the indexers were declared.
(
    Neighborhoods_indexer_model,
    zip_indexer_model,
    batallion_indexer_model,
    stationarea_indexer_model,
    box_indexer_model,
    fireDistrict_model,
    supervisorDistrict_model,
) = [
    indexer.fit(df)
    for indexer in (
        Neighborhoods_indexer,
        zip_indexer,
        batallion_indexer,
        stationarea_indexer,
        box_indexer,
        fireDistrict_indexer,
        supervisorDistrict_indexer,
    )
]

In [None]:
# Apply each fitted indexer in turn; every pass appends one indexed column to df.
for _fitted_indexer in (
    Neighborhoods_indexer_model,
    zip_indexer_model,
    batallion_indexer_model,
    stationarea_indexer_model,
    box_indexer_model,
    fireDistrict_model,
    supervisorDistrict_model,
):
    df = _fitted_indexer.transform(df)

In [None]:
# Display the column names of df at this point in the notebook.
df.columns

In [None]:
# Show the original neighborhood column next to its indexed 'Neighbors' column.
df.select('Neighborhooods - Analysis Boundaries', 'Neighbors').show()

In [None]:
# Keep the target flag and the model inputs, using the indexed columns
# produced by the StringIndexers in place of their source columns.
df = df.select([
    'fireIndicator',
    'Neighbors',
    'Zip',
    'Battalion_',
    'StationArea',
    'Box_',
    'Number Of Alarms',
    'Unit sequence in call dispatch',
    'FireDistrict',
    'SupervisorDistrict',
    'final priority',
])

In [None]:
# Print the column names and types of the reduced frame.
df.printSchema()

In [None]:
# Display the leading rows of the reduced frame.
df.show()

In [None]:
# Names of the input columns, in the order they are handed to the
# VectorAssembler (every selected column except the fireIndicator target).
features = [
    "Neighbors",
    "Zip",
    "Battalion_",
    "StationArea",
    "Box_",
    "Number Of Alarms",
    "Unit sequence in call dispatch",
    "FireDistrict",
    "SupervisorDistrict",
    "final priority",
]

In [None]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the columns named in `features` into one vector column called "features".
feature_vectors = VectorAssembler(inputCols=features, outputCol="features")

In [None]:
# Run the assembler so df gains the "features" vector column.
df = feature_vectors.transform(df)

In [None]:
# Display the column names after the assembler has run.
df.columns

In [None]:
# Drop the individual input columns now that they are packed into the
# "features" vector. Reusing the `features` list (the same names passed to the
# VectorAssembler) avoids keeping another hand-typed copy of the column names
# that could drift out of sync with it.
df = df.drop(*features)

In [None]:
# Rename the target to 'label', the column name the classifier and evaluator below refer to.
df = df.withColumnRenamed('fireIndicator', 'label')

In [None]:
# Display the leading rows of the modelling frame.
df.show()

In [None]:
# Random split with weights 0.75 / 0.25; the fixed seed keeps the split repeatable.
trainDF, testDF = df.randomSplit(weights=[0.75, 0.25], seed=12345)

In [None]:
# Print the row count of the training split, then of the test split.
print(trainDF.count())
print(testDF.count())

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled "features" vector, limited to 10
# iterations, fitted on the training split only.
logreg = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)
LogisticRegressionModel = logreg.fit(trainDF)

In [None]:
# Apply the fitted logistic regression model to the test split.
df_predicted = LogisticRegressionModel.transform(testDF)

In [None]:
# Print the columns and types of the scored test frame.
df_predicted.printSchema()

In [None]:
# Display the first five scored rows.
df_predicted.show(5)

In [None]:
# Cross-tabulate actual label against predicted class (a confusion-matrix style table).
df_predicted.crosstab('label', 'prediction').show()

In [None]:
from sklearn import metrics

In [None]:
# Collect the 'label' column of the scored test rows into a pandas DataFrame.
actual = df_predicted.select('label').toPandas()

In [None]:
# Collect the 'prediction' column of the scored test rows into a pandas DataFrame.
predicted = df_predicted.select('prediction').toPandas()

In [None]:
# Accuracy of the predictions against the actual labels, computed with scikit-learn.
metrics.accuracy_score(actual, predicted)

In [None]:
# Count the scored test rows per actual label value.
df_predicted.groupBy('label').count().show()

In [None]:
# Show describe() summary statistics for the label column of the scored test rows.
df_predicted.describe('label').show()

In [None]:
##################################################

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Search grid: 4 regParam values x 3 elasticNetParam values = 12 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(logreg.regParam, [0, 0.5, 1, 2])
    .addGrid(logreg.elasticNetParam, [0, 0.5, 1])
    .build()
)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluator that scores models from their "rawPrediction" column; it is passed
# to the CrossValidator to compare parameter combinations.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

In [None]:
from pyspark.ml.tuning import CrossValidator

# 4-fold cross-validation of `logreg` over every combination in `param_grid`,
# with candidates compared by `evaluator`.
cv = CrossValidator(
    estimator=logreg,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=4,
)

In [None]:
# Tune on the training split only. Fitting the cross-validator on the full
# `df` would include the rows held out in testDF, so they would influence
# model selection and the test-set predictions made afterwards would no
# longer be an honest out-of-sample check.
cv_model = cv.fit(trainDF)

In [None]:
# Apply the fitted cross-validation model to the training split.
pred_training_cv = cv_model.transform(trainDF)

In [None]:
# Apply the fitted cross-validation model to the test split.
pred_test_cv = cv_model.transform(testDF)

In [None]:
# Report the intercept and coefficient vector of the best model found by cross-validation.
print(
    'Intercept: ' + str(cv_model.bestModel.intercept),
    'coefficients: ' + str(cv_model.bestModel.coefficients),
    sep="\n",
)