In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session used by every cell below.
# NOTE(review): with master("local") the executor-memory setting may have
# little effect on the driver process — confirm the intended sizing.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Predicting Fire Dept Calls")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)

In [None]:
# Load the raw call log into a Spark DataFrame.
# Uses the CSV reader built into Spark 2+ rather than the legacy external
# 'com.databricks.spark.csv' format name; the first row supplies the column
# names and column types are inferred from the data.
df = spark.read.csv(
    'Fire_Department_Calls_for_Service.csv',
    header=True,
    inferSchema=True,
)


In [None]:
# Peek at the first two rows to sanity-check the load.
df.show(n=2)

In [None]:
# List the distinct values of the 'Call Type Group' column.
call_type_groups = df.select('Call Type Group').distinct()
call_type_groups.show()

In [None]:
# Number of rows per 'Call Type Group'.
group_counts = df.groupBy('Call Type Group').count()
group_counts.show()

In [None]:
# Keep the per-group row counts as a DataFrame; a later cell converts it to
# pandas for plotting. (`groupby` is PySpark's alias of `groupBy`.)
df2 = df.groupby('Call Type Group').count()

In [None]:
# Bring the aggregated counts to the driver as a pandas DataFrame, ordered
# from the most to the least frequent group.
graphDF = df2.toPandas().sort_values(by='count', ascending=False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Bar chart of row counts per call type group.
ax = graphDF.plot.bar(x='Call Type Group', y='count')
ax.set_title('Call Type Group by Count')
plt.show()

In [None]:
# Row counts per 'Call Type', most frequent first (up to 100 rows shown).
call_type_counts = (
    df.groupBy('Call Type')
    .count()
    .sort('count', ascending=False)
)
call_type_counts.show(n=100)

In [None]:
from pyspark.sql import functions as F

# Preview frame: each call type next to a 0/1 flag that is 1 when the
# 'Call Type' text matches the SQL LIKE pattern '%Fire%'.
is_fire_call = df["Call Type"].like("%Fire%")
fire_flag = F.when(is_fire_call, 1).otherwise(0).alias('Fire Indicator')
fireIndicator = df.select(df["Call Type"], fire_flag)
fireIndicator.show()

In [None]:
# How many rows fall on each side of the 0/1 flag.
indicator_counts = fireIndicator.groupBy('Fire Indicator').count()
indicator_counts.show()

In [None]:
# Attach the 0/1 flag to the main frame: 1 when 'Call Type' matches the
# LIKE pattern '%Fire%', otherwise 0.
fire_label = F.when(df["Call Type"].like("%Fire%"), 1).otherwise(0)
df = df.withColumn("fireIndicator", fire_label)

In [None]:
# Print the column names and types, which now include 'fireIndicator'.
df.printSchema()

In [None]:
# Spot-check the flag against the call type it was derived from.
df.select(['Call Type', 'fireIndicator']).show(n=20)

In [None]:
# Narrow the frame to the flag column plus the nine columns kept as model
# inputs. The flag stays first; later cells slice it off with columns[1:].
model_columns = [
    'fireIndicator',
    'Zipcode of Incident',
    'Battalion',
    'Station Area',
    'Box',
    'Number of Alarms',
    'Unit sequence in call dispatch',
    'Neighborhooods - Analysis Boundaries',
    'Fire Prevention District',
    'Supervisor District',
]
df = df.select(*model_columns)
df.show(n=5)

In [None]:
# Row count before any null handling.
print('Total Rows')
total_rows = df.count()
total_rows

In [None]:
# Rows that survive dropping every record containing a null
# (`na.drop()` is the same operation as `dropna()`).
print('Rows without Null values')
df.na.drop().count()

In [None]:
# Rows containing at least one null = all rows minus fully populated rows.
print('Row with Null Values')
all_row_count = df.count()
complete_row_count = df.dropna().count()
all_row_count - complete_row_count

In [None]:
# Discard every row that has a null in any of the selected columns.
df = df.na.drop()

In [None]:
# Class balance of the flag after null removal, larger class first.
label_counts = df.groupBy('fireIndicator').count()
label_counts.sort(F.desc('count')).show()

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Every column except the first one ('fireIndicator', placed first by the
# earlier select) is treated as an input column.
all_columns = df.columns
column_names = all_columns[1:]
column_names

In [None]:
# One StringIndexer per input column; each writes its result to a new
# column named '<input>_Index'.
categoricalColumns = column_names
indexers = [
    StringIndexer(inputCol=source_col, outputCol=source_col + "_Index")
    for source_col in categoricalColumns
]

In [None]:
# Fit every indexer against the current frame first, then apply the fitted
# models one after another so each adds its '<input>_Index' column.
models = [indexer.fit(df) for indexer in indexers]

for fitted_indexer in models:
    df = fitted_indexer.transform(df)

In [None]:
# Display the column list, which now includes the added '<input>_Index' columns.
df.columns

In [None]:
# Compare one original column with its indexed counterpart.
# The spelling 'Neighborhooods' matches the column name used in this notebook.
neighborhood_pair = [
    'Neighborhooods - Analysis Boundaries',
    'Neighborhooods - Analysis Boundaries_Index',
]
df.select(neighborhood_pair).show()

In [None]:
# Keep only the flag and the indexed versions of the input columns; the
# original (un-indexed) columns are dropped by omission.
indexed_columns = [
    'Zipcode of Incident_Index',
    'Battalion_Index',
    'Station Area_Index',
    'Box_Index',
    'Number of Alarms_Index',
    'Unit sequence in call dispatch_Index',
    'Neighborhooods - Analysis Boundaries_Index',
    'Fire Prevention District_Index',
    'Supervisor District_Index',
]
df = df.select('fireIndicator', *indexed_columns)

In [None]:
# Print the schema of the frame after keeping only the flag and '_Index' columns.
df.printSchema()

In [None]:
# First five rows of the indexed frame.
df.show(n=5)

In [None]:
# Everything after the first column ('fireIndicator') is a model input.
features = [
    column for position, column in enumerate(df.columns) if position >= 1
]

In [None]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs all input columns into a single vector column
# named "features".
feature_vectors = VectorAssembler(inputCols=features, outputCol="features")

In [None]:
# Apply the assembler so the frame gains the "features" vector column.
df = feature_vectors.transform(df)

In [None]:
# Display the column list after the assembler has been applied.
df.columns

In [None]:
# The individual '_Index' columns are no longer needed once they have been
# packed into the "features" vector, so remove them.
columns_to_drop = (
    'Zipcode of Incident_Index',
    'Battalion_Index',
    'Station Area_Index',
    'Box_Index',
    'Number of Alarms_Index',
    'Unit sequence in call dispatch_Index',
    'Neighborhooods - Analysis Boundaries_Index',
    'Fire Prevention District_Index',
    'Supervisor District_Index',
)
df = df.drop(*columns_to_drop)

In [None]:
# Rename the flag to 'label', the column name the classifier below is
# configured to read.
df = df.withColumnRenamed(existing='fireIndicator', new='label')

In [None]:
# Preview the final two-column frame (label + features); 20 rows is the
# default for show().
df.show(n=20)

In [None]:
# 75/25 random split into training and test sets; the fixed seed keeps the
# split reproducible across runs.
trainDF, testDF = df.randomSplit(weights=[0.75, 0.25], seed=12345)

In [None]:
# Print the training-set size, then the test-set size.
for split_frame in (trainDF, testDF):
    print(split_frame.count())

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled feature vector, capped at 10
# optimisation iterations.
logreg = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)

# NOTE(review): this variable shares its name with pyspark's
# LogisticRegressionModel class; the name is kept because a later cell
# references it.
LogisticRegressionModel = logreg.fit(trainDF)

In [None]:
# Score the held-out test set with the fitted model.
df_predicted = LogisticRegressionModel.transform(testDF)

In [None]:
# Print the schema of the scored test set.
df_predicted.printSchema()

In [None]:
# First five scored rows.
df_predicted.show(n=5)

In [None]:
# Contingency table of true label versus predicted label
# (`stat.crosstab` is the same operation as `DataFrame.crosstab`).
confusion_counts = df_predicted.stat.crosstab('label', 'prediction')
confusion_counts.show()

In [None]:
from sklearn import metrics

In [None]:
# Collect the true labels to the driver as a one-column pandas DataFrame.
label_column = df_predicted.select('label')
actual = label_column.toPandas()

In [None]:
# Collect the model's predictions to the driver as a one-column pandas
# DataFrame.
prediction_column = df_predicted.select('prediction')
predicted = prediction_column.toPandas()

In [None]:
# Accuracy on the test set.
# Labels and predictions are collected together in a single toPandas() call
# so each prediction is guaranteed to sit on the same row as its label.
# Scoring two independently collected frames relies on Spark returning rows
# in the same order for two separate jobs, and evaluates the pipeline twice.
scored = df_predicted.select('label', 'prediction').toPandas()
metrics.accuracy_score(scored['label'], scored['prediction'])

In [None]:
# Class balance of the true labels in the test set, useful as a baseline
# when reading the accuracy figure.
test_label_counts = df_predicted.groupBy('label').count()
test_label_counts.show()

In [None]:
# Summary statistics of the label column in the scored test set.
label_summary = df_predicted.describe(['label'])
label_summary.show()