In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("LOGISTIC")
    .getOrCreate()
)

In [3]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('weather_august.csv')
# Create a Spark DataFrame from the pandas DataFrame
aug_df = spark.createDataFrame(df)
# Convert the Spark DataFrame back to a pandas DataFrame
aug_df_pd = aug_df.toPandas()

In [None]:
# Recode 'Precipitation' as a binary indicator: 1 when the recorded amount is
# greater than zero, 0 otherwise (missing values compare as False and become 0).
# The comparison is vectorized instead of calling a Python lambda once per row.
# It reads from the original `df`, so re-running the cell gives the same result.
aug_df_pd['Precipitation'] = (df['Precipitation'] > 0).astype('int64')

# Show the first rows of the DataFrame
aug_df_pd.head()

In [None]:
# Table of absolute frequencies of the 'Precipitation' variable:
# counts how many rows fall in each distinct value of the column.

aug_df_pd['Precipitation'].value_counts()

In [None]:
import seaborn as sns 
import statsmodels.api as sm 
import numpy as np 
from scipy import stats
from statsmodels.iolib.summary2 import summary_col 
import statsmodels.formula.api as smf

import warnings
# Silence every warning from this point on (no category or module filter is
# given). NOTE(review): this also hides any warnings raised while fitting the
# model in the following cells; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Estimation of a binary logistic model: a GLM with a Binomial family fitted
# on aug_df_pd, with 'Precipitation' as the response and six weather
# variables as predictors.
# The formula is a single string literal continued across lines with
# backslashes, so no comments can be placed between its terms.
# NOTE(review): the 'Avarage_' spelling presumably matches the CSV column
# names exactly; verify against the data before renaming anything.
prec_model = smf.glm(formula='Precipitation ~  Avarage_Temperature +\
                                Global_Radiation +\
                                Avarage_Atmospheric_Pressure +\
                                Avarage_Dew_Temperature +\
                                Avarage_Relative_Humidity +\
                                Avarage_Wind_Speed', data=aug_df_pd,
                         family=sm.families.Binomial()).fit()

# Model parameters (summary table of the fitted model)
prec_model.summary()

In [None]:
# Compact table for the fitted model via 'summary_col', with significance
# stars plus two extra rows: number of observations and log-likelihood
summary_col(
    [prec_model],
    model_names=["MODEL"],
    stars=True,
    info_dict={
        'N': lambda res: "{0:d}".format(int(res.nobs)),
        'Log-lik': lambda res: "{:.2f}".format(res.llf),
    },
)

In [None]:
# Prediction from 'prec_model' for a single hypothetical set of weather
# readings (one value per predictor used in the model formula)
new_observation = pd.DataFrame({
    'Avarage_Temperature': [12.15],
    'Global_Radiation': [0.1],
    'Avarage_Atmospheric_Pressure': [929.3],
    'Avarage_Dew_Temperature': [12.1],
    'Avarage_Relative_Humidity': [100.0],
    'Avarage_Wind_Speed': [2.7],
})
prec_model.predict(new_observation)

In [None]:
# Construction of a confusion matrix

# Add the model's predicted probabilities to the DataFrame as column 'phat'.
# NOTE(review): predict() is called without new data, so it returns one value
# per row used in estimation; this assumes no rows of aug_df_pd were dropped
# for missing values, otherwise the lengths would not match - TODO confirm.
aug_df_pd['phat'] = prec_model.predict()

# View the DataFrame with the new 'phat' column
aug_df_pd.head()

In [None]:
# Construction of function to define the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score,\
    ConfusionMatrixDisplay, recall_score

def matriz_confusao(predicts, observed, cutoff):
    """Plot a confusion matrix for a probability cutoff and return indicators.

    Parameters
    ----------
    predicts : object exposing ``.values`` (e.g. a pandas Series)
        Predicted probabilities, one per observation.
    observed : array-like
        Observed class labels, in the same order as ``predicts``.
    cutoff : float
        Probabilities strictly below this value are classified as 0;
        all others are classified as 1.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'Sensitivity' (recall of class 1),
        'Specificity' (recall of class 0) and 'Accuracy'.
    """
    # Classify each probability: below the cutoff -> 0, otherwise -> 1
    binary_prediction = [0 if prob < cutoff else 1 for prob in predicts.values]

    # The classifications are passed first, so matrix rows hold the classified
    # labels and columns hold the observed labels, matching the axis labels.
    cm = confusion_matrix(binary_prediction, observed)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    ax = plt.gca()
    ax.set_xlabel('True')
    ax.set_ylabel('Classified')
    # Reverse the direction of both axes
    ax.invert_xaxis()
    ax.invert_yaxis()
    plt.show()

    # Main indicators of this confusion matrix, returned as a one-row table
    return pd.DataFrame({
        'Sensitivity': [recall_score(observed, binary_prediction, pos_label=1)],
        'Specificity': [recall_score(observed, binary_prediction, pos_label=0)],
        'Accuracy': [accuracy_score(observed, binary_prediction)],
    })

In [None]:
# Confusion matrices proper
# Each call draws its own plot; only the indicators table returned by the
# last call is rendered as the cell's output.

# Confusion matrix for cutoff = 0.5
matriz_confusao(observed=aug_df_pd['Precipitation'],
                predicts=aug_df_pd['phat'],
                cutoff=0.5)

# Confusion matrix for cutoff = 0.3
matriz_confusao(observed=aug_df_pd['Precipitation'],
                predicts=aug_df_pd['phat'],
                cutoff=0.3)

# Confusion matrix for cutoff = 0.7
matriz_confusao(observed=aug_df_pd['Precipitation'],
                predicts=aug_df_pd['phat'],
                cutoff=0.7)

In [None]:
# Construction of the ROC curve
from sklearn.metrics import roc_curve, auc

# ROC curve points from sklearn's 'roc_curve', computed from the observed
# outcomes and the predicted probabilities, and the area under that curve
fpr, tpr, thresholds = roc_curve(aug_df_pd['Precipitation'], aug_df_pd['phat'])
roc_auc = auc(fpr, tpr)

# GINI coefficient derived from the area under the curve
gini = (roc_auc - 0.5)/(0.5)

# Title pieces: AUC and GINI, each rounded to 4 decimals before formatting
auc_label = 'Área abaixo da curva: %g' % roc_auc.round(4)
gini_label = ' | Coeficiente de GINI: %g' % gini.round(4)

# Plot the ROC curve together with the y = x reference line
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, '-o', color='navy')
plt.plot(fpr, fpr, color='gray')
plt.title(auc_label + gini_label, fontsize=17)
plt.xlabel('1 - Especificidade', fontsize=15)
plt.ylabel('Sensitividade', fontsize=15)
plt.show()

In [None]:
# Construction of the sigmoid
# Probability of the event as a function of a single variable,
# 'Avarage_Relative_Humidity', used here as an example

plt.figure(figsize=(15,10))
# logistic=True makes seaborn fit and draw a logistic curve through the 0/1 outcomes
sns.regplot(x=aug_df_pd['Avarage_Relative_Humidity'], y=aug_df_pd['Precipitation'],
            data=aug_df_pd, logistic=True, ci=None, color='navy',
            marker='o', scatter_kws={'color':'navy', 'alpha':0.5, 's':170})
# Dotted reference line at probability 0.5
plt.axhline(y = 0.5, color = 'grey', linestyle = ':')
plt.xlabel('Avarage Relative Humidity', fontsize=17)
plt.ylabel('Precipitation Probability', fontsize=17)
# plt.show must be called: a bare `plt.show` only echoes the function object
plt.show()