In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load data from weather.csv into a dataframe
df = pd.read_csv('weather.csv')

# Encode the categorical columns (Description) with n-1 columns
df = pd.get_dummies(df, drop_first=True)

# Scale columns with MinMax scaler
mm_scale = MinMaxScaler().fit_transform(df)
df = pd.DataFrame(mm_scale, columns=df.columns)

# Shuffle data
df_shuffled = shuffle(df)

df_shuffled

Unnamed: 0,Temperature_c,Humidity,Wind_Speed_kmh,Wind_Bearing_degrees,Visibility_km,Pressure_millibars,Rain,Description_Normal,Description_Warm
9009,0.666361,0.55,0.172664,0.821727,0.966,0.974316,1.0,1.0,0.0
438,0.430534,0.85,0.228908,0.445682,0.375,0.975377,1.0,0.0,0.0
6740,0.487957,0.76,0.695494,0.389972,0.620,0.952381,1.0,1.0,0.0
30,0.606649,0.93,0.060478,0.473538,0.977,0.971714,1.0,1.0,0.0
70,0.607840,0.64,0.148171,0.894150,0.620,0.974861,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
8261,0.677077,0.58,0.238282,0.384401,0.929,0.969408,1.0,0.0,1.0
4461,0.396923,0.62,0.078923,0.155989,0.644,0.973110,1.0,0.0,0.0
8078,0.652807,0.30,0.144844,0.311978,1.000,0.972058,1.0,1.0,0.0
5903,0.369997,0.92,0.120351,0.476323,0.164,0.995762,1.0,0.0,0.0


In [3]:
# Split into features and targets
dep_var = 'Rain' # Will be the target
x_values = df_shuffled.drop(dep_var, axis=1)
y_values = df_shuffled[dep_var]


# Split into training and test data
x_train, x_test, \
y_train, y_test = train_test_split(x_values, y_values, 
                                   test_size=0.33)

In [4]:
# Create a grid search to determine the best values for hyperparams
grid = {'penalty': ['l1', 'l2'],
        'C': np.linspace(1, 10, 10),
        'solver': ['liblinear']
       }

# Create model with best f1 score possible from grid
lgr_grid_model = GridSearchCV(LogisticRegression(max_iter=500),
                              grid, scoring='f1')

# Fit model
lgr_grid_model.fit(x_train, y_train)
print('Done\n')
lgr_grid_model

Done



In [5]:
# Generate predicted probablities and class of rain
predicted_prob = lgr_grid_model.predict_proba(x_test)[:,1]

predicted_class = lgr_grid_model.predict(x_test)

In [6]:
# Create a confusion matrix
cm = pd.DataFrame(confusion_matrix(y_test, predicted_class))
cm['Total'] = np.sum(cm, axis=1)
col_sums = pd.DataFrame(np.sum(cm, axis=0)).T
cm = pd.concat([cm, col_sums], ignore_index=True)

# Set field and index labels
cm.columns = ['Predicted No', 'Predicted Yes', 'Total']
cm.set_index([['Actual No', 'Actual Yes', 'Total']], inplace=True)
cm

Unnamed: 0,Predicted No,Predicted Yes,Total
Actual No,356,5,361
Actual Yes,10,2929,2939
Total,366,2934,3300


In [7]:
# Print classification report
print(classification_report(y_test, predicted_class))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98       361
         1.0       1.00      1.00      1.00      2939

    accuracy                           1.00      3300
   macro avg       0.99      0.99      0.99      3300
weighted avg       1.00      1.00      1.00      3300

