In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the raw weather observations from weather.csv
df = pd.read_csv('weather.csv')

# One-hot encode the categorical column(s) such as Description;
# drop_first=True keeps n-1 indicator columns per category
df_dummies = pd.get_dummies(data=df, drop_first=True)
df_dummies

Unnamed: 0,Temperature_c,Humidity,Wind_Speed_kmh,Wind_Bearing_degrees,Visibility_km,Pressure_millibars,Rain,Description_Normal,Description_Warm
0,-0.555556,0.92,11.2700,130,8.0500,1021.60,0,0,0
1,21.111111,0.73,20.9300,330,16.1000,1017.00,1,0,1
2,16.600000,0.97,5.9731,193,14.9086,1013.99,1,1,0
3,1.600000,0.82,3.2200,300,16.1000,1031.59,1,0,0
4,2.194444,0.60,10.8836,116,9.9820,1020.88,1,0,0
...,...,...,...,...,...,...,...,...,...
9995,10.022222,0.95,10.2396,20,4.0089,1007.41,1,1,0
9996,8.633333,0.64,11.0446,80,9.9820,1031.33,1,1,0
9997,5.977778,0.93,11.0446,269,14.9086,1014.21,1,1,0
9998,9.788889,0.78,8.1788,231,7.8246,1005.02,1,1,0


In [3]:
# Scale the features to [0, 1] (the solver will not converge otherwise, even
# with a very high max_iter). Only the feature columns are scaled: the target
# ('Rain') is left untouched so the class labels keep their original dtype
# instead of being turned into floats by the scaler.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set minima/maxima influence the training features. Consider fitting
# it on the training rows only (e.g. inside a sklearn Pipeline) - TODO confirm.
feature_cols = df_dummies.columns.drop('Rain')
mm_scale = MinMaxScaler().fit_transform(df_dummies[feature_cols])
features_scaled = pd.DataFrame(mm_scale, columns=feature_cols,
                               index=df_dummies.index)

# Re-attach the unscaled target and restore the original column order
df_dummies = pd.concat([features_scaled, df_dummies[['Rain']]],
                       axis=1)[df_dummies.columns]
df_dummies

Unnamed: 0,Temperature_c,Humidity,Wind_Speed_kmh,Wind_Bearing_degrees,Visibility_km,Pressure_millibars,Rain,Description_Normal,Description_Warm
0,0.350582,0.92,0.211672,0.362117,0.500,0.977252,0.0,0.0,0.0
1,0.707757,0.73,0.393106,0.919220,1.000,0.972852,1.0,0.0,1.0
2,0.633391,0.97,0.112186,0.537604,0.926,0.969973,1.0,1.0,0.0
3,0.386116,0.82,0.060478,0.835655,1.000,0.986809,1.0,0.0,0.0
4,0.395915,0.60,0.204415,0.323120,0.620,0.976564,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
9995,0.524956,0.95,0.192319,0.055710,0.249,0.963678,1.0,1.0,0.0
9996,0.502061,0.64,0.207439,0.222841,0.620,0.986560,1.0,1.0,0.0
9997,0.458284,0.93,0.207439,0.749304,0.926,0.970183,1.0,1.0,0.0
9998,0.521110,0.78,0.153614,0.643454,0.486,0.961392,1.0,1.0,0.0


In [4]:
# Randomly reorder the rows; the fixed seed keeps the order reproducible
df_shuffled = shuffle(
    df_dummies,
    random_state=42,
)

In [5]:
# Separate the target column from the feature columns
dep_var = 'Rain'  # name of the target column
y_values = df_shuffled.loc[:, dep_var]
x_values = df_shuffled.drop(columns=[dep_var])

In [6]:
# Hold out 33% of the rows for testing; the seed makes the split repeatable
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x_values,
    y_values,
    test_size=0.33,
    random_state=42,
)


In [7]:
# Build the logistic regression classifier and train it on the training split
# (fit() returns the fitted estimator itself, so the two steps can be chained)
lgr_model = LogisticRegression(max_iter=500).fit(x_train, y_train)

# Show the full set of hyperparameters the model was built with
print(lgr_model.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [8]:
# Pull the fitted parameters off the model
coefficients = lgr_model.coef_
intercept = lgr_model.intercept_

# Take the first (only) row of the 2-D coefficient array as a 1-D array,
# one entry per feature
coef_list = coefficients[0]

In [9]:
# Pair every feature name with its fitted coefficient
feature_names = x_train.columns.tolist()
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef_list})
coef_df

Unnamed: 0,Feature,Coefficient
0,Temperature_c,17.972571
1,Humidity,-0.111956
2,Wind_Speed_kmh,0.662177
3,Wind_Bearing_degrees,0.159159
4,Visibility_km,0.786989
5,Pressure_millibars,0.125645
6,Description_Normal,3.590062
7,Description_Warm,0.648883


In [10]:
# Predicted class label for every test row
predicted_class = lgr_model.predict(x_test)

# Predicted probability of rain: second column of predict_proba
class_probs = lgr_model.predict_proba(x_test)
predicted_prob = class_probs[:, 1]


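***Note: `DataFrame.append()` was deprecated in pandas 1.4 and removed in pandas 2.0. To add the totals row, build a one-row DataFrame from the column sums (create a DataFrame and transpose it), then join it to the matrix with `pandas.concat()`.***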


In [11]:
# Summarise test-set performance as a confusion matrix with row/column totals
cm = pd.DataFrame(confusion_matrix(y_test, predicted_class))

# Row totals go in a new column, column totals in a new bottom row
cm['Total'] = cm.sum(axis=1)
col_sums = cm.sum(axis=0).to_frame().T
cm = pd.concat([cm, col_sums], ignore_index=True)

# Human-readable labels for the columns and the rows
cm.columns = ['Predicted No', 'Predicted Yes', 'Total']
cm.index = ['Actual No', 'Actual Yes', 'Total']
cm

Unnamed: 0,Predicted No,Predicted Yes,Total
Actual No,291,92,383
Actual Yes,6,2911,2917
Total,297,3003,3300


In [12]:
# Build the per-class precision / recall / F1 summary and print it
report = classification_report(y_test, predicted_class)
print(report)

              precision    recall  f1-score   support

         0.0       0.98      0.76      0.86       383
         1.0       0.97      1.00      0.98      2917

    accuracy                           0.97      3300
   macro avg       0.97      0.88      0.92      3300
weighted avg       0.97      0.97      0.97      3300



In [13]:
# Hyperparameter grid to search: both penalty types and ten evenly spaced
# values of C from 1 to 10.
# The solver entry is not redundant: the base estimator below does not set a
# solver, so it would use the default ('lbfgs'), which does not support the
# 'l1' penalty. Listing 'liblinear' here is what makes every grid point use it.
grid = {'penalty': ['l1', 'l2'],
        'C': np.linspace(1, 10, 10),
        'solver': ['liblinear']
       }

# Search the grid with 5-fold cross-validation, keeping the combination with
# the best F1 score.
# random_state is fixed because the liblinear solver shuffles the data
# internally; without a seed the search is not reproducible between runs.
lgr_grid_model = GridSearchCV(LogisticRegression(max_iter=500, random_state=42),
                              grid, scoring='f1', cv=5)

# Fit one model per grid point and fold on the training data
lgr_grid_model.fit(x_train, y_train)
print('Done\n')
lgr_grid_model

Done



In [14]:
# Show the hyperparameter combination that GridSearchCV selected as best
# (the search above uses scoring='f1')
best_params = lgr_grid_model.best_params_
best_params

{'C': 4.0, 'penalty': 'l1', 'solver': 'liblinear'}