In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the raw weather observations from weather.csv
df = pd.read_csv('weather.csv')

# One-hot encode the categorical column (Description), dropping the first
# level so that n categories are represented by n-1 indicator columns
df_dummies = pd.get_dummies(data=df, drop_first=True)
df_dummies

Unnamed: 0,Temperature_c,Humidity,Wind_Speed_kmh,Wind_Bearing_degrees,Visibility_km,Pressure_millibars,Rain,Description_Normal,Description_Warm
0,-0.555556,0.92,11.2700,130,8.0500,1021.60,0,0,0
1,21.111111,0.73,20.9300,330,16.1000,1017.00,1,0,1
2,16.600000,0.97,5.9731,193,14.9086,1013.99,1,1,0
3,1.600000,0.82,3.2200,300,16.1000,1031.59,1,0,0
4,2.194444,0.60,10.8836,116,9.9820,1020.88,1,0,0
...,...,...,...,...,...,...,...,...,...
9995,10.022222,0.95,10.2396,20,4.0089,1007.41,1,1,0
9996,8.633333,0.64,11.0446,80,9.9820,1031.33,1,1,0
9997,5.977778,0.93,11.0446,269,14.9086,1014.21,1,1,0
9998,9.788889,0.78,8.1788,231,7.8246,1005.02,1,1,0


In [3]:
# Shuffle the data: put the rows of the encoded frame in a random order.
# The fixed random_state makes the resulting order repeatable between runs.
# NOTE(review): the later train_test_split call is also seeded and may shuffle
# on its own - confirm whether this extra shuffle is needed before removing it,
# since dropping it would change which rows land in the train/test sets.
df_shuffled = shuffle(df_dummies, random_state=42)

In [4]:
# Separate the target column from the predictor columns
dep_var = 'Rain'  # name of the column the model will predict
y_values = df_shuffled[dep_var]
x_values = df_shuffled.drop(columns=dep_var)

In [5]:
# Hold out 33% of the rows as test data; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.33,
    random_state=42,
)


In [6]:
# Build a logistic regression classifier (allowing up to 500 solver
# iterations) and train it on the training split in one step;
# fit() hands back the fitted estimator itself
lgr_model = LogisticRegression(max_iter=500).fit(x_train, y_train)
print(lgr_model.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [7]:
# Pull the fitted parameters out of the model
intercept = lgr_model.intercept_
coefficients = lgr_model.coef_

# Take the first row of the 2-D coefficient array (one row for this binary
# target); the result is a 1-D NumPy array, not a Python list
coef_list = coefficients[0]

In [8]:
# Pair each feature name with its fitted coefficient, in training-column order
feature_names = x_train.columns.tolist()
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef_list})
coef_df

Unnamed: 0,Feature,Coefficient
0,Temperature_c,6.145421
1,Humidity,-0.318383
2,Wind_Speed_kmh,-0.068037
3,Wind_Bearing_degrees,-0.002393
4,Visibility_km,0.055944
5,Pressure_millibars,0.001102
6,Description_Normal,0.099392
7,Description_Warm,0.048778


In [9]:
# Hard class labels predicted for the test data
predicted_class = lgr_model.predict(x_test)

# Per-class probabilities for the test data; the second column (index 1)
# is the predicted probability of rain
class_probabilities = lgr_model.predict_proba(x_test)
predicted_prob = class_probabilities[:, 1]


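***Note: DataFrame.append() is deprecated in recent pandas releases. To add the totals row to the confusion matrix below, build a one-row DataFrame from the column sums (transposing it so the sums run across the columns) and attach it with pandas.concat().***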


In [10]:
# Evaluate performance of the model by using a confusion matrix.
# The label order is pinned to [0, 1] so the hard-coded No/Yes names below
# always line up with the classes, and so the matrix stays 2x2 even if one
# class happens to be absent from the test data or the predictions.
cm = pd.DataFrame(
    confusion_matrix(y_test, predicted_class, labels=[0, 1]),
    index=['Actual No', 'Actual Yes'],
    columns=['Predicted No', 'Predicted Yes'],
)

# Row totals: number of actual observations in each class
cm['Total'] = cm.sum(axis=1)

# Column totals (including the grand total), built as a one-row frame and
# attached with concat rather than the deprecated DataFrame.append()
totals_row = cm.sum(axis=0).to_frame('Total').T
cm = pd.concat([cm, totals_row])
cm

Unnamed: 0,Predicted No,Predicted Yes,Total
Actual No,377,6,383
Actual Yes,10,2907,2917
Total,387,2913,3300


In [11]:
# Summarise per-class precision, recall and F1-score on the test data
report = classification_report(y_test, predicted_class)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       383
           1       1.00      1.00      1.00      2917

    accuracy                           1.00      3300
   macro avg       0.99      0.99      0.99      3300
weighted avg       1.00      1.00      1.00      3300

