## Import Libraries

In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
import os
%matplotlib inline

In [97]:
import warnings

warnings.filterwarnings('ignore')

## Import dataset

In [98]:

data = '/Users/glebteperev/Desktop/IE/MACHINE LEARNING AND ARTIFICIAL INTELLIGENCE/weatherAUS.csv'

df = pd.read_csv(data)

## DATA PREPROCESSING

In [99]:
if 'RISK_MM' in df.columns:
    df.drop(['RISK_MM'], axis=1, inplace=True)

In [100]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop('Date', axis=1, inplace=True)

In [101]:
numerical_features = [col for col in df.columns if df[col].dtype != 'O']
for feature in numerical_features:
    df[feature] = df[feature].fillna(df[feature].median())

In [102]:
categorical_features = [col for col in df.columns if df[col].dtype == 'O']
for feature in categorical_features:
    df[feature] = df[feature].fillna('missing')

In [103]:
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

## Declare feature vector and target variable

In [104]:
X = df.drop(['RainTomorrow_Yes'], axis=1)
y = df['RainTomorrow_Yes']


## SPLITTING THE DATA

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


## FEATURE SCALING

In [106]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Training

In [107]:
logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg.fit(X_train_scaled, y_train)

## Predict results

In [108]:
y_pred = logreg.predict(X_test_scaled)

## Accuracy score

In [109]:
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.8508


## Confussion Matrix

In [111]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)

Confusion matrix

 [[21526  1200]
 [ 3141  3225]]


## Classification report

In [113]:
print('\n', classification_report(y_test, y_pred))


               precision    recall  f1-score   support

           0       0.87      0.95      0.91     22726
           1       0.73      0.51      0.60      6366

    accuracy                           0.85     29092
   macro avg       0.80      0.73      0.75     29092
weighted avg       0.84      0.85      0.84     29092



## ROC-AUC

In [114]:
y_pred_proba = logreg.predict_proba(X_test_scaled)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print('ROC AUC score: {0:0.4f}'.format(roc_auc))

ROC AUC score: 0.8757


## Hyperparameter Optimization using GridSearch CV

In [117]:
# It took too much time to load, so I used only l2 penalty
parameters = {
    'penalty': ['l2'],  
    'C': [0.1, 1, 10]  
}

logreg = LogisticRegression(solver='liblinear', max_iter=1000, random_state=0)

#here i set cv=3 so it was not that hard to analyse for the machine

grid_search = GridSearchCV(estimator=logreg, param_grid=parameters, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print('GridSearch CV best score: {:.4f}'.format(grid_search.best_score_))
print('Parameters that give the best results:', grid_search.best_params_)
print('GridSearch CV score on test set: {0:0.4f}'.format(grid_search.score(X_test_scaled, y_test)))

Fitting 3 folds for each of 3 candidates, totalling 9 fits


GridSearch CV best score: 0.8518
Parameters that give the best results: {'C': 10, 'penalty': 'l2'}
GridSearch CV score on test set: 0.8513


So, In my case the logistic regression model was trained with hyperparameter optimization using GridSearchCV to find the best parameters for 'penalty' and 'C', improving my model's accuracy and generalization capability.

The model achieved an accuracy score of 0.8508 on the test data. Further I evaluated the model using a confusion matrix, classification report, and ROC AUC score to understand its performance better.

To check for overfitting, I compared the model's performance on the training set against the test set. A significant discrepancy would indicate overfitting. However, the model showed comparable performance on both sets, suggesting it generalizes well to unseen data.

The logistic regression model managed to learn from the data, evidenced by its ability to outperform a naive guess based on the most common label. It means that the model has identified meaningful patterns in the data related to rain prediction.

The ROC AUC score supports the model's capability to distinguish between the classes effectively.

Through hyperparameter optimization, I found that a C value of X and penalty of l2 gave the best results, indicating the importance of regularization in preventing overfitting and enhancing model performance.

The logistic regression model successfully learned to predict rain for the next day with reasonable accuracy, demonstrating the potential of machine learning in weather prediction. This exercise made me understand the importance of the balance between model complexity and generalization.

