<a href="https://colab.research.google.com/github/ghadaalhabib/Machine-Leanring-Projects/blob/main/rain_prediction_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rain Prediction Classifier



### The following classifiers are used:
1- Linear Regression

2- KNN

3- Decision Trees

4- Logistic Regression

5- Support Vector Machine (SVM)

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [None]:
# Import the dataset
df = pd.read_csv('sydney_weather.csv')
df.head()

# Drop the unnecessary columns
df.drop('Date',axis=1,inplace=True)

# Save the features in a numpy array
header_array = df.columns.to_numpy()
print(header_array)

In [4]:
all_metrics = pd.DataFrame()

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score, roc_auc_score, average_precision_score, mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

def metrics_dicts(y_test, predictions, model_name, metrics_df=None):
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    jaccard_index = jaccard_score(y_test, predictions)
    
    if 'LinearRegression' in model_name:
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        metrics_dict = {'Model': [model_name],
                        'MAE': [mae],
                        'MSE': [mse],
                        'R2': [r2]
                       }
    elif 'Logistic Regression Model' in model_name:
        roc_auc = roc_auc_score(y_test, predictions)
        pr_auc = average_precision_score(y_test, predictions)
        metrics_dict = {'Model': [model_name],
                        'Accuracy': [accuracy], 
                        'Precision score': [precision], 
                        'Recall': [recall], 
                        'F1 Score': [f1],
                        'Jaccard index': [jaccard_index],
                        'roc_auc_score': [roc_auc],
                        'Average_precision_score': [pr_auc]
                       }
    else:
        metrics_dict = {'Model': [model_name],
                        'Accuracy': [accuracy], 
                        'Precision score': [precision], 
                        'Recall': [recall], 
                        'F1 Score': [f1],
                        'Jaccard index': [jaccard_index]
                       }

    if metrics_df is None:
        metrics_df = pd.DataFrame(metrics_dict)
    else:
        new_df = pd.DataFrame(metrics_dict)
        metrics_df = pd.concat([metrics_df, new_df], ignore_index=True)


    return metrics_df

# Data Preprocessing 


## Handle the missing data
1- Rows with a small number of missing values: Drop the following rows: MinTemp, MaxTemp, Temp9am, and Temp3pm. 

2- For the target variable, remove the rows esp. because its only 20 rows.

2- For columns with a larger number of missing values, you may use: 
  1) Mean imputation
  2) Median imputation
  3) Multiple Imputation by Chained Equations (MICE)
  4) K-Nearest Neighbors (KNN) imputation
  5) MICE imputation 
  6) Mean imputation with additional variable
  7) Mode imputation

In [None]:
missing_rows = df.isnull().sum()
print(missing_rows)
print(df.info())

In [None]:
# Step 1:
# Rows with a small number of missing values: Drop the following rows: MinTemp, MaxTemp, Temp9am, and Temp3pm.
# For the target variable, remove the rows esp. because its only 20 rows.
df = df.dropna(subset=['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm', 'RainTomorrow'])
print(df.shape)

In [None]:
# Step 2: FOR FUTURE REFERENCE - if you would like to experiement with other traditional impuation methods
# We will not be implenting this
# MICE imputation 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=0)
df['Evaporation'] = imputer.fit_transform(df[['Evaporation']])


In [None]:
# Mediun imputation
# We will not be implenting this
median_evaporation = df['Evaporation'].median()
df['Evaporation'] = df['Evaporation'].fillna(median_evaporation)

In [None]:
# Mean imputation
# We will not be implenting this
mean_evaporation = df['Evaporation'].mean()
df['Evaporation'] = df['Evaporation'].fillna(median_evaporation)

In [None]:
# Mean imputation with additional variable
# We will not be implenting this
evap_by_location = df.groupby('RainToday')['Evaporation'].mean()
for loc in df['RainToday'].unique():
    mask = (df['RainToday'] == loc) & (df['Evaporation'].isnull())
    df.loc[mask, 'Evaporation'] = evap_by_location[loc]


In [8]:
# OPTION CONSIDERING WE HAVE A RELATIVELY SMALL NUMBER OF MISSING VALUES AND A SMALL DATASET - to handle numerical data
# KNN imputation
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
for feature in header_array:
  if(df[feature].isna().any() and df[feature].dtype == 'float64' and feature != ''):
    df[feature] = imputer.fit_transform(df[[feature]])

In [9]:
# OPTION CONSIDERING WE HAVE A RELATIVELY SMALL NUMBER OF MISSING VALUES AND A SMALL DATASET - to handle categorical data
# Mode imputation
for feature in header_array:
  mode_a = df[feature].mode()[0]
  if(df[feature].isna().any() and df[feature].dtype == 'object'):
    df[feature] = df[feature].fillna(mode_a)

## Convert categorical data to numerical data 
Some classifiers need to be encoded as numerical values first


In [10]:
# Perform hot encoding
df = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])
# replace the values of the 'RainTomorrow' column changing them from a categorical column to a binary column. 
# We do not use the get_dummies method because we would end up with two columns for 'RainTomorrow' and we do not want, since 'RainTomorrow' is our target.
df.replace(['No', 'Yes'], [0,1], inplace=True)

In [11]:
# Convert the data to float
df = df.astype(float)

In [None]:
df.info()

## Splitting the data

In [12]:
# Split the data into predictures and target
predictors = df.drop(columns='RainTomorrow', axis=1)
target = df['RainTomorrow']

In [13]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=10)
print(X_train.shape , y_train.shape)
print(X_test.shape , y_test.shape)

(3988, 67) (3988,)
(997, 67) (997,)


# Testing and training

### Linear Regression model
This is not a good option considering that it is a binary classification problem. However, the code has been added below for future references along with the appropriate metrics to measure the accuracy of the classifier.



In [None]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(X_train, y_train)
predictions = regr.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'LinearRegression', all_metrics)

### KNN model
Note that if it was a regression problem we would use KNeighborsRegressor


In [15]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors=4)
KNN.fit(X_train, y_train)
predictions = KNN.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'KNN model', all_metrics)

### Desision Tree model

In [16]:
from sklearn.tree import DecisionTreeRegressor

Tree = DecisionTreeRegressor()
Tree.fit(X_train, y_train)
predictions = Tree.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'Desision Tree model', all_metrics)

### Logistic Regression Model

In [17]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, y_train)
predictions = LR.predict(X_test)
predict_proba = LR.predict_proba(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'Logistic Regression Model', all_metrics)

### Support Vector Machines (SVM)
Try with different kernels to check the perform

In [18]:
from sklearn.svm import SVC

SVM = SVC(kernel='linear')
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'Support Vector Machines linear', all_metrics)

In [19]:
SVM = SVC(kernel='poly')
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'Support Vector Machines poly', all_metrics)

In [20]:
SVM = SVC(kernel='rbf')
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'Support Vector Machines rbf', all_metrics)

In [21]:
SVM = SVC(kernel='sigmoid')
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)

all_metrics = metrics_dicts(y_test, predictions, 'Support Vector Machines sigmoid', all_metrics)

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
print(all_metrics)

                             Model  Accuracy  Precision score    Recall  \
0                        KNN model  0.874624         0.943038  0.562264   
1              Desision Tree model  1.000000         1.000000  1.000000   
2        Logistic Regression Model  0.991976         0.996139  0.973585   
3   Support Vector Machines linear  0.994985         0.992424  0.988679   
4     Support Vector Machines poly  0.766299         1.000000  0.120755   
5      Support Vector Machines rbf  0.752257         1.000000  0.067925   
6  Support Vector Machines sigmoid  0.734203         0.000000  0.000000   

   F1 Score  Jaccard index  roc_auc_score  Average_precision_score  
0  0.704492       0.543796            NaN                      NaN  
1  1.000000       1.000000            NaN                      NaN  
2  0.984733       0.969925       0.986109                 0.976847  
3  0.990548       0.981273            NaN                      NaN  
4  0.215488       0.120755            NaN             