In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# setting seaborn theme
# Applied once here so every later plot shares the same look: the 'whitegrid'
# style and the palette described by the 'ch:.25' (cubehelix) specification.
sns.set_theme(style='whitegrid', palette='ch:.25')

<img src='https://t1.daumcdn.net/cfile/tistory/99B3263359928F0F30' width='400'>

# 1. Load Data & Check Information

In [None]:
# Read the CSV from disk only once. `ori` is kept untouched as the reference
# snapshot (used later to count how many nulls were filled), while `df` is an
# independent working copy that the following cells modify.
ori = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
df = ori.copy()
df.head()

### Checking Basic Information

In [None]:
# Summary of the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isnull().mean()

In [None]:
# Number of distinct values in each column.
df.nunique()

### Checking Correlations

In [None]:
# Correlation is only defined for numeric columns. Select them explicitly:
# at this point the frame still contains non-numeric (object) columns, and
# recent pandas versions raise instead of silently skipping them in `corr()`.
correlation = df.select_dtypes(include='number').corr()
plt.figure(figsize=(18,18))
sns.heatmap(correlation, annot=True, square=True)
plt.show()

### Null Values in each column

In [None]:
# Absolute number of missing values per column.
df.isna().sum()

# 2. Data Engineering

### Dropping Some Columns

- Since more than 30% of the values in `Evaporation`, `Sunshine`, `Cloud9am`, and `Cloud3pm` are null, these columns will not help improve our models, so they will be dropped.
- `Date` information is not needed, so it will also be dropped.

In [None]:
# Columns removed before modelling: the raw date plus the four measurements
# with too many missing values.
columns_to_drop = ['Date', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
df = df.drop(columns=columns_to_drop)

In [None]:
# Check which columns remain after the drop.
df.columns

### Handling Null Values

Handling null values is really important for making good predictions. <br/>
If there are null values in the dataset, the machine learning models will refuse to fit it.<br/> Also, poorly handled null values lead to wrong predictions in the end. 
In this section, I replace null values in 2 steps.

1. **Predict null values**<br/> 
Just replacing null values with the median or mean would not increase the accuracy of the model.<br/> Therefore, I am going to predict as many missing values as possible by using `KNeighborsRegressor`.

2. **Replace with median**<br/>
After predicting null values, I am going to replace the remaining ones with each column's median by using `fillna`.<br/>


The KNN model does not accept **object** type values, so object type values must be converted to integer or float type.

In [None]:
cat_list = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm','RainToday', 'RainTomorrow']
for column in cat_list:
    # Integer-encode the categories; pandas marks missing values with code -1.
    codes = df[column].astype('category').cat.codes
    # Cast to float so the -1 sentinel can be turned back into a real NaN, and
    # assign the result to the frame. The previous chained in-place `replace`
    # on `df[column]` is not guaranteed to write back to `df`, and the `np.NaN`
    # alias no longer exists in NumPy 2 (`np.nan` works everywhere).
    df[column] = codes.astype('float64').mask(codes == -1, np.nan)

The first step is executed by the `filling_null` function. <br/><br/>
What happens in `filling_null`:
* For the given feature, it drops every row that has a null value in any of the other features.
* It splits the dataset into a train set and a test set. The train set only includes rows where the given feature is non-null, and the test set only includes rows where it is null.
* It runs a KNN model to predict the null values.
* It returns the dataset.

In [None]:
def filling_null(feature, df=df):
    """Fill missing values of ``feature`` with KNN predictions, in place.

    A ``KNeighborsRegressor`` is fitted on the rows where ``feature`` and all
    other predictors are present, then used to predict ``feature`` for the rows
    where only ``feature`` is missing. Rows that also miss another predictor
    are left untouched. ``RainTomorrow`` is never used as a predictor.

    Parameters
    ----------
    feature : str
        Column of ``df`` whose null values should be predicted.
    df : pandas.DataFrame, optional
        Frame to update. The default is the object that the global ``df`` was
        bound to when this cell was executed, not the current global; pass the
        frame explicitly if ``df`` has been rebound since.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place. If there is nothing to fill
        or nothing to train on, it is returned unchanged.
    """
    # Predictor matrix: every column except the target and the feature itself,
    # restricted to rows where all of those predictors are known.
    predictors = df.drop(columns=['RainTomorrow', feature]).dropna()
    feature_values = df.loc[predictors.index, feature]
    is_known = feature_values.notna()

    # Train on rows where the feature is present, predict where it is null.
    train_x = predictors.loc[is_known]
    train_y = feature_values.loc[is_known]
    test = predictors.loc[~is_known]

    # scikit-learn raises on empty input, so bail out when there is nothing
    # to learn from or nothing to fill.
    if train_x.empty or test.empty:
        return df

    # NOTE(review): for integer-coded categorical columns the regressor
    # averages neighbour codes, so predictions may be fractional; consider a
    # classifier for those columns.
    KNN = KNeighborsRegressor(n_jobs=-1)
    KNN.fit(train_x, train_y)

    # Write all predictions back in one vectorised assignment.
    df.loc[test.index, feature] = KNN.predict(test)

    #return dataset which had been changed
    return df

The reason I did not apply the `filling_null` function to all columns is that for the features not in the list (`Rainfall`, `WindGustSpeed`, `WindSpeed9am`, `WindSpeed3pm`, `Temp9am`, `Temp3pm`, `RainToday`), the dataset becomes empty once rows with null values in the other features are dropped.

In [None]:
# Columns whose null values are predicted with `filling_null`.
apply_list =['MinTemp', 'MaxTemp', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Humidity9am',
             'Humidity3pm', 'Pressure9am', 'Pressure3pm']


for feature in apply_list:
    # Pass the frame explicitly: the function's default `df` was captured when
    # it was defined and would be stale if `df` has been rebound since then.
    df = filling_null(feature=feature, df=df)

As you can see, the `WindDir9am` feature has changed a lot! <br> The other features also changed, though not as much as `WindDir9am`. However, it is still worth it.

In [None]:
#getting information of number of null variable changed
# Features which did not change at all; they are left out of the chart.
pop_list = ['Location','Rainfall','WindGustSpeed','WindSpeed9am','WindSpeed3pm','Temp9am','Temp3pm','RainToday','RainTomorrow']

# Number of null values that became non-null, per remaining column: non-null
# count now minus non-null count in the untouched original frame. Filtering
# while building the dict avoids a KeyError when a listed column is absent.
df_columns = list(df.columns)
changed_dict = {
    "%s" %col: int(df[col].notna().sum() - ori[col].notna().sum())
    for col in df_columns
    if col not in pop_list
}

# Keys and values side by side for plotting.
key_list = list(changed_dict.keys())
value_list = list(changed_dict.values())
temp_df = pd.DataFrame({'key': key_list, 'value': value_list})

#visualization
fig, ax = plt.subplots(figsize=(25, 10))
plot = sns.barplot(x='key',y='value', data=temp_df, ax=ax)
# Write each bar's count just above the bar.
for bar in ax.patches:
    ax.annotate(format(bar.get_height(), '0.0f'),
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 9),
                textcoords = 'offset points')
# Log scale keeps small counts visible next to large ones; the axis itself is
# hidden because the exact numbers are annotated on the bars.
ax.set_yscale('log')
ax.get_yaxis().set_visible(False)
ax.set_title('# of null values which changed to non-null values', fontsize=20)
plt.show()

Now that the first step is finished, we move on to the second step, which is **Replace with median**.

In [None]:
# Rows without a known `RainTomorrow` cannot be used for supervised learning:
# filling the target with the median would invent labels. Drop those rows,
# then replace the remaining feature nulls with each column's median.
df_labeled = df.dropna(subset=['RainTomorrow'])
df_Xnul = df_labeled.fillna(df_labeled.median())

In [None]:
# Check the non-null counts to confirm that no missing values remain.
df_Xnul.info()

# 3. Train & Test set
Before moving on, we need to make train and test set.

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows
# (shuffled with a fixed seed) as the test set.
y = df_Xnul['RainTomorrow']
X = df_Xnul.drop(columns=['RainTomorrow'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# 4. Model Selection & GridSearch
Now it's time to select best model and do some hyperparameter tuning!

In [None]:
classifier_names = ["Logistic Regression",'SGDClassifier', "Random Forest","KNN","Decision"]

# Seed the estimators that use randomness so the comparison is reproducible
# from one run to the next.
classifiers = [
    LogisticRegression(),
    SGDClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
]

# Materialise the pairs as a list: a bare `zip` object is exhausted after one
# pass, so re-running the evaluation cell would silently yield no results.
zipped_clf = list(zip(classifier_names, classifiers))

In [None]:
def classifier(classifier, t_train, c_train, t_test, c_test):
    """Score every (name, estimator) pair on the same train/test split.

    Each estimator is placed behind a ``StandardScaler`` in a pipeline, which
    is then fitted and evaluated through ``fit_classifier``.

    Parameters
    ----------
    classifier : iterable of (str, estimator)
        Display name and estimator instance for each candidate model.
    t_train, c_train
        Training features and training labels.
    t_test, c_test
        Test features and test labels.

    Returns
    -------
    list of (str, float)
        One ``(name, accuracy)`` tuple per candidate, in iteration order.
    """
    scores = []
    for name, estimator in classifier:
        # Standardise the features before they reach the estimator.
        steps = [('standardize', StandardScaler()), ('classifier', estimator)]
        candidate = Pipeline(steps)
        print("Validation result for {}".format(name))
        print(estimator)
        accuracy = fit_classifier(candidate, t_train, c_train, t_test, c_test)
        scores.append((name, accuracy))
    return scores

In [None]:
def fit_classifier(pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training data and report its test accuracy.

    The accuracy is printed as a percentage with two decimals, followed by a
    blank line, and returned as a fraction.

    Parameters
    ----------
    pipeline
        Object exposing ``fit(X, y)`` whose return value exposes ``predict(X)``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels.

    Returns
    -------
    float
        Value returned by ``accuracy_score`` for the test predictions.
    """
    predictions = pipeline.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, predictions)
    print("accuracy score: {0:.2f}%".format(score*100))
    print()
    return score

Of the 5 machine learning models applied, `Random Forest` achieved the highest accuracy score (85.29%) with default hyperparameters. <br/>
Random Forest's hyperparameters will be tuned to increase the accuracy score a little bit more.

In [None]:
# Fit and score every candidate model; `result` collects the (name, accuracy) pairs.
result = classifier(zipped_clf, X_train, y_train, X_test, y_test)

In [None]:
# Hyperparameter grid explored for the random forest.
param_list = {
    'min_samples_leaf': [3, 6, 9],
    'max_depth': [5, 10, None],
    'criterion': ['gini', 'entropy'],
}
RFC = RandomForestClassifier(random_state=42, n_jobs=-1)
# Exhaustive search over the grid with GridSearchCV's default settings.
clf = GridSearchCV(estimator=RFC, param_grid=param_list)
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out test set with the best estimator found by the grid search.
y_pred = clf.best_estimator_.predict(X_test)

In [None]:
# Test-set accuracy of the tuned model, expressed as a percentage.
accuracy_score(y_test,y_pred)*100

# Reference

* https://towardsdatascience.com/automate-the-machine-learning-model-implementation-with-sklearn-pipeline-2ef1389062c9