In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import warnings # suppress warnings
warnings.filterwarnings('ignore') # suppress warnings

In [3]:
# Import helper functions from the local "tools" package (created by Y. Kostrov)

In [4]:
from tools import fill_missing_values, create_month, \
print_missing ,assessClassifier, testClassifier, \
modify_data

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Load the raw weatherAUS CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/weatherAUS.csv')

In [7]:
# Drop the rows whose target (RainTomorrow) is missing.
# dropna(inplace=True) on the `df.RainTomorrow` Series cannot remove rows from
# `df`, so the frame itself is filtered and rebound instead.
df = df.dropna(subset=['RainTomorrow'])

In [8]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardise the float64 columns of a DataFrame with ``StandardScaler``.

    Columns of any other dtype are passed through unchanged.

    Attributes set by ``fit``:
        num_cols_: names of the float64 columns found in the fitted frame.
        scaler_: the fitted ``StandardScaler`` (``None`` when there are no
            float64 columns to scale).
    """

    def __init__(self):
        super().__init__()
        self.num_cols_ = None
        self.scaler_ = None

    def fit(self, X, y=None):
        """Learn per-column mean and scale from the float64 columns of ``X``.

        Returns:
            self
        """
        # Inspect the frame passed in, not the notebook-level X_train: inside
        # cross-validation each fold supplies its own training subset.
        self.num_cols_ = [column for column in X.columns if X[column].dtype == 'float64']
        self.scaler_ = None
        if self.num_cols_:
            self.scaler_ = StandardScaler()
            self.scaler_.fit(X[self.num_cols_])

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose float64 columns are standardised."""
        X = X.copy()  # never modify the caller's frame
        if self.scaler_ is not None:
            X[self.num_cols_] = self.scaler_.transform(X[self.num_cols_])

        return X

In [9]:
class CustomImputer(BaseEstimator, TransformerMixin):
    """Fill missing float64 values with medians grouped by Location and Month.

    ``fit`` stores, for every float64 column, the median per
    (Location, Month) pair and the median per Month. ``transform`` replaces
    each missing value with the (Location, Month) median and falls back to
    the Month median when the former is missing (all values were NaN during
    fit, or the pair was not seen during fit). A value stays NaN only when
    neither median is available.

    ``X`` must be a DataFrame containing ``Location`` and ``Month`` columns.

    Attributes set by ``fit``:
        num_cols_: float64 column names that get imputed.
        str_cols_: object column names (recorded, not used by ``transform``).
        loc_month_medians_: DataFrame of medians indexed by (Location, Month).
        month_medians_: DataFrame of medians indexed by Month.
    """

    def __init__(self):
        super().__init__()
        self.loc_month_medians_ = {}
        self.month_medians_ = {}
        self.num_cols_ = None
        self.str_cols_ = None

    def fit(self, X, y=None):
        """Compute the grouped medians from ``X``.

        Returns:
            self
        """
        # The grouping keys are excluded: after a groupby they live in the
        # index, not in the columns, so they cannot be looked up as medians.
        self.num_cols_ = [column for column in X.columns
                          if X[column].dtype == 'float64'
                          and column not in ('Location', 'Month')]
        self.str_cols_ = [column for column in X.columns if X[column].dtype == 'object']
        # Restrict the aggregation to the numeric columns; taking the median
        # of object columns raises on recent pandas versions.
        self.loc_month_medians_ = X.groupby(['Location', 'Month'])[self.num_cols_].median()
        self.month_medians_ = X.groupby('Month')[self.num_cols_].median()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing float64 values imputed."""
        print("Transforming Data Set")
        X = X.copy()  # never modify the caller's frame
        if not self.num_cols_:
            return X

        # Look up, row by row, the medians that apply to that row. reindex
        # yields NaN rows for keys that were not seen during fit instead of
        # raising KeyError.
        row_keys = pd.MultiIndex.from_arrays([X['Location'], X['Month']])
        loc_month_fill = self.loc_month_medians_.reindex(row_keys)[self.num_cols_].to_numpy()
        month_fill = self.month_medians_.reindex(X['Month'].to_numpy())[self.num_cols_].to_numpy()

        # Prefer the (Location, Month) median, fall back to the Month median.
        fill = np.where(np.isnan(loc_month_fill), month_fill, loc_month_fill)
        current = X[self.num_cols_].to_numpy()
        X[self.num_cols_] = np.where(np.isnan(current), fill, current)

        return X

In [10]:
class CustomCreateMonth(BaseEstimator, TransformerMixin):
    """Replace the ``Date`` column with an integer ``Month`` column.

    The month is the second '-'-separated field of each ``Date`` string,
    converted to ``int``. The transformer is stateless.
    """

    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without ``Date`` and with ``Month`` added.

        The input frame is left untouched, so the same frame can be passed
        through the pipeline repeatedly (e.g. several ``predict`` calls).
        """
        month = X.Date.apply(lambda x: int(x.split('-')[1]))
        # drop() returns a new frame and assign() another, so the caller's
        # DataFrame never gains a 'Month' column as a side effect.
        X = X.drop('Date', axis=1).assign(Month=month)

        return X

In [11]:
class CustomFixLocation(BaseEstimator, TransformerMixin):
    """Replace the ``Location`` labels with the integer codes of a ``LabelEncoder``.

    Attributes set by ``fit``:
        lb_: the ``LabelEncoder`` fitted on the ``Location`` column.
    """

    def __init__(self):
        super().__init__()
        self.lb_ = None

    def fit(self, X, y=None):
        """Fit the label encoder on ``X['Location']`` and return self."""
        self.lb_ = LabelEncoder()
        self.lb_.fit(X['Location'])

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``Location`` column holds the encoded labels."""
        X = X.copy()  # never modify the caller's frame
        X['Location'] = self.lb_.transform(X['Location'])

        return X
    

In [12]:
class CustomCategoryImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in object-dtype columns with the placeholder 'MIA'.

    Attributes set by ``fit``:
        str_cols_: names of the object-dtype columns found in the fitted frame.
    """

    def __init__(self):
        super().__init__()
        self.str_cols_ = None

    def fit(self, X, y=None):
        """Record which columns of ``X`` have object dtype and return self."""
        self.str_cols_ = [column for column in X.columns if X[column].dtype == 'object']
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NaNs in the recorded columns replaced by 'MIA'."""
        X = X.copy()  # never modify the caller's frame
        if self.str_cols_:
            X[self.str_cols_] = X[self.str_cols_].fillna('MIA')
        return X

In [13]:
class CustomCategoryEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object columns and keep the float64 columns.

    The output frame consists of the float64 columns of the input followed
    by one indicator column per category of every object column.

    NOTE(review): columns that are neither float64 nor object (for example
    integer columns) are not part of the output. This matches the original
    behaviour; confirm that dropping them is intended.

    Attributes set by ``fit``:
        enc_: the fitted ``OneHotEncoder``.
        str_cols_: object-dtype column names that get encoded.
        num_cols_: float64 column names that are passed through.
    """

    def __init__(self):
        super().__init__()
        self.enc_ = None
        self.str_cols_ = None
        self.num_cols_ = None

    def impute(self, X_str):
        """Fill missing values of ``X_str`` using ``CustomCategoryImputer``."""
        imputer = CustomCategoryImputer()
        imputer.fit(X_str)
        X_str = imputer.transform(X_str)
        return X_str

    def fit(self, X, y=None):
        """Record the column groups, fit the one-hot encoder and return self."""
        self.num_cols_ = [column for column in X.columns if X[column].dtype == 'float64']
        self.str_cols_ = [column for column in X.columns if X[column].dtype == 'object']
        # The `sparse=` keyword was renamed and later removed in scikit-learn;
        # use the default output and densify in transform() so the class works
        # across versions.
        self.enc_ = OneHotEncoder(handle_unknown='ignore')
        self.enc_.fit(X[self.str_cols_])

        return self

    def transform(self, X, y=None):
        """Return a new frame: float64 columns plus the one-hot indicator columns."""
        encoded = self.enc_.transform(X[list(self.str_cols_)])
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()  # sparse matrix -> dense ndarray

        # get_feature_names() was replaced by get_feature_names_out() in newer
        # scikit-learn releases; support whichever one is available.
        if hasattr(self.enc_, 'get_feature_names_out'):
            column_names = self.enc_.get_feature_names_out(self.str_cols_)
        else:
            column_names = self.enc_.get_feature_names(self.str_cols_)

        # Build the encoded block in one go and concatenate; this avoids
        # assigning column by column onto a slice of X.
        X_cat = pd.DataFrame(encoded, columns=list(column_names), index=X.index)
        return pd.concat([X[list(self.num_cols_)], X_cat], axis=1)

In [14]:
def convert_to_numeric(val):
    """Encode a target label as an integer: "No" becomes 0, anything else 1."""
    return 0 if val == "No" else 1

In [15]:
# Reload the raw data and keep only the rows that have a target value.
df = pd.read_csv('../data/weatherAUS.csv')
df = df.dropna(subset=['RainTomorrow'])
# The former `df.sample(frac=1, random_state=10)` call discarded its result and
# therefore shuffled nothing; train_test_split already shuffles (seeded by
# random_state), so the call is dropped rather than kept as a no-op.
X_t = df.drop('RainTomorrow', axis=1)  # features: everything except the target column
y_t = df.RainTomorrow  # target column
# Hold out 25% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.25, random_state=0)

In [16]:
# Encode both target series as 0/1 integers.
# NOTE: run this cell only once per split. convert_to_numeric returns 1 for
# every value other than "No", so re-running it on the already encoded series
# would turn the 0s into 1s.
y_train, y_test = (labels.apply(convert_to_numeric) for labels in (y_train, y_test))

In [17]:
# Class balance of the encoded test target (0 = "No", 1 = any other label).
y_test.value_counts()

0    27543
1     8006
Name: RainTomorrow, dtype: int64

In [18]:
# Class balance of the encoded training target (0 = "No", 1 = any other label).
y_train.value_counts()

0    82773
1    23871
Name: RainTomorrow, dtype: int64

In [19]:
# Preprocessing steps, applied in this order. The step names are part of the
# parameter paths used by the grid search, so they must stay as they are.
preprocessing_steps = [
    ('create_month', CustomCreateMonth()),        # Date -> Month
    ('create_loc_number', CustomFixLocation()),   # Location -> integer code
    ('num_imputer', CustomImputer()),             # fill missing float64 values
    ('num_scaler', CustomScaler()),               # standardise float64 columns
    ('cat_imputer', CustomCategoryImputer()),     # fill missing object values
    ('cat_encoder', CustomCategoryEncoder()),     # one-hot encode object columns
]
pipeline = Pipeline(steps=preprocessing_steps)



In [20]:
# Preprocessing followed by the classifier. The step names 'pip' and 'clf' are
# referenced by the grid-search parameter keys ('clf__...').
full_pipeline = Pipeline(steps=[
    ('pip', pipeline),
    # Seed the forest so that re-running the notebook reproduces the same model.
    ('clf', RandomForestClassifier(random_state=0))
])

In [28]:
#full_pipeline.fit(X_train,y_train)

In [29]:
#y_pred = full_pipeline.predict(X_test)
#confusion_matrix(y_test, y_pred)

In [30]:
# Hyper-parameter grid for the 'clf' step of full_pipeline (2 x 2 x 1 = 4 candidates).
params = dict(
    clf__max_features=[0.3, 0.5],
    clf__min_samples_leaf=[1, 2],
    clf__max_depth=[None],
)

In [31]:
jobs = -1  # passed to n_jobs below; -1 asks scikit-learn to use all processors
# Grid search over `params`, scored by F1 with 2-fold cross-validation.
RF = GridSearchCV(
    full_pipeline,
    params,
    scoring='f1',
    cv=2,
    verbose=1,
    n_jobs=jobs,
)

In [32]:
# Run the grid search on the training split (2 folds x 4 candidates = 8 fits).
RF.fit(X_train,y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
Transforming Data Set


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('pip',
                                        Pipeline(steps=[('create_month',
                                                         CustomCreateMonth()),
                                                        ('create_loc_number',
                                                         CustomFixLocation()),
                                                        ('num_imputer',
                                                         CustomImputer()),
                                                        ('num_scaler',
                                                         CustomScaler()),
                                                        ('cat_imputer',
                                                         CustomCategoryImputer()),
                                                        ('cat_encoder',
                                                         CustomCategoryEncoder())])),
                    

In [33]:
# Predict the held-out test set with the fitted grid search.
y_pred = RF.predict(X_test)

Transforming Data Set


In [34]:
# NOTE(review): this repeats the prediction of the previous cell and only
# recomputes y_pred; the cell looks redundant.
y_pred = RF.predict(X_test)

Transforming Data Set


In [35]:
# Per-class precision/recall/F1 on the test set, followed by the confusion
# matrix (rows = true class, columns = predicted class).
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91     27543
           1       0.77      0.53      0.63      8006

    accuracy                           0.86     35549
   macro avg       0.82      0.74      0.77     35549
weighted avg       0.85      0.86      0.85     35549

[[26268  1275]
 [ 3764  4242]]
