In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import warnings # suppress warnings
warnings.filterwarnings('ignore') # suppress warnings

In [2]:
# Import functions from the package "tools"(created by Y. Kostrov)

In [3]:
from tools import fill_missing_values, create_month, \
print_missing ,assessClassifier, testClassifier, \
modify_data, convert_number_to_month

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pd.read_csv('../data/weatherAUS.csv')

In [5]:
df.RainTomorrow.dropna(axis=0,inplace=True)

In [6]:
class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.num_cols_ = None
        self.scaler_ = None
        
    def fit(slef, X, y=None):
        self.num_cols_ =[column for column in X_train.columns if X_train[column].dtype == 'float64']
        self.scaler = StandardScaler()
        self.scaler.fit(X[self.num_cols_])

        return self
    
    def transform(self, X, y=None):
        X[self.num_cols_] = self.scaler.transform(X[num_columns]) # fit and transform the data
        
        return X

In [7]:
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.loc_month_medians_ = {}
        self.month_medians_ = {}
        self.num_cols_ = None
        self.str_cols_ = None
        
    def fit(self, X, y=None):
        self.num_cols_ = [column for column in X.columns if X[column].dtype == 'float64']
        self.str_cols_ = [column for column in X.columns if X[column].dtype == 'object']
        self.loc_month_medians_ = X.groupby(['Location','Month']).median()
        self.month_medians_ = X.groupby(['Month']).median()
        return self
    
    def transform(self, X, y=None):
        print("Transforming Data Set")
        for location in X.Location.unique():
            for month in X.Month.unique():
                for column in self.num_cols_:
                    median_for_month = self.loc_month_medians_.loc[(location,month)][column]
                    if np.isnan(median_for_month):
                        median_for_month = self.month_medians_.loc[month][column]
                    idx = list(X[(X.Location == location) & (X.Month == month) & (X[column].isna())].index)
                    X.loc[idx,column] = median_for_month
        
        return X

In [8]:
class CustomCreateMonth(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['Month'] = X.Date.apply(lambda x : int(x.split('-')[1]))
        X = X.drop('Date', axis=1)
        
        return X

In [9]:
class CustomFixLocation(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.lb_ = None
    
    def fit(self, X, y=None):
        self.lb_ = LabelEncoder()
        self.lb_.fit(X['Location'])
        
        return self
    
    def transform(self, X, y=None):
        X['Location'] = self.lb_.transform(X['Location'])
        
        return X
    

In [10]:
class CustomCategoryImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.str_cols_ = None
    
    def fit(self, X, y=None):
        self.str_cols_ = [column for column in X.columns if X[column].dtype == 'object']
        return self
    
    def transform(self, X, y=None):
        for column in self.str_cols_:
            idx = X[X[column].isna()].index
            X.loc[idx,column] ='MIA'
        return X

In [11]:
class CustomCategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.enc_ = None
        self.str_cols_ = None
    
    def impute(self, X_str):
        imputer = CustomCategoryImputer()
        imputer.fit(X_str)
        X_str = imputer.transform(X_str)
        return X_str
    
    def fit(self, X, y=None):
        self.str_cols_ = [column for column in X.columns if X[column].dtype == 'object']
        self.enc_ = OneHotEncoder(sparse=False)
        X_str = X[list(self.str_cols_)]
        X_str = self.impute(X_str)
        self.enc_.fit(X_str)
        return self
    
    def transform(self, X, y=None):
        X_str = X[list(self.str_cols_)]
        X_str = self.enc_.transform(X_str)
        column_name = self.enc_.get_feature_names(self.str_cols_)
        one_hot_encoded_frame =pd.DataFrame(X_str, columns= column_name)
        l2 = list(one_hot_encoded_frame.reset_index().columns)
        l2[0] = 'index2'
        cols = list(X.reset_index().columns) + l2
        result = pd.concat([X.reset_index(),one_hot_encoded_frame.reset_index()],axis=1,ignore_index=True)
        result.columns = cols
        result.drop('index2', axis=1, inplace=True)
        result.set_index('index')
        return result

In [12]:
def convert_to_numeric(val):
    if val == "No":
        return 0
    else:
        return 1

In [13]:
df = pd.read_csv('../data/weatherAUS.csv')
idx = df[df.RainTomorrow.isna()].index
df.drop(idx, axis=0,inplace=True)
df.sample(frac=1,random_state=10)
X_t = df.drop('RainTomorrow',axis=1) # Create the data without target column
y_t = df.RainTomorrow # assign the target column
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.25, random_state=0) # split the data into train/test datasets

In [14]:
y_train = y_train.apply(convert_to_numeric)
y_test = y_test.apply(convert_to_numeric)

In [16]:
y_test.value_counts()

0    27543
1     8006
Name: RainTomorrow, dtype: int64

In [17]:
y_train.value_counts()

0    82773
1    23871
Name: RainTomorrow, dtype: int64

In [65]:
pipeline = Pipeline(steps=[
    ('create_month', CustomCreateMonth()),
    ('create_loc_number', CustomFixLocation()),
    ('num_imputer', CustomImputer()),
    ('num_scaler', CustomScaler()),
    ('cat_imputer', CustomCategoryImputer()),
    ('oneHE', OneHotEncoder(handle_unknown='ignore')),
])
"""
pipeline = Pipeline(steps=[
    ('create_month', CustomCreateMonth()),
    ('create_loc_number', CustomFixLocation()),
    ('num_imputer', CustomImputer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
"""

"\npipeline = Pipeline(steps=[\n    ('create_month', CustomCreateMonth()),\n    ('create_loc_number', CustomFixLocation()),\n    ('num_imputer', CustomImputer()),\n    ('imputer', SimpleImputer(strategy='most_frequent')),\n    ('onehot', OneHotEncoder(handle_unknown='ignore'))\n])\n"

In [59]:
full_pipeline = Pipeline(steps=[
    ('pip',pipeline),
    ('clf',RandomForestClassifier())
])

In [32]:
full_pipeline.fit(X_train,y_train)

Transforming Data Set


Pipeline(steps=[('pip',
                 Pipeline(steps=[('create_month', CustomCreateMonth()),
                                 ('create_loc_number', CustomFixLocation()),
                                 ('num_imputer', CustomImputer()),
                                 ('cat_imputer', CustomCategoryImputer()),
                                 ('oneHE',
                                  OneHotEncoder(handle_unknown='ignore'))])),
                ('log', LogisticRegression())])

In [33]:
y_pred = full_pipeline.predict(X_test)

Transforming Data Set


In [60]:
param_grid_rf = [{'n_estimators':[3,5,10,20,30,50,80,120],
    'max_features':['auto'],
    'max_depth' : [5,10,20, 30, 40, 50 ,60 , 70],
    'criterion':['gini', 'entropy']},]
params={'clf__max_features':[0.3, 0.5, 0.7],
        'clf__min_samples_leaf':[1, 2, 3],
        'clf__max_depth':[None]
        }

In [63]:
jobs = -1
RF = GridSearchCV(estimator=full_pipeline,
            param_grid=params,
            scoring='f1',cv=3,
            verbose=1,
            n_jobs=jobs,
                 )

In [64]:
RF.fit(X_train,y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Exception in thread Thread-9:
Traceback (most recent call last):
  File "/Users/yevgeniykostrov/opt/anaconda3/envs/learn-env/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/Users/yevgeniykostrov/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 567, in run
    self.flag_executor_shutting_down()
  File "/Users/yevgeniykostrov/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 756, in flag_executor_shutting_down
    self.kill_workers()
  File "/Users/yevgeniykostrov/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 766, in kill_workers
    recursive_terminate(p)
  File "/Users/yevgeniykostrov/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/backend/utils.py", line 28, in recursive_term

In [184]:
y_pred = RF.predict(X_test)

In [189]:
y_pred = RF.predict(X_test)

In [190]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     27543
           1       0.23      1.00      0.37      8006

    accuracy                           0.23     35549
   macro avg       0.11      0.50      0.18     35549
weighted avg       0.05      0.23      0.08     35549

[[    0 27543]
 [    0  8006]]
