<a href="https://colab.research.google.com/github/chefdarek/DS-Unit-2-Classification-1/blob/master/module3-random-forests/RandomForestandFriends.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
!pip install category_encoders
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline

pd.set_option('display.float_format', '{:.2f}'.format)

train_features = pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P')
train_labels = pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f')
test_features = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

print('train_features', train_features.shape)
print('train_labels', train_labels.shape)
print('test_features', test_features.shape)
print('sample_submission', sample_submission.shape)

train_features (59400, 40)
train_labels (59400, 2)
test_features (14358, 40)
sample_submission (14358, 2)


In [None]:
def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()

    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')
    
    # Engineer feature: how many years from construction_year to date_recorded
    X['years'] = X['year_recorded'] - X['construction_year']    
    
    # Drop recorded_by (never varies) and id (always varies, random)
    X = X.drop(columns=['recorded_by', 'id'])
    
    # Drop duplicate columns
    duplicate_columns = ['quantity_group']
    X = X.drop(columns=duplicate_columns)
    
    #LONG LAT to NAN
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
    #GPS height
    X['gps_height'] = X['gps_height'].replace(0, X['gps_height'].mean() )
    
    # When columns have zeros and shouldn't, they are like null values
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        
    # For categoricals with missing values, fill with the category 'MISSING'
    categoricals = X.select_dtypes(exclude='number').columns
    for col in categoricals:
        X[col] = X[col].fillna('MISSING')
    
    return X


In [None]:
train_features = wrangle(train_features)
test_features = wrangle(test_features)

In [None]:
#train_features.isna().sum()

In [None]:
drop = ["funder",
        "ward",
]
train_features.drop(drop, axis=1, inplace=True)
test_features.drop(drop, axis=1, inplace=True)

In [6]:
X_train = train_features
y_train = train_labels['status_group']

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size=0.80, test_size=0.20, 
    stratify=y_train, random_state=42)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 38), (11880, 38), (47520,), (11880,))

In [None]:
categorical_features = [
    'quantity','extraction_type',
    'extraction_type_group','management',
    'management_group','source','source_type',
    'waterpoint_type','basin','region',
    'scheme_management','extraction_type_class',
    'payment','payment_type','waterpoint_type_group',
    
    
]

numeric_features = X_train.select_dtypes('number').columns.tolist()
features = categorical_features + numeric_features

X_train_subset = X_train[features]
X_val_subset = X_val[features]
X_test = test_features[features]

In [8]:


# Make pipeline!
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RobustScaler(),
    SimpleImputer(), 
    RandomForestClassifier(n_estimators=800, n_jobs=-1)
    
)

# Fit on train, score on val, predict on test
pipeline.fit(X_train_subset, y_train)
print('Validation Accuracy', pipeline.score(X_val_subset, y_val))
y_pred = pipeline.predict(X_test)

# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-Rob_Simple.csv', index=False)

Validation Accuracy 0.8069023569023569


In [9]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    QuantileTransformer(),
    IterativeImputer(), 
    RandomForestClassifier(n_estimators=800, n_jobs=-1)
    
)

# Fit on train, score on val, predict on test
pipeline.fit(X_train_subset, y_train)
print('Validation Accuracy', pipeline.score(X_val_subset, y_val))
y_pred = pipeline.predict(X_test)

# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-Q_Iterative.csv', index=False)

Validation Accuracy 0.8090909090909091


In [10]:
from sklearn.ensemble import AdaBoostClassifier
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    QuantileTransformer(),
    IterativeImputer(), 
    AdaBoostClassifier(n_estimators=800)
    
)

# Fit on train, score on val, predict on test
pipeline.fit(X_train_subset, y_train)
print('Validation Accuracy', pipeline.score(X_val_subset, y_val))
y_pred = pipeline.predict(X_test)

# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-ADA_Iterative.csv', index=False)

Validation Accuracy 0.7460437710437711


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [12]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    QuantileTransformer(),
    IterativeImputer(), 
    ExtraTreesClassifier(n_estimators=800, n_jobs=-1)
    
)

# Fit on train, score on val, predict on test
pipeline.fit(X_train_subset, y_train)
print('Validation Accuracy', pipeline.score(X_val_subset, y_val))
y_pred = pipeline.predict(X_test)

# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-Tree_Iterative.csv', index=False)

Validation Accuracy 0.7941077441077441


In [None]:


# Filenames of your submissions you want to ensemble
files = ['submission-Rob_Simple.csv', 'submission-ADA_Iterative.csv', 'submission-Q_Iterative.csv', 'submission-Tree_Iterative.csv']

submissions = (pd.read_csv(file)[['status_group']] for file in files)
ensemble = pd.concat(submissions, axis='columns')
majority_vote = ensemble.mode(axis='columns')[0]

submission = sample_submission.copy()
submission['status_group'] = majority_vote
submission.to_csv('my-ultimate-ensemble-submission.csv', index=False)