In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from matplotlib import pyplot as plt

In [None]:
# Helper Functions
def get_cat_cols(df):
    """Return the labels of the columns in ``df`` whose dtype is ``object``.

    The result is the ``columns`` index of the object-only sub-frame, so it
    is empty when ``df`` has no object columns.
    """
    object_only = df.select_dtypes(include=['object'])
    return object_only.columns
def get_numeric_cols(df):
    """Return the labels of the 16/32/64-bit int and float columns of ``df``.

    Only the six dtypes int16/int32/int64/float16/float32/float64 are
    requested; anything else (e.g. int8, unsigned ints, bool, object) is
    left out of the result.
    """
    wanted_dtypes = [
        kind + width
        for kind in ('int', 'float')
        for width in ('16', '32', '64')
    ]
    numeric_only = df.select_dtypes(include=wanted_dtypes)
    return numeric_only.columns

In [3]:
def get_rate(df, per=10000):
    """Return the aggregate suicide rate of ``df`` per ``per`` people.

    The rate is computed from column totals (total ``suicides_no`` divided by
    total ``population``), not as a mean of per-row rates.

    Parameters
    ----------
    df : DataFrame (or group) with ``suicides_no`` and ``population`` columns.
    per : number, default 10000
        Population base the rate is expressed in. The default keeps the
        original "per 10,000" behaviour; pass ``100000`` for a per-100k rate.

    Returns
    -------
    ``per * sum(suicides_no) / sum(population)``. No guard is applied for a
    zero population total; the division result is returned as-is.
    """
    total_suicides = df['suicides_no'].sum()
    total_population = df['population'].sum()
    # Multiply before dividing, matching the original evaluation order.
    return per * total_suicides / total_population

In [None]:
def run_model(X, y, model=None):
    """Score a regressor on ``(X, y)``, training a default one if none is given.

    Parameters
    ----------
    X : features handed to ``fit`` / ``predict``.
    y : targets the predictions are compared against.
    model : optional estimator exposing ``predict``. When ``None``, a
        ``RandomForestRegressor(random_state=23)`` is created and fitted on
        ``(X, y)``. A supplied model is used as-is and is NOT refitted.

    Returns
    -------
    tuple
        ``(model, r2score, mse)`` where both metrics are computed from the
        model's predictions on the same ``X`` that was passed in. The metrics
        are also printed.
    """
    if model is None:
        # No estimator supplied: build the default one and train it here.
        model = RandomForestRegressor(random_state=23)
        model.fit(X, y)

    predictions = model.predict(X)
    scores = (r2_score(y, predictions), mean_squared_error(y, predictions))
    print("R2 score is : {}, MSE is :{}".format(*scores))
    return (model,) + scores

In [None]:
def encode_categorical(df):
    """One-hot encode every ``object``-dtype column of ``df``.

    Each object column is removed and its dummy columns (named
    ``<column>_<value>``, first level dropped) are appended on the right, one
    source column at a time. The input frame is not modified; a new frame is
    returned. A frame without object columns is returned unchanged.
    """
    encoded = df
    for col in df.select_dtypes(include=['object']).columns:
        dummies = pd.get_dummies(
            encoded[col], prefix=col, prefix_sep='_', drop_first=True
        )
        remainder = encoded.drop(col, axis=1)
        encoded = pd.concat([remainder, dummies], axis=1)
    return encoded


In [None]:
def impute_and_scale(X, imputer=None, scaler=None):
    """Impute missing values in ``X`` and then scale the result.

    Parameters
    ----------
    X : DataFrame of features; its columns and index are carried over to the
        returned frame.
    imputer : optional fitted transformer exposing ``transform``. When
        ``None``, a mean-strategy imputer is created and fitted on ``X``.
    scaler : optional fitted transformer exposing ``transform``. When
        ``None``, a ``StandardScaler`` is created and fitted on the *imputed*
        data, i.e. on exactly what it is then asked to transform.

    Returns
    -------
    tuple
        ``(imputer, scaler, scd_X)`` where ``scd_X`` is the imputed and
        scaled data as a DataFrame with ``X``'s columns and index. Pass the
        returned imputer/scaler back in to apply the same fit to other data.

    Notes
    -----
    If the imputer's output has a different number of columns than ``X``,
    assigning the column labels raises ``ValueError``.
    """
    if imputer is None:
        # ``Imputer`` was removed from scikit-learn in 0.22; prefer its
        # replacement and only fall back on installs that predate it.
        try:
            from sklearn.impute import SimpleImputer as _MeanImputer
        except ImportError:
            from sklearn.preprocessing import Imputer as _MeanImputer
        imputer = _MeanImputer(strategy='mean')
        imputer.fit(X)

    imputed = pd.DataFrame(imputer.transform(X))
    imputed.columns = X.columns
    imputed.index = X.index

    if scaler is None:
        # Fit on the imputed frame rather than raw X, so the scaler never
        # sees NaNs and its statistics match the data it transforms.
        scaler = StandardScaler()
        scaler.fit(imputed)

    # Wrap the *scaled* values; the original wrapped the imputed frame again
    # and silently returned unscaled data.
    scd_X = pd.DataFrame(scaler.transform(imputed))
    scd_X.columns = imputed.columns
    scd_X.index = imputed.index

    return imputer, scaler, scd_X

In [None]:
def plot_feature_importances(model, data):
    """Show a horizontal bar chart of ``model``'s relative feature importances.

    Importances are read from ``model.feature_importances_`` and rescaled so
    the largest one equals 100. Bars are ordered by ascending importance from
    bottom to top and labelled with the matching entries of ``data.columns``.
    The chart is drawn in the right-hand panel of a 1x2 grid on a new 40x40
    figure and displayed with ``plt.show()``; nothing is returned.

    NOTE(review): assumes ``data.columns`` is in the same order as the
    features the model was fitted on - confirm against the caller.
    """
    raw_importance = model.feature_importances_
    relative_importance = 100.0 * (raw_importance / raw_importance.max())

    order = np.argsort(relative_importance)
    # Offset by 0.5 so each bar is centred between integer tick positions.
    bar_centers = np.arange(order.shape[0]) + .5

    plt.figure(figsize=(40, 40))
    ax = plt.subplot(1, 2, 2)
    ax.barh(bar_centers, relative_importance[order], align='center')
    ax.set_yticks(bar_centers)
    ax.set_yticklabels(data.columns[order])
    ax.set_xlabel('Relative Importance')
    ax.set_title('Variable Importance')
    plt.show()