In [1]:
import warnings

# Silence every warning for the rest of the session so library notices do not
# clutter the cell outputs below.
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy  as np
import matplotlib
from matplotlib import pylab as plt

#### Import Dataset

In [3]:
# Read the raw table and show its dimensions and column names as a quick sanity check.
df = pd.read_csv('Suicide Rate ML.csv')
print(df.shape, df.columns, sep='\n')

(27820, 8)
Index(['country', 'year', 'sex', 'age', 'population', 'suicides/100k pop',
       'gdp_per_capita ($)', 'generation'],
      dtype='object')


In [4]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

#### Dataset

In [5]:
# Target column; every remaining column of `df` is used as a predictor.
label = 'suicides/100k pop'
feature_names = ['country', 'year', 'sex', 'age', 'population','gdp_per_capita ($)', 'generation']

# Feature matrix (target removed) and target vector.
X, y = df.drop(columns=[label]), df[label]

#### ML pipeline

Ridge - Lecture 14 <br>
Random Forest Regression - Lecture 18 <br>
SVM rbf Regression - Lecture 19

In [6]:
# Column groups, one per preprocessing strategy used by the pipelines below.
minmax_ftrs = ['year']                                  # MinMaxScaler
ordinal_ftrs = ['age']                                  # OrdinalEncoder (ordered age bands)
onehot_ftrs = ['country', 'sex', 'generation']          # OneHotEncoder
standard_ftrs = ['population', 'gdp_per_capita ($)']    # StandardScaler

##### Ridge Regression

In [7]:
def ML_pipeline_kfold_ridge(X,y,random_state,n_folds):
    """Tune a Ridge regressor with K-fold CV and score it on a held-out test set.

    20% of the data is held out as a test set. The remaining 80% is split into
    ``n_folds`` shuffled folds. For every fold the preprocessors are fitted on
    the training part only, a grid of ``alpha`` values is tried, and the model
    with the highest validation score is evaluated on the test set.

    Relies on the module-level column lists ``minmax_ftrs``, ``ordinal_ftrs``,
    ``onehot_ftrs`` and ``standard_ftrs``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the columns named in the lists above.
    y : pandas.Series
        Regression target aligned with ``X``.
    random_state : int
        Seed used for both the train/test split and the K-fold shuffling.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    BA : float
        The per-fold best ``alpha`` belonging to the fold whose model obtained
        the highest test score.
    test_scores : list of float
        One test-set score (``Ridge.score``, i.e. R^2) per fold.
    """
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state = random_state)
    best_alphas = []
    test_scores = []
    # Honour the caller's fold count (this used to be hard-coded to 5).
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=random_state)

    # The hyper-parameter grid does not depend on the fold, so build it once.
    alpha = np.logspace(-16,1,100)
    age_order = ['5-14 years','15-24 years', '25-34 years',
                 '35-54 years','55-74 years','75+ years']

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other.iloc[train_index], X_other.iloc[CV_index]
        y_train, y_CV = y_other.iloc[train_index], y_other.iloc[CV_index]

        # (transformer, columns) pairs listed in the column order of the final
        # design matrix. Fresh transformers are created for every fold.
        # NOTE: `sparse=` and `get_feature_names()` are the pre-1.2 scikit-learn
        # spellings (newer releases use `sparse_output=` / `get_feature_names_out()`);
        # kept unchanged to match the environment that produced the recorded outputs.
        steps = [
            (MinMaxScaler(), minmax_ftrs),
            (OrdinalEncoder(categories = [age_order]), ordinal_ftrs),
            (OneHotEncoder(sparse = False, categories='auto'), onehot_ftrs),
            (StandardScaler(), standard_ftrs),
        ]
        train_parts, cv_parts, test_parts = [], [], []
        for transformer, cols in steps:
            # Fit on the training fold only; validation and test data are just transformed.
            train_arr = transformer.fit_transform(X_train[cols])
            if isinstance(transformer, OneHotEncoder):
                names = transformer.get_feature_names()
            else:
                names = cols
            # Each frame gets a fresh 0..n-1 index, so the column-wise concat
            # below lines rows up by position.
            train_parts.append(pd.DataFrame(train_arr, columns = names))
            cv_parts.append(pd.DataFrame(transformer.transform(X_CV[cols]), columns = names))
            test_parts.append(pd.DataFrame(transformer.transform(X_test[cols]), columns = names))

        X_train_ = pd.concat(train_parts, axis=1)
        X_CV_ = pd.concat(cv_parts, axis=1)
        X_test_ = pd.concat(test_parts, axis=1)

        # Keep only the best model so far; ties go to the first (smallest) alpha.
        best_score, best_alpha, best_reg = -np.inf, None, None
        for a in alpha:
            reg = Ridge(alpha = a)
            reg.fit(X_train_, y_train)
            score = reg.score(X_CV_, y_CV)
            if best_reg is None or score > best_score:
                best_score, best_alpha, best_reg = score, a, reg

        best_alphas.append(best_alpha)
        test_scores.append(best_reg.score(X_test_,y_test))

    # NOTE: the reported alpha is chosen by test score, so it is descriptive
    # only and must not be used for further model selection.
    BA = best_alphas[np.argmax(test_scores)]
    return BA, test_scores

In [8]:
# Repeat the ridge pipeline for ten different seeds (0, 42, ..., 378) and
# collect the per-fold test scores of every run.
TS_ridge = []
for seed in range(0, 420, 42):
    best_alpha, test_score = ML_pipeline_kfold_ridge(X, y, seed, 5)
    TS_ridge.append(test_score)
    print('Random state = {} and best alpha = {}.'.format(seed, best_alpha))

# Mean and spread over all seeds and folds (the score is Ridge.score, i.e. R^2).
ridge_mean = np.around(np.mean(TS_ridge),3)
ridge_std = np.around(np.std(TS_ridge),3)
print('test accuracy score:', ridge_mean, '+/-', ridge_std)

Random state = 0 and best alpha = 10.0.
Random state = 42 and best alpha = 2.056512308348643.
Random state = 84 and best alpha = 10.0.
Random state = 126 and best alpha = 6.734150657750801.
Random state = 168 and best alpha = 8.302175681319735e-14.
Random state = 210 and best alpha = 3.0538555088334123.
Random state = 252 and best alpha = 4.534878508128592.
Random state = 294 and best alpha = 2.205130739903041e-16.
Random state = 336 and best alpha = 0.2848035868435793.
Random state = 378 and best alpha = 1e-16.
test accuracy score: 0.509 +/- 0.009


##### Random Forest

In [9]:
def ML_pipeline_kfold_rf(X, y, random_state, n_folds):
    """Tune a random-forest regressor with K-fold CV and score it on a held-out test set.

    20% of the data is held out as a test set. The remaining 80% is split into
    ``n_folds`` shuffled folds. For every fold the preprocessors are fitted on
    the training part only, a grid of ``(max_depth, min_samples_split)`` pairs
    is tried, and the forest with the highest validation score is evaluated on
    the test set.

    Relies on the module-level column lists ``minmax_ftrs``, ``ordinal_ftrs``,
    ``onehot_ftrs`` and ``standard_ftrs``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the columns named in the lists above.
    y : pandas.Series
        Regression target aligned with ``X``.
    random_state : int
        Seed for the train/test split, the K-fold shuffling and the forests.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    DS : tuple of (int, int)
        The per-fold best ``(max_depth, min_samples_split)`` belonging to the
        fold whose model obtained the highest test score.
    test_scores : list of float
        One test-set score (``RandomForestRegressor.score``, i.e. R^2) per fold.
    """
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state = random_state)
    best_ds = []
    test_scores = []
    # Honour the caller's fold count (this used to be hard-coded to 5).
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=random_state)

    # The hyper-parameter grid does not depend on the fold, so build it once:
    # max_depth 7..15 crossed with min_samples_split 3..11.
    ds = [(depth, split) for depth in range(7,16) for split in range(3,12)]
    age_order = ['5-14 years','15-24 years', '25-34 years',
                 '35-54 years','55-74 years','75+ years']

    for train_index, CV_index in kf.split(X_other, y_other):
        X_train, X_CV = X_other.iloc[train_index], X_other.iloc[CV_index]
        y_train, y_CV = y_other.iloc[train_index], y_other.iloc[CV_index]

        # (transformer, columns) pairs listed in the column order of the final
        # design matrix. Fresh transformers are created for every fold.
        # NOTE: `sparse=` and `get_feature_names()` are the pre-1.2 scikit-learn
        # spellings (newer releases use `sparse_output=` / `get_feature_names_out()`);
        # kept unchanged to match the environment that produced the recorded outputs.
        steps = [
            (MinMaxScaler(), minmax_ftrs),
            (OrdinalEncoder(categories = [age_order]), ordinal_ftrs),
            (OneHotEncoder(sparse = False, categories='auto'), onehot_ftrs),
            (StandardScaler(), standard_ftrs),
        ]
        train_parts, cv_parts, test_parts = [], [], []
        for transformer, cols in steps:
            # Fit on the training fold only; validation and test data are just transformed.
            train_arr = transformer.fit_transform(X_train[cols])
            if isinstance(transformer, OneHotEncoder):
                names = transformer.get_feature_names()
            else:
                names = cols
            # Each frame gets a fresh 0..n-1 index, so the column-wise concat
            # below lines rows up by position.
            train_parts.append(pd.DataFrame(train_arr, columns = names))
            cv_parts.append(pd.DataFrame(transformer.transform(X_CV[cols]), columns = names))
            test_parts.append(pd.DataFrame(transformer.transform(X_test[cols]), columns = names))

        X_train_ = pd.concat(train_parts, axis=1)
        X_CV_ = pd.concat(cv_parts, axis=1)
        X_test_ = pd.concat(test_parts, axis=1)

        # Keep only the best forest so far instead of all 81 fitted forests;
        # ties go to the first grid point, as np.argmax would.
        best_score, best_pair, best_clf = -np.inf, None, None
        for d, s in ds:
            clf = RandomForestRegressor(n_estimators=100, random_state=random_state,
                                         max_depth=d, min_samples_split=s)
            clf.fit(X_train_, y_train)
            score = clf.score(X_CV_, y_CV)
            if best_clf is None or score > best_score:
                best_score, best_pair, best_clf = score, (d, s), clf

        best_ds.append(best_pair)
        test_scores.append(best_clf.score(X_test_,y_test))

    # NOTE: the reported pair is chosen by test score, so it is descriptive
    # only and must not be used for further model selection.
    DS = best_ds[np.argmax(test_scores)]
    return DS, test_scores

In [10]:
# Repeat the random-forest pipeline for ten different seeds (0, 42, ..., 378)
# and collect the per-fold test scores of every run.
TS_rf = []
rf_message = 'Random state = {}, best max_depth = {}, and best min_samples_split = {}.'
for seed in range(0, 420, 42):
    best_ds, test_score = ML_pipeline_kfold_rf(X, y, seed, 5)
    TS_rf.append(test_score)
    print(rf_message.format(seed, best_ds[0], best_ds[1]))

# Mean and spread over all seeds and folds (the score is the regressor's R^2).
rf_mean = np.around(np.mean(TS_rf),3)
rf_std = np.around(np.std(TS_rf),3)
print('test accuracy score:', rf_mean, '+/-', rf_std)

Random state = 0, best max_depth = 15, and best min_samples_split = 5.
Random state = 42, best max_depth = 15, and best min_samples_split = 3.
Random state = 84, best max_depth = 15, and best min_samples_split = 3.
Random state = 126, best max_depth = 15, and best min_samples_split = 5.
Random state = 168, best max_depth = 15, and best min_samples_split = 3.
Random state = 210, best max_depth = 15, and best min_samples_split = 6.
Random state = 252, best max_depth = 15, and best min_samples_split = 4.
Random state = 294, best max_depth = 15, and best min_samples_split = 4.
Random state = 336, best max_depth = 15, and best min_samples_split = 4.
Random state = 378, best max_depth = 15, and best min_samples_split = 4.
test accuracy score: 0.742 +/- 0.015


##### SVR

In [11]:
def ML_pipeline_kfold_svr(X, y, random_state, n_folds):
    """Tune an RBF support-vector regressor with K-fold CV and score it on a held-out test set.

    20% of the data is held out as a test set. The remaining 80% is split into
    ``n_folds`` shuffled folds. For every fold the preprocessors are fitted on
    the training part only, a grid of ``(gamma, C)`` pairs is tried, and the
    model with the highest validation score is evaluated on the test set.

    Relies on the module-level column lists ``minmax_ftrs``, ``ordinal_ftrs``,
    ``onehot_ftrs`` and ``standard_ftrs``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the columns named in the lists above.
    y : pandas.Series
        Regression target aligned with ``X``.
    random_state : int
        Seed used for both the train/test split and the K-fold shuffling.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    GC : tuple of (float, float)
        The per-fold best ``(gamma, C)`` belonging to the fold whose model
        obtained the highest test score.
    test_scores : list of float
        One test-set score (``SVR.score``, i.e. R^2) per fold.
    """
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state = random_state)
    best_gc = []
    test_scores = []
    # Honour the caller's fold count (this used to be hard-coded to 5).
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=random_state)

    # The hyper-parameter grid does not depend on the fold, so build it once.
    # Note that 13 points between 1e-3 and 1e10 give non-integer exponents for C.
    cs = np.logspace(-3, 10, 13)
    gammas = np.logspace(-9, 3, 13)
    gc = [(gamma, c) for gamma in gammas for c in cs]
    age_order = ['5-14 years','15-24 years', '25-34 years',
                 '35-54 years','55-74 years','75+ years']

    for train_index, CV_index in kf.split(X_other, y_other):
        X_train, X_CV = X_other.iloc[train_index], X_other.iloc[CV_index]
        y_train, y_CV = y_other.iloc[train_index], y_other.iloc[CV_index]

        # (transformer, columns) pairs listed in the column order of the final
        # design matrix. Fresh transformers are created for every fold.
        # NOTE: `sparse=` and `get_feature_names()` are the pre-1.2 scikit-learn
        # spellings (newer releases use `sparse_output=` / `get_feature_names_out()`);
        # kept unchanged to stay consistent with the other pipelines in this notebook.
        steps = [
            (MinMaxScaler(), minmax_ftrs),
            (OrdinalEncoder(categories = [age_order]), ordinal_ftrs),
            (OneHotEncoder(sparse = False, categories='auto'), onehot_ftrs),
            (StandardScaler(), standard_ftrs),
        ]
        train_parts, cv_parts, test_parts = [], [], []
        for transformer, cols in steps:
            # Fit on the training fold only; validation and test data are just transformed.
            train_arr = transformer.fit_transform(X_train[cols])
            if isinstance(transformer, OneHotEncoder):
                names = transformer.get_feature_names()
            else:
                names = cols
            # Each frame gets a fresh 0..n-1 index, so the column-wise concat
            # below lines rows up by position.
            train_parts.append(pd.DataFrame(train_arr, columns = names))
            cv_parts.append(pd.DataFrame(transformer.transform(X_CV[cols]), columns = names))
            test_parts.append(pd.DataFrame(transformer.transform(X_test[cols]), columns = names))

        X_train_ = pd.concat(train_parts, axis=1)
        X_CV_ = pd.concat(cv_parts, axis=1)
        X_test_ = pd.concat(test_parts, axis=1)

        # Keep only the best model so far instead of all 169 fitted SVRs;
        # ties go to the first grid point, as np.argmax would.
        best_score, best_pair, best_clf = -np.inf, None, None
        for g, c in gc:
            clf = SVR(gamma=g, C=c)
            clf.fit(X_train_, y_train)
            score = clf.score(X_CV_, y_CV)
            if best_clf is None or score > best_score:
                best_score, best_pair, best_clf = score, (g, c), clf

        best_gc.append(best_pair)
        # Fix: score on the test set inside the fold loop. This line used to sit
        # after the loop, so only the last fold was scored and the returned
        # (gamma, C) always came from fold 0, which did not match that score.
        test_scores.append(best_clf.score(X_test_,y_test))

    # NOTE: the reported pair is chosen by test score, so it is descriptive
    # only and must not be used for further model selection.
    GC = best_gc[np.argmax(test_scores)]
    return GC, test_scores
    

In [None]:
# Repeat the SVR pipeline for ten different seeds (0, 42, ..., 378) and
# collect the test scores of every run.
TS_svr = []
svr_message = 'Random state = {}, best gamma = {}, and best C = {}.'
for seed in range(0, 420, 42):
    best_gc, test_score = ML_pipeline_kfold_svr(X, y, seed, 5)
    TS_svr.append(test_score)
    print(svr_message.format(seed, best_gc[0], best_gc[1]))

# Mean and spread over all collected scores (the score is the regressor's R^2).
svr_mean = np.around(np.mean(TS_svr),3)
svr_std = np.around(np.std(TS_svr),3)
print('test accuracy score:', svr_mean, '+/-', svr_std)

#### Feature Importance

In [None]:
# Global (impurity-based) feature importance of a random forest, one bar chart per CV fold.
#
# NOTE(review): `preprocess` and `proc_ftrs` are not defined in any cell visible in
# this notebook. They must already exist in the kernel (a helper returning the
# transformed train/CV/test frames, and the list of transformed column names) for
# this cell to run; confirm where they come from.
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = 294)
kf = KFold(n_splits=5, shuffle=True, random_state=294)

# Per-fold hyper-parameters: max_depth (ds) and min_samples_split (ss).
# NOTE(review): presumably the values selected for each fold in an earlier tuning run; verify.
ds = [8, 5, 7, 9, 8]
ss = [300, 363, 291, 201, 273]
for ind, (train_index, CV_index) in enumerate(kf.split(X_other, y_other)):
    X_train, X_CV = X_other.iloc[train_index.tolist()], X_other.iloc[CV_index.tolist()]
    y_train, y_CV = y_other.iloc[train_index.tolist()], y_other.iloc[CV_index.tolist()]

    X_train_, X_CV_, X_test_ = preprocess(X_train, X_CV, X_test, random_state=294)

    # Fix: the target is continuous, so a regressor is required. The previous
    # RandomForestClassifier was never imported and cannot fit continuous labels.
    clf_rf = RandomForestRegressor(n_estimators=100, random_state=294, max_depth=ds[ind], min_samples_split=ss[ind])
    clf_rf.fit(X_train_, y_train)
    imp = clf_rf.feature_importances_

    print(imp)
    plt.bar(range(len(imp)), imp)
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.title("Global Feature Importance - Random Forest Regressor")
    plt.xticks(range(len(imp)), proc_ftrs, rotation='vertical')
    # One image per fold; the fold index is appended to the file name.
    plt.savefig('../figures/random_forest_global_ftr_importance{}'.format(ind), dpi=300, bbox_inches = "tight")
    plt.show()

