
### I wanted to import any required libraries, and look at the data. Also, I wanted to do some pleminary preprocessing so I could have some context before continuing with my EDA


In [1]:
import matplotlib.pyplot as plt
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)

In [2]:
df = pd.read_csv('master.csv')


In [3]:
#to see what is correlated with suicide number on the surface level
df.corr()

Unnamed: 0,year,suicides_no,population,suicides/100k pop,HDI for year,gdp_per_capita ($)
year,1.0,-0.004546,0.00885,-0.039037,0.366786,0.339134
suicides_no,-0.004546,1.0,0.616162,0.306604,0.151399,0.06133
population,0.00885,0.616162,1.0,0.008285,0.102943,0.08151
suicides/100k pop,-0.039037,0.306604,0.008285,1.0,0.074279,0.001785
HDI for year,0.366786,0.151399,0.102943,0.074279,1.0,0.771228
gdp_per_capita ($),0.339134,0.06133,0.08151,0.001785,0.771228,1.0


In [4]:
df.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype='object')

In [5]:
# Check for duplicates, which adds a new column to the dataset
df["is_duplicate"]= df.duplicated()


print(f"#total= {len(df)}")
print(f"#duplicated= {len(df[df['is_duplicate']==True])}")

#total= 27820
#duplicated= 0



The most important information to machine learn would be suicide rate / 100k. It can be learned because it can be plotted and mapped, but at the end of the day human behaviour and mental health diseases cannot be generalised and reduced to numbers. Every human's emotion is unique and cannot be generalised down to a predicted outcome. <br> 

Although usually most ML algorithms have overlap of both, classification predicts discrete class labels. Since my goal is to develop a machine learned model to predict, if the outcome would be suicide or not, as a binary dependent variable, a classification problem would be best. <br>


Upon first glance, and with the help of df.corr(), The dependent variable should be 'suicides/100k pop'. This is what we are trying to predict as it gives the most holistic information about suicides occuring in the population, regardless of the absolute total number.



Performing df.corr() on all the variables will help in ranking the features. However not all the columns are numerical and some are categorical, so they need to be converted to numerical by doing one hot encoding or factorizing. I think, personally, one hot encoding works for this particular dataset. <br>


Some preprocessing was done before my writeups. I will now check if there are any NaN values of the sort. Already checked that there are no duplicates. <br>

First, in my experience, coding langauges do not do well with spaces as characters, so I will rename some columns, like HDI for year.



In [6]:
#renaming columns
df.rename(columns={"HDI for year":"yearly_HDI"}, inplace=True)
df.rename(columns={" gdp_for_year ($) ":"gdp"}, inplace=True)

# Do we have NaN in our dataset?
df.isnull().any()

country               False
year                  False
sex                   False
age                   False
suicides_no           False
population            False
suicides/100k pop     False
country-year          False
yearly_HDI             True
gdp                   False
gdp_per_capita ($)    False
generation            False
is_duplicate          False
dtype: bool

There are only NaN values in 'HDI for year'.

In [7]:
x = df['yearly_HDI'].isnull().sum()
y = len(df['yearly_HDI'])
print ('There are',x,'entries with NaN in yearly_HDI column')
print ('That means,',100*x/y,'% of entries are NaN values in this column.')

There are 19456 entries with NaN in yearly_HDI column
That means, 69.9352983465133 % of entries are NaN values in this column.


Clearly, it would be wise to drop this feature. Lets also drop 'country' and 'year' since there is already a country-year variable making the other two redundant. Also dropping gdp since there is already a gdp per capita


In [8]:
cols = ['year','yearly_HDI','country','gdp','is_duplicate']
df=df.drop(cols,axis='columns')



In [10]:
#sanity check
df.head()

Unnamed: 0,sex,age,suicides_no,population,suicides/100k pop,country-year,gdp_per_capita ($),generation
0,male,15-24 years,21,312900,6.71,Albania1987,796,Generation X
1,male,35-54 years,16,308000,5.19,Albania1987,796,Silent
2,female,15-24 years,14,289700,4.83,Albania1987,796,Generation X
3,male,75+ years,1,21800,4.59,Albania1987,796,G.I. Generation
4,male,25-34 years,9,274300,3.28,Albania1987,796,Boomers


In [11]:
#To check missing nominal values, lets use a loop function
for col in df.columns:
    if df[col].dtype == object:
        print(col, df[col].unique())

sex ['male' 'female']
age ['15-24 years' '35-54 years' '75+ years' '25-34 years' '55-74 years'
 '5-14 years']
country-year ['Albania1987' 'Albania1988' 'Albania1989' ... 'Uzbekistan2012'
 'Uzbekistan2013' 'Uzbekistan2014']
generation ['Generation X' 'Silent' 'G.I. Generation' 'Boomers' 'Millenials'
 'Generation Z']


Luckily, there are no '?' so nothing needs to be imputed

## One Hot Encoding so that Random Tree Forest Classifier can be devised

In [12]:
def encode_onehot(_df, _f):
    _df2 = pd.get_dummies(_df[_f], prefix='', prefix_sep='').groupby(level=0, axis=1).max().add_prefix(_f+' - ')
    _df3 = pd.concat([_df, _df2], axis=1)
    _df3 = _df3.drop([_f], axis=1)
    return _df3

# Print nominal variables
for f in list(df.columns.values):
    if df[f].dtype == object:
        print(f) 

sex
age
country-year
generation


In [13]:
# Display the original
display(df['sex'][:10])

# Apply the onehot-encoding method
df_o = encode_onehot(df, 'sex')

# Check the onehot-encoded version of this feature
cols = []
for f in list(df_o.columns.values):
    if 'sex' in f:
        cols += [f]

0      male
1      male
2    female
3      male
4      male
5    female
6    female
7    female
8      male
9    female
Name: sex, dtype: object

In [14]:
# Display the onehot-encoded        
display(df_o[cols][:10])

Unnamed: 0,sex - female,sex - male
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1
5,1,0
6,1,0
7,1,0
8,0,1
9,1,0


In [15]:
#apply one hot encoding to all the nominal columns
df_o = encode_onehot(df_o, 'country-year')
df_o = encode_onehot(df_o, 'age')
df_o = encode_onehot(df_o, 'generation')


In [16]:
#shows if the one hot encoding worked
print(f'before={len(df.columns)}, after={len(df_o.columns)}')
df_o.head()

before=8, after=2339


Unnamed: 0,suicides_no,population,suicides/100k pop,gdp_per_capita ($),sex - female,sex - male,country-year - Albania1987,country-year - Albania1988,country-year - Albania1989,country-year - Albania1992,...,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent
0,21,312900,6.71,796,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,16,308000,5.19,796,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,14,289700,4.83,796,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,21800,4.59,796,0,1,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,9,274300,3.28,796,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [18]:
X = df_o.loc[:, df_o.columns != 'suicides/100k pop'].values
y = df_o.loc[:, df_o.columns == 'suicides/100k pop'].values.ravel()


In [19]:
df['suicides/100k pop'].describe()

count    27820.000000
mean        12.816097
std         18.961511
min          0.000000
25%          0.920000
50%          5.990000
75%         16.620000
max        224.970000
Name: suicides/100k pop, dtype: float64

In [21]:
#binning the variables acccordingly to apply the random tree classifier
bins = [0, 1, 6, 17,225]
labels = ['D','C', 'B', 'A']
y = df['bin_suicide'] = pd.cut(df['suicides/100k pop'], bins = bins, labels = labels, include_lowest = True)

In [22]:
#random tree classifier 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

def rf_train_test(_X_tr, _X_ts, _y_tr, _y_ts):
    # Create a new random forest classifier, with working 4 parallel cores
    rf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=None, n_jobs=4)
    # Train on training data
    rf.fit(_X_tr, _y_tr)
    # Test on training data
    y_pred = rf.predict(_X_ts)
    # Return more proper evaluation metric
    # return f1_score(_y_ts, y_pred, pos_label='recurrence-events', zero_division=0)
    # Return accuracy
    return accuracy_score(_y_ts, y_pred)

In [23]:
def eval_classifier(_X, _y, niter):
    accuracies = []
    kf = StratifiedKFold(n_splits=10,shuffle=False,random_state=None)
    for tr_ix, ts_ix in kf.split(_X, _y):
        accuracy = rf_train_test(_X[tr_ix], _X[ts_ix], _y[tr_ix], _y[ts_ix])
        accuracies += [accuracy]
    print( (f'Stratified 10-fold cross validation accuracy is '
            f'{np.mean(accuracies):.3f} {chr(177)}{np.std(accuracies):.4f} with {niter} total iterations')
         )

eval_classifier(X, y, 100)

Stratified 10-fold cross validation accuracy is 0.630 ±0.0291 with 100 total iterations


Classifier applied with a 63% accuracy