# Determining at which age a person is most likely to commit sucide.

### Importing the relevent libraries

In [1]:
import numpy as np  # Allows us to work with arrays.
import matplotlib.pyplot as plt  # Allows working with plots.
import pandas as pd  # importing pandas’ library for use. Allows us to import data set and manipulate it.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

### Importing the dataframe

In [2]:
sucidedataframe = pd.read_csv("who_suicide_statistics.csv")  # opens csv files and assighns them to a variable.

### Checking the data from the dataframe before pre-processing

In [3]:
sucidedataframe.head(1)  # Taking a look at the dataframe the first 3 elements of the dataset.

Unnamed: 0,country,year,sex,age,suicides_no,population
0,Albania,1985,female,15-24 years,,277900.0


In [4]:
sucidedataframe.info()  # checking Basic information on the dataframe being procesed.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43776 entries, 0 to 43775
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      43776 non-null  object 
 1   year         43776 non-null  int64  
 2   sex          43776 non-null  object 
 3   age          43776 non-null  object 
 4   suicides_no  41520 non-null  float64
 5   population   38316 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.0+ MB


## 1. Data pre-processing

In [5]:
sucidedataframe.columns  # The columns of the dataframe are viewed.

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population'], dtype='object')

In [6]:
# Below relevent data is selected that will be used in this project.
sucidedataframe = sucidedataframe[["age", "sex", "population", "suicides_no"]]
sucidedataframe.head(1)  # Taking a look at the dataframe to see if the needed changes have been made.

Unnamed: 0,age,sex,population,suicides_no
0,15-24 years,female,277900.0,


In [7]:
# Below the col names are renamed.
sucidedataframe = sucidedataframe.set_axis(["Age", "Gender", "Population", "Suicide_Committed"],axis=1)
sucidedataframe.head(1)  # Taking a look at the dataframe to see if the needed changes have been made.

Unnamed: 0,Age,Gender,Population,Suicide_Committed
0,15-24 years,female,277900.0,


In [8]:
sucidedataframe.shape  # The Entries and the columns of the dataframe are viewed.

(43776, 4)

### Dealing with null values

In [9]:
sucidedataframe.isnull().sum()  # Checking the dataframe for null values.

Age                     0
Gender                  0
Population           5460
Suicide_Committed    2256
dtype: int64

In [10]:
sucidedataframe = sucidedataframe.dropna() # droping all rows with at least one null values.

In [11]:
sucidedataframe.shape  # The Entries and the columns of the dataframe are viewed.

(36060, 4)

In [12]:
sucidedataframe.info()  # checking Basic information on the dataframe being procesed.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36060 entries, 24 to 43763
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                36060 non-null  object 
 1   Gender             36060 non-null  object 
 2   Population         36060 non-null  float64
 3   Suicide_Committed  36060 non-null  float64
dtypes: float64(2), object(2)
memory usage: 1.4+ MB


In [13]:
sucidedataframe.isnull().sum()  # Checking the dataframe for null values.

Age                  0
Gender               0
Population           0
Suicide_Committed    0
dtype: int64

In [14]:
print(sucidedataframe.pivot_table(columns=['Suicide_Committed'], aggfunc='size'))  # counts duplicates in the selected dataframe column.

Suicide_Committed
0.0        5936
1.0        2146
2.0        1489
3.0        1129
4.0         907
           ... 
20705.0       1
21063.0       1
21262.0       1
21706.0       1
22338.0       1
Length: 2231, dtype: int64


In [15]:
sucidedataframe.loc[sucidedataframe.Gender == "female", "Gender"] = 1
sucidedataframe.loc[sucidedataframe.Gender == "male", "Gender"] = 0

In [16]:
print(sucidedataframe.pivot_table(columns=['Suicide_Committed'], aggfunc='size'))  # counts duplicates in the selected dataframe column.

Suicide_Committed
0.0        5936
1.0        2146
2.0        1489
3.0        1129
4.0         907
           ... 
20705.0       1
21063.0       1
21262.0       1
21706.0       1
22338.0       1
Length: 2231, dtype: int64


In [17]:
int(sucidedataframe.loc[sucidedataframe.Suicide_Committed > 0, "Suicide_Committed"] = 1)

In [18]:
X = sucidedataframe.iloc[:, :-1].values  # selecting the values for the X variable.

In [19]:
Y =  sucidedataframe.iloc[:, -1].values  # selecting the values for the Y variable.

In [20]:
sucidedataframe.head(3)

Unnamed: 0,Age,Gender,Population,Suicide_Committed
24,15-24 years,1,289700.0,1.0
25,25-34 years,1,257200.0,1.0
26,35-54 years,1,278800.0,1.0


In [21]:
print(X)

[['15-24 years' 1 289700.0]
 ['25-34 years' 1 257200.0]
 ['35-54 years' 1 278800.0]
 ...
 ['5-14 years' 0 7291.0]
 ['55-74 years' 0 12615.0]
 ['75+ years' 0 2496.0]]


In [22]:
print(Y)

[1. 1. 1. ... 0. 0. 0.]


### One Hot Encoding

In [23]:
# count duplicates function learned from: https://datatofish.com/count-duplicates-pandas/
sucidedataframe.pivot_table(columns=['Age'], aggfunc='size')  # counts duplicates in the selected dataframe column.

Age
15-24 years    6010
25-34 years    6010
35-54 years    6010
5-14 years     6010
55-74 years    6010
75+ years      6010
dtype: int64

In [24]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')    
# transformers= [('what to do', method , [index of column to apply method on])]
# remainder='what to do with the remaining columns'

X = np.array(ct.fit_transform(X))  # applies the above method on the given array.

In [25]:
print(X)

[[1.0 0.0 0.0 ... 0.0 1 289700.0]
 [0.0 1.0 0.0 ... 0.0 1 257200.0]
 [0.0 0.0 1.0 ... 0.0 1 278800.0]
 ...
 [0.0 0.0 0.0 ... 0.0 0 7291.0]
 [0.0 0.0 0.0 ... 0.0 0 12615.0]
 [0.0 0.0 0.0 ... 1.0 0 2496.0]]


### Encoding Classification variables yes and no

In [26]:
print(sucidedataframe.pivot_table(columns=['Gender'], aggfunc='size'))  # counts duplicates in the selected dataframe column.

Gender
0    18030
1    18030
dtype: int64


In [27]:
le = LabelEncoder()  # An Object is created.

#X = le.fit_transform(X)  # The values in the y array are converted and restored from yes  
# and no to 1 and 0 respectively.

X[1] = le.fit_transform(X[1])

In [28]:
print(X)

[[1.0 0.0 0.0 ... 0.0 1 289700.0]
 [0 1 0 ... 0 1 2]
 [0.0 0.0 1.0 ... 0.0 1 278800.0]
 ...
 [0.0 0.0 0.0 ... 0.0 0 7291.0]
 [0.0 0.0 0.0 ... 0.0 0 12615.0]
 [0.0 0.0 0.0 ... 1.0 0 2496.0]]
