## Analyzing the data

In [24]:
import pandas
import numpy
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from pandas import get_dummies
from pandas import Series

# Read the raw CSV and discard the columns this analysis does not use.
# (The original note on the dropped columns reads: "too many NaN values".)
filename = 'suicide-rates.csv'
unused_columns = ['HDI for year', 'country-year', 'year', 'country', 'generation']
data = pandas.read_csv(filename).drop(columns=unused_columns)

In [25]:
# Transform gdp_for_year from a comma-grouped string (such as "1,234,567") to int64.
#
# Vectorised string operations replace the previous per-row numpy.append loop,
# which copied the whole accumulated array on every iteration (quadratic in
# the number of rows).
# astype(str) comes first so the cell can be re-run after the column has
# already been converted to integers.
gdp_for_year = (
    data["gdp_for_year ($)"]
    .astype(str)
    .str.replace(",", "", regex=False)  # literal comma, not a regex pattern
    .astype(numpy.int64)
    .to_numpy()
)
data["gdp_for_year ($)"] = gdp_for_year

In [26]:
# Transform male and female from categorical values to quantitative:
# one indicator column per distinct value of 'sex' replaces the original column.
genders = get_dummies(data['sex'])
data = data.drop(columns=['sex']).join(genders)

# Transform age data from categorical values to an ordinal code (youngest = 0).
age_mapping = {
    '5-14 years': 0,
    '15-24 years': 1,
    '25-34 years': 2,
    '35-54 years': 3,
    '55-74 years': 4,
    '75+ years': 5
}
data['age'] = data['age'].map(age_mapping)
# Series.map turns any label missing from age_mapping into NaN; fail here with
# a clear message instead of letting NaN reach the scaler and the model.
if data['age'].isna().any():
    raise ValueError("'age' contains a label that is not in age_mapping")

# Scale values
# Features: MinMaxScaler rescales every column independently, so one fit over
# all numeric feature columns yields the same values as fitting column by
# column. The gender indicator columns are left untouched.
scaled_columns = ['age', 'population', 'gdp_for_year ($)', 'gdp_per_capita ($)']
feature_scaler = MinMaxScaler(feature_range=(0, 1))

X = data.drop(columns=['suicides_no', 'suicides/100k pop'])
X[scaled_columns] = feature_scaler.fit_transform(X[scaled_columns])

# Target: Y1 is the scaled 'suicides/100k pop' column.
# The previous version first assigned the scaled 'suicides_no' to Y1 and then
# overwrote it on the very next line; that dead fit has been removed.
# `scaler` stays fitted on the target, so scaler.inverse_transform maps
# predictions back to suicides per 100k.
scaler = MinMaxScaler(feature_range=(0, 1))
Y1 = scaler.fit_transform(data[['suicides/100k pop']])


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y1, test_size=0.33, random_state=7
)

# Fit a linear regression on the training rows (fit() returns the estimator).
model = LinearRegression().fit(X_train, Y_train)

# NOTE: for a regressor, score() is the coefficient of determination (R^2),
# so the figure printed under the "Accuracy" label is R^2 * 100, not a
# classification accuracy.
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result * 100.0))

Accuracy: 28.908%
