# Importing libraries

In [1]:
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# Changing the way pandas displays floats

In [2]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Importing dataset

In [3]:
suicides = pd.read_csv('C:\\Users\\90535\\Downloads\\suicide-rates-overview-1985-to-2016\\master.csv')

In [4]:
suicides.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


# Checking dtypes

In [5]:
suicides.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object

# Renaming columns 

In [6]:
suicides.rename(columns={" gdp_for_year ($) ": "GDP/Year", "gdp_per_capita ($)": "GDP/Capita", "country": "Country",
                        "year": "Year", "sex": "Gender", "suicides_no": "Num_Suicides", "age": "Age", "population":
                        "Population", "suicides/100k pop": "Suicides/100kPop", "generation": "Generation", "HDI for year":
                         "HDI/Year"}, inplace=True)

In [7]:
# Dropping this column, redundant information
suicides.drop("country-year", axis=1, inplace=True)

In [8]:
# Re-formatting the columns GDP/Year so it can be converted to float
suicides["GDP/Year"] = suicides["GDP/Year"].apply(lambda x: x.replace(",", ""))

In [9]:
suicides.astype({"GDP/Year": 'float'}).dtypes

Country              object
Year                  int64
Gender               object
Age                  object
Num_Suicides          int64
Population            int64
Suicides/100kPop    float64
HDI/Year            float64
GDP/Year            float64
GDP/Capita            int64
Generation           object
dtype: object

In [10]:
suicides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 11 columns):
Country             27820 non-null object
Year                27820 non-null int64
Gender              27820 non-null object
Age                 27820 non-null object
Num_Suicides        27820 non-null int64
Population          27820 non-null int64
Suicides/100kPop    27820 non-null float64
HDI/Year            8364 non-null float64
GDP/Year            27820 non-null object
GDP/Capita          27820 non-null int64
Generation          27820 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 2.3+ MB


In [11]:
suicides.shape[0] - suicides['HDI/Year'].isna().sum()

8364

In [12]:
# Since we have so many null values and we won't be using this column, we decided to drop it
suicides.drop("HDI/Year", axis=1, inplace=True)

In [13]:
# Checking null values 
suicides.isna().sum()

Country             0
Year                0
Gender              0
Age                 0
Num_Suicides        0
Population          0
Suicides/100kPop    0
GDP/Year            0
GDP/Capita          0
Generation          0
dtype: int64

In [14]:
suicides.describe()

Unnamed: 0,Year,Num_Suicides,Population,Suicides/100kPop,GDP/Capita
count,27820.0,27820.0,27820.0,27820.0,27820.0
mean,2001.25838,242.57441,1844793.6174,12.8161,16866.46441
std,8.46906,902.04792,3911779.44176,18.96151,18887.57647
min,1985.0,0.0,278.0,0.0,251.0
25%,1995.0,3.0,97498.5,0.92,3447.0
50%,2002.0,25.0,430150.0,5.99,9372.0
75%,2008.0,131.0,1486143.25,16.62,24874.0
max,2016.0,22338.0,43805214.0,224.97,126352.0


In [15]:
suicides[suicides['Num_Suicides'] == 0]

Unnamed: 0,Country,Year,Gender,Age,Num_Suicides,Population,Suicides/100kPop,GDP/Year,GDP/Capita,Generation
9,Albania,1987,female,5-14 years,0,311000,0.00000,2156624900,796,Generation X
10,Albania,1987,female,55-74 years,0,144600,0.00000,2156624900,796,G.I. Generation
11,Albania,1987,male,5-14 years,0,338200,0.00000,2156624900,796,Generation X
22,Albania,1988,female,5-14 years,0,317200,0.00000,2126000000,769,Generation X
23,Albania,1988,male,5-14 years,0,345000,0.00000,2126000000,769,Generation X
...,...,...,...,...,...,...,...,...,...,...
27363,Uruguay,1998,female,5-14 years,0,262973,0.00000,25385928198,8420,Millenials
27459,Uruguay,2006,female,5-14 years,0,260187,0.00000,19579457966,6362,Millenials
27471,Uruguay,2007,female,5-14 years,0,257931,0.00000,23410572634,7581,Generation Z
27495,Uruguay,2009,male,5-14 years,0,263516,0.00000,31660911277,10166,Generation Z


In [20]:
driver = 'mysql+pymysql'
host = "34.90.32.189"
username = 'root'
password = '123456789'
db = "suicides1"
connection_string = f'{driver}://{username}:{password}@{host}/{db}'
connection=create_engine(connection_string)
engine = create_engine(connection_string)

In [21]:
suicides.head()

Unnamed: 0,Country,Year,Gender,Age,Num_Suicides,Population,Suicides/100kPop,GDP/Year,GDP/Capita,Generation
0,Albania,1987,male,15-24 years,21,312900,6.71,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,2156624900,796,Boomers


In [22]:
suicides.to_sql("suicides1", con=connection)

In [31]:
suicides_year = suicides.groupby("Year").sum().sort_values("Num_Suicides", ascending=False)
suicides_year.drop("")
suicides_year.head()

Unnamed: 0_level_0,Num_Suicides,Population,Suicides/100kPop,GDP/Capita
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999,256119,1776363155,14473.91,12780864
2002,256095,1822152815,14227.72,13017420
2003,256079,1838458020,13627.58,15187104
2000,255832,1799227908,14387.45,12865476
2001,250652,1755565489,14276.21,12677892
