# Importing libraries

In [1]:
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# Changing the way pandas displays floats

In [2]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Importing dataset

In [3]:
suicides = pd.read_csv('C:\\Users\\90535\\Downloads\\suicide-rates-overview-1985-to-2016\\master.csv')

In [4]:
suicides.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


# Deleting countries with no data

In [5]:
suic_sum = suicides.groupby('country').sum()

In [6]:
list(suic_sum[suic_sum['suicides_no'] != 0].index)

['Albania',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Bosnia and Herzegovina',
 'Brazil',
 'Bulgaria',
 'Cabo Verde',
 'Canada',
 'Chile',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Ecuador',
 'El Salvador',
 'Estonia',
 'Fiji',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guyana',
 'Hungary',
 'Iceland',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Kazakhstan',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 'Latvia',
 'Lithuania',
 'Luxembourg',
 'Macau',
 'Maldives',
 'Malta',
 'Mauritius',
 'Mexico',
 'Mongolia',
 'Montenegro',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'Norway',
 'Oman',
 'Panama',
 'Paraguay',
 'Philippines',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Korea',
 'Romania',
 'Russian Federation',
 'Saint Lucia',
 'Sai

In [7]:
suicides = suicides.loc[suicides['country'].isin(list(suic_sum[suic_sum['suicides_no'] != 0].index))]

# Constraining the years of the analysis

In [8]:
suicides = suicides.drop(suicides[suicides.year.isin([1985, 1986, 1987, 1988, 1989, 2016])].index)
suicides.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
36,Albania,1992,male,35-54 years,12,343800,3.49,Albania1992,,709452584,251,Boomers
37,Albania,1992,male,15-24 years,9,263700,3.41,Albania1992,,709452584,251,Generation X
38,Albania,1992,male,55-74 years,5,159500,3.13,Albania1992,,709452584,251,Silent
39,Albania,1992,male,25-34 years,7,245500,2.85,Albania1992,,709452584,251,Boomers
40,Albania,1992,female,15-24 years,7,292400,2.39,Albania1992,,709452584,251,Generation X


# Checking dtypes

In [9]:
suicides.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object

# Renaming columns 

In [10]:
suicides.rename(columns={" gdp_for_year ($) ": "GDP/Year", "gdp_per_capita ($)": "GDP/Capita", "country": "Country",
                        "year": "Year", "sex": "Gender", "suicides_no": "Num_Suicides", "age": "Age", "population":
                        "Population", "suicides/100k pop": "Suicides/100kPop", "generation": "Generation", "HDI for year":
                         "HDI/Year"}, inplace=True)

# Dropping & Re-formatting Columns

In [11]:
# Dropping this column, redundant information
suicides.drop("country-year", axis=1, inplace=True)

In [12]:
# Re-formatting the columns GDP/Year so it can be converted to float
suicides["GDP/Year"] = suicides["GDP/Year"].apply(lambda x: x.replace(",", ""))

In [13]:
suicides.astype({"GDP/Year": 'float'}).dtypes

Country              object
Year                  int64
Gender               object
Age                  object
Num_Suicides          int64
Population            int64
Suicides/100kPop    float64
HDI/Year            float64
GDP/Year            float64
GDP/Capita            int64
Generation           object
dtype: object

In [14]:
suicides.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24612 entries, 36 to 27819
Data columns (total 11 columns):
Country             24612 non-null object
Year                24612 non-null int64
Gender              24612 non-null object
Age                 24612 non-null object
Num_Suicides        24612 non-null int64
Population          24612 non-null int64
Suicides/100kPop    24612 non-null float64
HDI/Year            7920 non-null float64
GDP/Year            24612 non-null object
GDP/Capita          24612 non-null int64
Generation          24612 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 2.3+ MB


In [15]:
suicides.shape[0] - suicides['HDI/Year'].isna().sum()

7920

In [16]:
# Since we have so many null values and we won't be using this column, we decided to drop it
suicides.drop("HDI/Year", axis=1, inplace=True)

In [17]:
# Checking null values 
suicides.isna().sum()

Country             0
Year                0
Gender              0
Age                 0
Num_Suicides        0
Population          0
Suicides/100kPop    0
GDP/Year            0
GDP/Capita          0
Generation          0
dtype: int64

In [18]:
suicides.describe()

Unnamed: 0,Year,Num_Suicides,Population,Suicides/100kPop,GDP/Capita
count,24612.0,24612.0,24612.0,24612.0,24612.0
mean,2002.91809,247.35787,1859937.87762,12.91848,17866.15651
std,7.19399,927.35144,3945249.44288,19.13124,19568.29462
min,1990.0,0.0,278.0,0.0,251.0
25%,1997.0,3.0,100754.75,0.96,3724.0
50%,2003.0,26.0,444998.5,5.98,10220.0
75%,2009.0,132.0,1497711.5,16.66,26671.0
max,2015.0,22338.0,43805214.0,224.97,126352.0


In [19]:
suicides[suicides['Num_Suicides'] == 0]

Unnamed: 0,Country,Year,Gender,Age,Num_Suicides,Population,Suicides/100kPop,GDP/Year,GDP/Capita,Generation
44,Albania,1992,female,5-14 years,0,336700,0.00000,709452584,251,Millenials
45,Albania,1992,female,75+ years,0,38700,0.00000,709452584,251,G.I. Generation
46,Albania,1992,male,5-14 years,0,362900,0.00000,709452584,251,Millenials
47,Albania,1992,male,75+ years,0,23900,0.00000,709452584,251,G.I. Generation
59,Albania,1993,female,75+ years,0,39300,0.00000,1228071038,437,G.I. Generation
...,...,...,...,...,...,...,...,...,...,...
27363,Uruguay,1998,female,5-14 years,0,262973,0.00000,25385928198,8420,Millenials
27459,Uruguay,2006,female,5-14 years,0,260187,0.00000,19579457966,6362,Millenials
27471,Uruguay,2007,female,5-14 years,0,257931,0.00000,23410572634,7581,Generation Z
27495,Uruguay,2009,male,5-14 years,0,263516,0.00000,31660911277,10166,Generation Z


In [29]:
suicides.to_csv("C:\\Users\\90535\\IRONHACK\\suicides_cleaned.csv", index=False)

In [21]:
driver = 'mysql+pymysql'
host = "34.90.32.189"
username = 'root'
password = '123456789'
db = "suicides1"
connection_string = f'{driver}://{username}:{password}@{host}/{db}'
connection=create_engine(connection_string)

In [22]:
suicides.head()

Unnamed: 0,Country,Year,Gender,Age,Num_Suicides,Population,Suicides/100kPop,GDP/Year,GDP/Capita,Generation
36,Albania,1992,male,35-54 years,12,343800,3.49,709452584,251,Boomers
37,Albania,1992,male,15-24 years,9,263700,3.41,709452584,251,Generation X
38,Albania,1992,male,55-74 years,5,159500,3.13,709452584,251,Silent
39,Albania,1992,male,25-34 years,7,245500,2.85,709452584,251,Boomers
40,Albania,1992,female,15-24 years,7,292400,2.39,709452584,251,Generation X


In [28]:
suicides.to_sql("suicides1", con=connection, index=False)