### Tỷ lệ tự tử của các quốc gia từ năm 1985 to 2016
### So sánh thông tin kinh tế xã hội với tỷ lệ tự tử theo năm và quốc gia

In [1]:
import folium
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import geopandas as gpd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/master.csv')  # suicide rates from 1986 to 2016

In [3]:
df.shape

(27820, 12)

In [4]:
df.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
generation            27820 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB


In [6]:
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [7]:
df[(df['country']=='Albania') & (df['year']==1987)]

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
5,Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
6,Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
7,Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
8,Albania,1987,male,55-74 years,1,137500,0.73,Albania1987,,2156624900,796,G.I. Generation
9,Albania,1987,female,5-14 years,0,311000,0.0,Albania1987,,2156624900,796,Generation X


In [8]:
df = df[df['year'] == 2013]

In [9]:
df = df[['country','suicides/100k pop']].groupby('country').sum().reset_index()

In [10]:
df.head()

Unnamed: 0,country,suicides/100k pop
0,Antigua and Barbuda,0.0
1,Argentina,103.42
2,Armenia,37.48
3,Australia,138.78
4,Austria,212.12


In [11]:
df.tail()

Unnamed: 0,country,suicides/100k pop
75,Turkmenistan,23.29
76,United Kingdom,84.92
77,United States,166.67
78,Uruguay,249.09
79,Uzbekistan,78.86


In [12]:
world_geo = gpd.read_file('data/world-countries.json')

In [13]:
world_geo.head()

Unnamed: 0,id,name,geometry
0,AFG,Afghanistan,"POLYGON ((61.21082 35.65007, 62.23065 35.27066..."
1,AGO,Angola,"MULTIPOLYGON (((16.32653 -5.87747, 16.57318 -6..."
2,ALB,Albania,"POLYGON ((20.59025 41.85540, 20.46317 41.51509..."
3,ARE,United Arab Emirates,"POLYGON ((51.57952 24.24550, 51.75744 24.29407..."
4,ARG,Argentina,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000..."


In [14]:
# đồng bộ dữ liệu dùng cho so khớp
data = pd.merge(df, world_geo[['name']], left_on='country', right_on='name', how='right')

In [15]:
data.head()

Unnamed: 0,country,suicides/100k pop,name
0,Argentina,103.42,Argentina
1,Armenia,37.48,Armenia
2,Australia,138.78,Australia
3,Austria,212.12,Austria
4,Belarus,273.32,Belarus


In [16]:
# phát hiện các name có trong world-countries.json nhưng không có trong master.csv
data[data['country'].isnull()].sort_values(by='name')['name'].values

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antarctica',
       'Azerbaijan', 'Bangladesh', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brunei', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Central African Republic',
       'Chad', 'China', 'Democratic Republic of the Congo', 'Djibouti',
       'Dominican Republic', 'East Timor', 'Egypt', 'Equatorial Guinea',
       'Eritrea', 'Ethiopia', 'Falkland Islands', 'Fiji',
       'French Southern and Antarctic Lands', 'Gabon', 'Gambia', 'Ghana',
       'Greenland', 'Guinea', 'Guinea Bissau', 'Haiti', 'Honduras',
       'India', 'Indonesia', 'Iran', 'Iraq', 'Ivory Coast', 'Jamaica',
       'Jordan', 'Kenya', 'Kosovo', 'Laos', 'Lebanon', 'Lesotho',
       'Liberia', 'Libya', 'Macedonia', 'Madagascar', 'Malawi',
       'Malaysia', 'Mali', 'Mauritania', 'Moldova', 'Mongolia',
       'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',
       'Nepal', 'New Caledonia', 'Niger', 'Niger

In [17]:
df.replace({
        'United States':'United States of America',
        'Republic of Korea':'South Korea',
        'Russian Federation':'Russia'},
        inplace=True)

In [18]:
df.sort_values(by='suicides/100k pop', ascending=False).head(10) 
# 2 nước có tỷ lệ người tự tử cao nhất: Lithuania và South Korea

Unnamed: 0,country,suicides/100k pop
43,Lithuania,444.56
58,South Korea,428.57
32,Guyana,356.38
39,Kazakhstan,296.67
70,Suriname,286.8
67,Slovenia,283.02
60,Russia,273.97
8,Belarus,273.32
33,Hungary,266.23
78,Uruguay,249.09


In [19]:
world_choropelth = folium.Map(location=[0, 0], tiles='cartodbpositron',zoom_start=1)

world_choropelth.choropleth(
    geo_data=world_geo,
    data=df,
    columns=['country','suicides/100k pop'],
    key_on='feature.properties.name',
    fill_color='YlOrRd',
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Suicide rates per 100k Population - 2013')

world_choropelth

https://www.nagarajbhat.com/post/folium-visualization/

In [20]:
top10 = df.sort_values(by='suicides/100k pop', ascending=False).head(10)
top10

Unnamed: 0,country,suicides/100k pop
43,Lithuania,444.56
58,South Korea,428.57
32,Guyana,356.38
39,Kazakhstan,296.67
70,Suriname,286.8
67,Slovenia,283.02
60,Russia,273.97
8,Belarus,273.32
33,Hungary,266.23
78,Uruguay,249.09


In [21]:
for i,r in top10.iterrows():
    t = r['country']
    p = world_geo[world_geo['name']==t]
    lat, long = p.geometry.centroid.y, p.geometry.centroid.x
    top10.loc[top10['country']==t,'lat']= lat.values[0]
    top10.loc[top10['country']==t,'long']= long.values[0]

In [22]:
top10

Unnamed: 0,country,suicides/100k pop,lat,long
43,Lithuania,444.56,55.284319,23.88064
58,South Korea,428.57,36.427599,127.821317
32,Guyana,356.38,4.790225,-58.971203
39,Kazakhstan,296.67,48.191662,67.284609
70,Suriname,286.8,4.120008,-55.911456
67,Slovenia,283.02,46.125422,14.938152
60,Russia,273.97,61.980841,96.875223
8,Belarus,273.32,53.506344,27.981354
33,Hungary,266.23,47.199951,19.357629
78,Uruguay,249.09,-32.780905,-56.003279


In [23]:
latitudes = list(top10.lat)
longitudes = list(top10.long)
countries = list(top10.country)
suicides = list(top10['suicides/100k pop'])

for lat, lng, country, suicide in zip(latitudes, longitudes, countries, suicides):
    tt = 'Nước: '+country+' -Tỷ lệ người tự tử: '+str(round(suicide,2))
    folium.Marker([lat, lng], popup='', tooltip=tt).add_to(world_choropelth)    
world_choropelth