### importing libraries and modules


In [1]:
import pandas as pd
import numpy as np
import psycopg2 # needed to get database exception errors when uploading dataframe
import requests # package for getting data from the web
from zipfile import * # package for unzipping zip files

In [2]:
# Import the get_engine function from our sql_functions.
from sql_functions import get_engine

### read in data: life expectancy

In [3]:
# Load the life-expectancy CSV from the local data/ folder.
life_expectancy_df = pd.read_csv(
    "data/life_expectancy.csv",
    low_memory=False,
)

### get overview

In [4]:
life_expectancy_df.head()

Unnamed: 0,Entity,Code,Year,Indicator:Life expectancy at birth (years) - Sex:Both sexes
0,Afghanistan,AFG,2000,54.98949
1,Afghanistan,AFG,2010,59.94055
2,Afghanistan,AFG,2015,61.65429
3,Afghanistan,AFG,2019,63.2099
4,Africa,,2000,52.65365


In [5]:
life_expectancy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 4 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Entity                                                       776 non-null    object 
 1   Code                                                         736 non-null    object 
 2   Year                                                         776 non-null    int64  
 3   Indicator:Life expectancy at birth (years) - Sex:Both sexes  776 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 24.4+ KB


In [6]:
# Scratch check: 736 is the non-null count of "Code" reported by .info() above;
# dividing by 4 presumably assumes four yearly rows per entity — TODO confirm.
736/4

184.0

In [7]:
life_expectancy_df.describe()

Unnamed: 0,Year,Indicator:Life expectancy at birth (years) - Sex:Both sexes
count,776.0,776.0
mean,2011.0,70.211433
std,7.110918,8.576001
min,2000.0,31.2792
25%,2007.5,64.491197
50%,2012.5,72.07811
75%,2016.0,76.419915
max,2019.0,84.26138


In [8]:
life_expectancy_df.Entity

0      Afghanistan
1      Afghanistan
2      Afghanistan
3      Afghanistan
4           Africa
          ...     
771         Zambia
772       Zimbabwe
773       Zimbabwe
774       Zimbabwe
775       Zimbabwe
Name: Entity, Length: 776, dtype: object

## edit table: rename columns, drop "Code", round numbers

In [9]:
# Replace the raw CSV headers with short snake_case column names.
life_expectancy_df = life_expectancy_df.rename(
    columns={
        "Entity": "country",
        "Year": "year",
        "Indicator:Life expectancy at birth (years) - Sex:Both sexes": "life_expectancy",
    }
)

In [10]:
# Remove the "Code" column.
life_expectancy_df = life_expectancy_df.drop(columns="Code")

In [11]:
# Round life expectancy to one decimal place; other columns are left as they are.
life_expectancy_df = life_expectancy_df.round(
    decimals={"life_expectancy": 1},
)

In [12]:
life_expectancy_df.head()

Unnamed: 0,country,year,life_expectancy
0,Afghanistan,2000,55.0
1,Afghanistan,2010,59.9
2,Afghanistan,2015,61.7
3,Afghanistan,2019,63.2
4,Africa,2000,52.7


In [13]:
life_expectancy_df["country"].nunique()

194

In [14]:
life_expectancy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          776 non-null    object 
 1   year             776 non-null    int64  
 2   life_expectancy  776 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 18.3+ KB


In [15]:
# Preview only: the result of dropna() is not assigned, so life_expectancy_df
# itself is left unchanged by this cell.
life_expectancy_df.dropna()

Unnamed: 0,country,year,life_expectancy
0,Afghanistan,2000,55.0
1,Afghanistan,2010,59.9
2,Afghanistan,2015,61.7
3,Afghanistan,2019,63.2
4,Africa,2000,52.7
...,...,...,...
771,Zambia,2019,62.5
772,Zimbabwe,2000,46.6
773,Zimbabwe,2010,51.5
774,Zimbabwe,2015,58.5


In [16]:
life_expectancy_df.head()

Unnamed: 0,country,year,life_expectancy
0,Afghanistan,2000,55.0
1,Afghanistan,2010,59.9
2,Afghanistan,2015,61.7
3,Afghanistan,2019,63.2
4,Africa,2000,52.7


In [17]:
# Countries treated as European when the datasets are filtered on their
# "country" column. Note that, despite the name, this is a tuple.
list_european_countries = (
    "Albania", "Andorra", "Belgium", "Bosnia and Herzegovina", "Bulgaria",
    "Denmark", "Germany", "Estonia", "Finland", "France",
    "Greece", "Ireland", "Iceland", "Italy", "Kazakhstan",
    "Kosovo", "Croatia", "Latvia", "Liechtenstein", "Lithuania",
    "Luxembourg", "Malta", "Moldova", "Monaco", "Montenegro",
    "Netherlands", "North Macedonia", "Norway", "Austria", "Poland",
    "Portugal", "Romania", "Russia", "San Marino", "Sweden",
    "Switzerland", "Serbia", "Slovakia", "Slovenia", "Spain",
    "Czech Republic", "Turkey", "Ukraine", "Hungary", "Vatican",
    "United Kingdom", "Belarus",
)


In [18]:
# Keep only the rows whose country is in the European selection.
life_expectancy_df = life_expectancy_df[
    life_expectancy_df["country"].isin(list_european_countries)
]

In [19]:
# Keep only the rows for the year 2019.
life_expectancy_df = life_expectancy_df[life_expectancy_df["year"] == 2019]

In [20]:
# life_expectancy_df was already limited to these countries in an earlier cell;
# the same country mask is applied again here to build the frame that is exported.
european_life_expectancy_df = life_expectancy_df[
    life_expectancy_df["country"].isin(list_european_countries)
]

In [21]:
european_life_expectancy_df.head()

Unnamed: 0,country,year,life_expectancy
11,Albania,2019,78.0
43,Austria,2019,81.6
67,Belarus,2019,74.8
71,Belgium,2019,81.4
91,Bosnia and Herzegovina,2019,76.8


In [84]:
# Write the European life-expectancy table to an Excel workbook in the
# current working directory.
european_life_expectancy_df.to_excel(
    excel_writer="european_life_expectancy.xlsx",
    sheet_name="european_life_expectancy",
)

## new dataframe: maternal deaths

In [22]:
# Load the maternal-deaths CSV from the local data/ folder.
maternal_deaths_df = pd.read_csv(
    "data/maternal_deaths.csv",
    low_memory=False,
)

In [23]:
maternal_deaths_df.head()

Unnamed: 0,Entity,Code,Year,Maternal Mortality Ratio (Gapminder (2010) and World Bank (2015)),"GDP per capita, PPP (constant 2017 international $)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,2000,1450.0,,19542986.0,
2,Afghanistan,AFG,2001,1390.0,,19688634.0,
3,Afghanistan,AFG,2002,1300.0,1189.784668,21000258.0,
4,Afghanistan,AFG,2003,1240.0,1235.810059,22645136.0,


In [24]:
maternal_deaths_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57964 entries, 0 to 57963
Data columns (total 7 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   Entity                                                             57964 non-null  object 
 1   Code                                                               54388 non-null  object 
 2   Year                                                               57964 non-null  int64  
 3   Maternal Mortality Ratio (Gapminder (2010) and World Bank (2015))  5800 non-null   float64
 4   GDP per capita, PPP (constant 2017 international $)                6166 non-null   float64
 5   Population (historical estimates)                                  57172 non-null  float64
 6   Continent                                                          285 non-null    object 
dtypes: float64(3), int64(1)

In [25]:
# Remove the "Code" and "Continent" columns.
maternal_deaths_df = maternal_deaths_df.drop(columns=["Code", "Continent"])

In [26]:
maternal_deaths_df.head()

Unnamed: 0,Entity,Year,Maternal Mortality Ratio (Gapminder (2010) and World Bank (2015)),"GDP per capita, PPP (constant 2017 international $)",Population (historical estimates)
0,Abkhazia,2015,,,
1,Afghanistan,2000,1450.0,,19542986.0
2,Afghanistan,2001,1390.0,,19688634.0
3,Afghanistan,2002,1300.0,1189.784668,21000258.0
4,Afghanistan,2003,1240.0,1235.810059,22645136.0


In [27]:
maternal_deaths_df.describe()

Unnamed: 0,Year,Maternal Mortality Ratio (Gapminder (2010) and World Bank (2015)),"GDP per capita, PPP (constant 2017 international $)",Population (historical estimates)
count,57964.0,5800.0,6166.0,57172.0
mean,1601.738786,216.928822,17879.286262,43231850.0
std,1428.491685,297.109363,20132.683171,271768700.0
min,-10000.0,0.0,436.720367,0.0
25%,1832.0,13.0,3678.829102,139090.5
50%,1901.0,61.185,10365.77832,1312710.0
75%,1966.0,356.0,25832.031738,6061807.0
max,2021.0,2480.0,161971.46875,7909295000.0


In [28]:
# Replace the raw CSV headers with short snake_case column names.
maternal_deaths_df = maternal_deaths_df.rename(
    columns={
        "Entity": "country",
        "Year": "year",
        "Maternal Mortality Ratio (Gapminder (2010) and World Bank (2015))": "deaths_per_100,000_live_births",
        "GDP per capita, PPP (constant 2017 international $)": "gdp_per_capita",
        "Population (historical estimates)": "population",
    }
)

In [29]:
maternal_deaths_df.head(30)

Unnamed: 0,country,year,"deaths_per_100,000_live_births",gdp_per_capita,population
0,Abkhazia,2015,,,
1,Afghanistan,2000,1450.0,,19542986.0
2,Afghanistan,2001,1390.0,,19688634.0
3,Afghanistan,2002,1300.0,1189.784668,21000258.0
4,Afghanistan,2003,1240.0,1235.810059,22645136.0
5,Afghanistan,2004,1180.0,1200.277954,23553554.0
6,Afghanistan,2005,1140.0,1286.793701,24411196.0
7,Afghanistan,2006,1120.0,1315.789062,25442946.0
8,Afghanistan,2007,1090.0,1460.825806,25903306.0
9,Afghanistan,2008,1030.0,1484.114502,26427204.0


In [30]:
maternal_deaths_df["year"].max()

2021

In [31]:
maternal_deaths_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57964 entries, 0 to 57963
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country                         57964 non-null  object 
 1   year                            57964 non-null  int64  
 2   deaths_per_100,000_live_births  5800 non-null   float64
 3   gdp_per_capita                  6166 non-null   float64
 4   population                      57172 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 2.2+ MB


In [32]:
# Preview only: the result of dropna() is not assigned, so maternal_deaths_df
# itself is left unchanged by this cell.
maternal_deaths_df.dropna()

Unnamed: 0,country,year,"deaths_per_100,000_live_births",gdp_per_capita,population
3,Afghanistan,2002,1300.0,1189.784668,21000258.0
4,Afghanistan,2003,1240.0,1235.810059,22645136.0
5,Afghanistan,2004,1180.0,1200.277954,23553554.0
6,Afghanistan,2005,1140.0,1286.793701,24411196.0
7,Afghanistan,2006,1120.0,1315.789062,25442946.0
...,...,...,...,...,...
57717,Zimbabwe,2013,509.0,3681.947266,13555420.0
57718,Zimbabwe,2014,494.0,3703.897217,13855758.0
57719,Zimbabwe,2015,480.0,3707.622559,14154937.0
57720,Zimbabwe,2016,468.0,3678.217041,14452705.0


In [33]:
maternal_deaths_df.head(30)

Unnamed: 0,country,year,"deaths_per_100,000_live_births",gdp_per_capita,population
0,Abkhazia,2015,,,
1,Afghanistan,2000,1450.0,,19542986.0
2,Afghanistan,2001,1390.0,,19688634.0
3,Afghanistan,2002,1300.0,1189.784668,21000258.0
4,Afghanistan,2003,1240.0,1235.810059,22645136.0
5,Afghanistan,2004,1180.0,1200.277954,23553554.0
6,Afghanistan,2005,1140.0,1286.793701,24411196.0
7,Afghanistan,2006,1120.0,1315.789062,25442946.0
8,Afghanistan,2007,1090.0,1460.825806,25903306.0
9,Afghanistan,2008,1030.0,1484.114502,26427204.0


In [34]:
# Same preview as in an earlier cell: the dropna() result is displayed but not
# assigned, so maternal_deaths_df still contains its rows with missing values.
maternal_deaths_df.dropna()

Unnamed: 0,country,year,"deaths_per_100,000_live_births",gdp_per_capita,population
3,Afghanistan,2002,1300.0,1189.784668,21000258.0
4,Afghanistan,2003,1240.0,1235.810059,22645136.0
5,Afghanistan,2004,1180.0,1200.277954,23553554.0
6,Afghanistan,2005,1140.0,1286.793701,24411196.0
7,Afghanistan,2006,1120.0,1315.789062,25442946.0
...,...,...,...,...,...
57717,Zimbabwe,2013,509.0,3681.947266,13555420.0
57718,Zimbabwe,2014,494.0,3703.897217,13855758.0
57719,Zimbabwe,2015,480.0,3707.622559,14154937.0
57720,Zimbabwe,2016,468.0,3678.217041,14452705.0


In [35]:
# Display the rows for 2016 (nothing is assigned here).
maternal_deaths_df[maternal_deaths_df["year"] == 2016]


Unnamed: 0,country,year,"deaths_per_100,000_live_births",gdp_per_capita,population
17,Afghanistan,2016,673.0,2057.067871,3.463621e+07
513,Africa,2016,,,1.232112e+09
536,Albania,2016,16.0,12291.859375,2.881064e+06
795,Algeria,2016,113.0,11826.151367,4.033933e+07
1146,American Samoa,2016,,,5.046800e+04
...,...,...,...,...,...
56916,Western Sahara,2016,,,5.054610e+05
56938,World,2016,214.0,15803.528320,7.513474e+09
57198,Yemen,2016,165.0,,2.927401e+07
57460,Zambia,2016,222.0,3467.887451,1.676776e+07


In [36]:
# Keep only the rows whose country is in the European selection.
maternal_deaths_df = maternal_deaths_df[
    maternal_deaths_df["country"].isin(list_european_countries)
]

In [37]:
# Keep only the rows for the year 2016.
maternal_deaths_df = maternal_deaths_df[maternal_deaths_df["year"] == 2016]

In [38]:
# Discard every row that has a missing value in any column.
maternal_deaths_df = maternal_deaths_df.dropna(axis=0, how="any")

In [39]:
# maternal_deaths_df was already limited to these countries in an earlier cell;
# the same country mask is applied again here to build the frame that is exported.
european_maternal_deaths_df = maternal_deaths_df[
    maternal_deaths_df["country"].isin(list_european_countries)
]

In [40]:
european_maternal_deaths_df.head()

Unnamed: 0,country,year,"deaths_per_100,000_live_births",gdp_per_capita,population
536,Albania,2016,16.0,12291.859375,2881064.0
3256,Austria,2016,5.7,53345.742188,8736491.0
4769,Belarus,2016,3.0,17883.126953,9708111.0
5158,Belgium,2016,4.1,49829.925781,11316837.0
6467,Bosnia and Herzegovina,2016,10.0,13194.331055,3480985.0


In [85]:
# Write the European maternal-deaths table to an Excel workbook in the
# current working directory.
european_maternal_deaths_df.to_excel(
    excel_writer="european_maternal_deaths.xlsx",
    sheet_name="european_maternal_deaths",
)

## drop null values? filter to certain years/countries?

## new data: child mortality

In [41]:
# Load the child-mortality CSV from the local data/ folder.
child_mortality_df = pd.read_csv(
    "data/child_mortality.csv",
    low_memory=False,
)

In [42]:
child_mortality_df.head()

Unnamed: 0,Entity,Code,Year,Child mortality rate - Sex: all - Age: 0-4 - Variant: estimates
0,Afghanistan,AFG,1950,41.94021
1,Afghanistan,AFG,1951,41.63049
2,Afghanistan,AFG,1952,40.986248
3,Afghanistan,AFG,1953,40.33584
4,Afghanistan,AFG,1954,39.781


In [43]:
child_mortality_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18360 entries, 0 to 18359
Data columns (total 4 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Entity                                                           18360 non-null  object 
 1   Code                                                             17064 non-null  object 
 2   Year                                                             18360 non-null  int64  
 3   Child mortality rate - Sex: all - Age: 0-4 - Variant: estimates  18360 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 573.9+ KB


In [44]:
child_mortality_df.describe()

Unnamed: 0,Year,Child mortality rate - Sex: all - Age: 0-4 - Variant: estimates
count,18360.0,18360.0
mean,1985.5,8.70752
std,20.783171,8.961521
min,1950.0,0.14623
25%,1967.75,1.915498
50%,1985.5,5.062305
75%,2003.25,12.893562
max,2021.0,65.93338


In [45]:
# Remove the "Code" column.
child_mortality_df = child_mortality_df.drop(columns="Code")

In [46]:
child_mortality_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18360 entries, 0 to 18359
Data columns (total 3 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Entity                                                           18360 non-null  object 
 1   Year                                                             18360 non-null  int64  
 2   Child mortality rate - Sex: all - Age: 0-4 - Variant: estimates  18360 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 430.4+ KB


In [47]:
# Replace the raw CSV headers with short column names.
# NOTE(review): the source column is a child mortality *rate*, while the new
# name is "deaths" — confirm the shorter name is not misleading downstream.
child_mortality_df = child_mortality_df.rename(
    columns={
        "Entity": "country",
        "Year": "year",
        "Child mortality rate - Sex: all - Age: 0-4 - Variant: estimates": "deaths",
    }
)

In [48]:
# Round the "deaths" column to two decimal places; other columns are left as they are.
child_mortality_df = child_mortality_df.round(
    decimals={"deaths": 2},
)

In [49]:
child_mortality_df.head()

Unnamed: 0,country,year,deaths
0,Afghanistan,1950,41.94
1,Afghanistan,1951,41.63
2,Afghanistan,1952,40.99
3,Afghanistan,1953,40.34
4,Afghanistan,1954,39.78


In [50]:
child_mortality_df["country"].nunique()

255

In [51]:
# Preview only: the result of dropna() is not assigned, so child_mortality_df
# itself is left unchanged by this cell.
child_mortality_df.dropna()

Unnamed: 0,country,year,deaths
0,Afghanistan,1950,41.94
1,Afghanistan,1951,41.63
2,Afghanistan,1952,40.99
3,Afghanistan,1953,40.34
4,Afghanistan,1954,39.78
...,...,...,...
18355,Zimbabwe,2017,5.57
18356,Zimbabwe,2018,5.32
18357,Zimbabwe,2019,5.31
18358,Zimbabwe,2020,5.23


In [52]:
# Keep only the rows whose country is in the European selection.
child_mortality_df = child_mortality_df[
    child_mortality_df["country"].isin(list_european_countries)
]

In [53]:
# Keep only the rows for the year 2021.
child_mortality_df = child_mortality_df[child_mortality_df["year"] == 2021]

In [54]:
# child_mortality_df was already limited to these countries in an earlier cell;
# the same country mask is applied again here to build the frame that is exported.
european_child_mortality_df = child_mortality_df[
    child_mortality_df["country"].isin(list_european_countries)
]

In [55]:
european_child_mortality_df.head()

Unnamed: 0,country,year,deaths
215,Albania,2021,1.0
431,Andorra,2021,0.67
1079,Austria,2021,0.36
1511,Belarus,2021,0.27
1583,Belgium,2021,0.4


In [86]:
# Write the European child-mortality table to an Excel workbook in the
# current working directory.
european_child_mortality_df.to_excel(
    excel_writer="european_child_mortality.xlsx",
    sheet_name="european_child_mortality",
)

## new data: crime index

In [56]:
# Load the world crime index CSV from the local data/ folder.
crime_index_df = pd.read_csv(
    "data/world_crime_index.csv",
    low_memory=False,
)

In [57]:
crime_index_df.tail(20)

Unnamed: 0,Rank,City,Crime Index,Safety Index
433,434,"Groningen, Netherlands",20.8,79.2
434,435,"Tartu, Estonia",20.7,79.3
435,436,"Arhus, Denmark",20.6,79.4
436,437,"Muscat, Oman",20.54,79.46
437,438,"Basel, Switzerland",20.12,79.88
438,439,"Oradea, Romania",19.82,80.18
439,440,"Lugano, Switzerland",19.48,80.52
440,441,"Trondheim, Norway",19.41,80.59
441,442,"Eskisehir, Turkey",18.86,81.14
442,443,"Munich, Germany",18.66,81.34


In [58]:
crime_index_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          453 non-null    int64  
 1   City          453 non-null    object 
 2   Crime Index   453 non-null    float64
 3   Safety Index  453 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 14.3+ KB


In [59]:
crime_index_df.head()

Unnamed: 0,Rank,City,Crime Index,Safety Index
0,1,"Caracas, Venezuela",83.98,16.02
1,2,"Pretoria, South Africa",81.98,18.02
2,3,"Celaya, Mexico",81.8,18.2
3,4,"San Pedro Sula, Honduras",80.87,19.13
4,5,"Port Moresby, Papua New Guinea",80.71,19.29


In [60]:
# Work on a copy so the frame loaded from the CSV keeps its original headers.
crime_index_df2 = crime_index_df.copy()

# Normalise the headers: lower-case, with spaces replaced by underscores.
cols = [name.lower().replace(" ", "_") for name in crime_index_df2.columns]
crime_index_df2.columns = cols

crime_index_df2

Unnamed: 0,rank,city,crime_index,safety_index
0,1,"Caracas, Venezuela",83.98,16.02
1,2,"Pretoria, South Africa",81.98,18.02
2,3,"Celaya, Mexico",81.80,18.20
3,4,"San Pedro Sula, Honduras",80.87,19.13
4,5,"Port Moresby, Papua New Guinea",80.71,19.29
...,...,...,...,...
448,449,"Quebec City, Canada",15.14,84.86
449,450,"Taipei, Taiwan",15.05,84.95
450,451,"San Sebastian, Spain",14.86,85.14
451,452,"Doha, Qatar",13.96,86.04


In [61]:
crime_index_df2.city.values

array(['Caracas, Venezuela', 'Pretoria, South Africa', 'Celaya, Mexico',
       'San Pedro Sula, Honduras', 'Port Moresby, Papua New Guinea',
       'Durban, South Africa', 'Johannesburg, South Africa',
       'Kabul, Afghanistan', 'Rio de Janeiro, Brazil', 'Natal, Brazil',
       'Fortaleza, Brazil', 'Port Elizabeth, South Africa',
       'Recife, Brazil', 'Port of Spain, Trinidad And Tobago',
       'Baltimore, MD, United States', 'Salvador, Brazil',
       'Rosario, Argentina', 'Memphis, TN, United States',
       'Detroit, MI, United States', 'Rockhampton, Australia',
       'Cape Town, South Africa', 'Porto Alegre, Brazil',
       'Tijuana, Mexico', 'Kingston, Jamaica',
       'Bloemfontein, South Africa', 'Bradford, United Kingdom',
       'Albuquerque, NM, United States', 'Lima, Peru',
       'Guayaquil, Ecuador', 'Sao Paulo, Brazil',
       'Saint Louis, MO, United States', 'San Salvador, El Salvador',
       'Cali, Colombia', 'Mexico City, Mexico', 'Windhoek, Namibia',
       

In [62]:
type(crime_index_df2)

pandas.core.frame.DataFrame

In [63]:
crime_index_df2.head(20)

Unnamed: 0,rank,city,crime_index,safety_index
0,1,"Caracas, Venezuela",83.98,16.02
1,2,"Pretoria, South Africa",81.98,18.02
2,3,"Celaya, Mexico",81.8,18.2
3,4,"San Pedro Sula, Honduras",80.87,19.13
4,5,"Port Moresby, Papua New Guinea",80.71,19.29
5,6,"Durban, South Africa",80.6,19.4
6,7,"Johannesburg, South Africa",80.55,19.45
7,8,"Kabul, Afghanistan",79.39,20.61
8,9,"Rio de Janeiro, Brazil",77.93,22.07
9,10,"Natal, Brazil",77.69,22.31


In [64]:
crime_index_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          453 non-null    int64  
 1   city          453 non-null    object 
 2   crime_index   453 non-null    float64
 3   safety_index  453 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 14.3+ KB


In [65]:
# The raw "city" strings look like "Caracas, Venezuela" or
# "Baltimore, MD, United States": the city is the text before the first comma
# and the country is the text after the last ", ".
city2 = crime_index_df2["city"].str.split(",", n=1).str[0]
country = crime_index_df2["city"].str.rsplit(", ", n=1).str[-1]

In [66]:
type(country)

pandas.core.series.Series

In [67]:
# Add the extracted city name as the second column (modifies crime_index_df2 in place).
# allow_duplicates=True means running this cell again adds another "city2"
# column instead of raising.
crime_index_df2.insert(
    loc=1,
    column="city2",
    value=city2,
    allow_duplicates=True,
)

In [68]:
# Add the extracted country as the third column (modifies crime_index_df2 in place).
# allow_duplicates=True means running this cell again adds another "country"
# column instead of raising.
crime_index_df2.insert(
    loc=2,
    column="country",
    value=country,
    allow_duplicates=True,
)

In [69]:
crime_index_df2.head(20)

Unnamed: 0,rank,city2,country,city,crime_index,safety_index
0,1,Caracas,Venezuela,"Caracas, Venezuela",83.98,16.02
1,2,Pretoria,South Africa,"Pretoria, South Africa",81.98,18.02
2,3,Celaya,Mexico,"Celaya, Mexico",81.8,18.2
3,4,San Pedro Sula,Honduras,"San Pedro Sula, Honduras",80.87,19.13
4,5,Port Moresby,Papua New Guinea,"Port Moresby, Papua New Guinea",80.71,19.29
5,6,Durban,South Africa,"Durban, South Africa",80.6,19.4
6,7,Johannesburg,South Africa,"Johannesburg, South Africa",80.55,19.45
7,8,Kabul,Afghanistan,"Kabul, Afghanistan",79.39,20.61
8,9,Rio de Janeiro,Brazil,"Rio de Janeiro, Brazil",77.93,22.07
9,10,Natal,Brazil,"Natal, Brazil",77.69,22.31


In [70]:
# Remove the combined "city" column; its parts are kept in "city2" and "country".
crime_index_df3 = crime_index_df2.drop(columns="city")

In [71]:
crime_index_df3.head() 

Unnamed: 0,rank,city2,country,crime_index,safety_index
0,1,Caracas,Venezuela,83.98,16.02
1,2,Pretoria,South Africa,81.98,18.02
2,3,Celaya,Mexico,81.8,18.2
3,4,San Pedro Sula,Honduras,80.87,19.13
4,5,Port Moresby,Papua New Guinea,80.71,19.29


In [72]:
# Give the extracted city column the plain name "city".
crime_index_df3 = crime_index_df3.rename(
    columns={"city2": "city"},
)

In [73]:
crime_index_df3.tail(20) 

Unnamed: 0,rank,city,country,crime_index,safety_index
433,434,Groningen,Netherlands,20.8,79.2
434,435,Tartu,Estonia,20.7,79.3
435,436,Arhus,Denmark,20.6,79.4
436,437,Muscat,Oman,20.54,79.46
437,438,Basel,Switzerland,20.12,79.88
438,439,Oradea,Romania,19.82,80.18
439,440,Lugano,Switzerland,19.48,80.52
440,441,Trondheim,Norway,19.41,80.59
441,442,Eskisehir,Turkey,18.86,81.14
442,443,Munich,Germany,18.66,81.34


In [74]:
crime_index_df3.head()

Unnamed: 0,rank,city,country,crime_index,safety_index
0,1,Caracas,Venezuela,83.98,16.02
1,2,Pretoria,South Africa,81.98,18.02
2,3,Celaya,Mexico,81.8,18.2
3,4,San Pedro Sula,Honduras,80.87,19.13
4,5,Port Moresby,Papua New Guinea,80.71,19.29


In [75]:
# Preview only: the result of dropna() is not assigned, so crime_index_df3
# itself is left unchanged by this cell.
crime_index_df3.dropna()

Unnamed: 0,rank,city,country,crime_index,safety_index
0,1,Caracas,Venezuela,83.98,16.02
1,2,Pretoria,South Africa,81.98,18.02
2,3,Celaya,Mexico,81.80,18.20
3,4,San Pedro Sula,Honduras,80.87,19.13
4,5,Port Moresby,Papua New Guinea,80.71,19.29
...,...,...,...,...,...
448,449,Quebec City,Canada,15.14,84.86
449,450,Taipei,Taiwan,15.05,84.95
450,451,San Sebastian,Spain,14.86,85.14
451,452,Doha,Qatar,13.96,86.04


In [76]:
crime_index_df3.head()

Unnamed: 0,rank,city,country,crime_index,safety_index
0,1,Caracas,Venezuela,83.98,16.02
1,2,Pretoria,South Africa,81.98,18.02
2,3,Celaya,Mexico,81.8,18.2
3,4,San Pedro Sula,Honduras,80.87,19.13
4,5,Port Moresby,Papua New Guinea,80.71,19.29


## We will only look at European nations

In [77]:
# Countries treated as European when filtering on the "country" column.
# This repeats, entry for entry, the tuple already assigned to the same name
# earlier in the notebook.
list_european_countries = (
    "Albania", "Andorra", "Belgium", "Bosnia and Herzegovina", "Bulgaria",
    "Denmark", "Germany", "Estonia", "Finland", "France",
    "Greece", "Ireland", "Iceland", "Italy", "Kazakhstan",
    "Kosovo", "Croatia", "Latvia", "Liechtenstein", "Lithuania",
    "Luxembourg", "Malta", "Moldova", "Monaco", "Montenegro",
    "Netherlands", "North Macedonia", "Norway", "Austria", "Poland",
    "Portugal", "Romania", "Russia", "San Marino", "Sweden",
    "Switzerland", "Serbia", "Slovakia", "Slovenia", "Spain",
    "Czech Republic", "Turkey", "Ukraine", "Hungary", "Vatican",
    "United Kingdom", "Belarus",
)








In [78]:
# Turn the tuple of European countries into a list (same entries, same order).
list_european_countries = [*list_european_countries]

In [79]:
# Keep only the cities whose country is in the European selection.
european_crime_index_df = crime_index_df3[
    crime_index_df3["country"].isin(list_european_countries)
]

In [80]:
european_crime_index_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 25 to 450
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          170 non-null    int64  
 1   city          170 non-null    object 
 2   country       170 non-null    object 
 3   crime_index   170 non-null    float64
 4   safety_index  170 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 8.0+ KB


In [81]:
european_crime_index_df.head(10)

Unnamed: 0,rank,city,country,crime_index,safety_index
25,26,Bradford,United Kingdom,71.24,28.76
37,38,Coventry,United Kingdom,68.35,31.65
46,47,Nantes,France,65.7,34.3
52,53,Almaty,Kazakhstan,64.17,35.83
58,59,Catania,Italy,63.51,36.49
63,64,Birmingham,United Kingdom,62.68,37.32
65,66,Marseille,France,62.51,37.49
78,79,Craiova,Romania,60.2,39.8
81,82,Naples,Italy,59.96,40.04
87,88,Nice,France,59.43,40.57


In [82]:
european_crime_index_df.tail(10)

Unnamed: 0,rank,city,country,crime_index,safety_index
435,436,Arhus,Denmark,20.6,79.4
437,438,Basel,Switzerland,20.12,79.88
438,439,Oradea,Romania,19.82,80.18
439,440,Lugano,Switzerland,19.48,80.52
440,441,Trondheim,Norway,19.41,80.59
441,442,Eskisehir,Turkey,18.86,81.14
442,443,Munich,Germany,18.66,81.34
443,444,Bern,Switzerland,17.94,82.06
444,445,Zurich,Switzerland,17.26,82.74
450,451,San Sebastian,Spain,14.86,85.14


In [83]:
# Write the European crime-index table to an Excel workbook in the
# current working directory.
european_crime_index_df.to_excel(
    excel_writer="european_crime_index.xlsx",
    sheet_name="european_crime_index",
)