## This script contains:
#### 01. Import libraries
#### 02. Import data
#### 03. Data wrangling
#### 04. Consistency checks
#### 05. Plotting a choropleth
#### 06. Results

# 01. Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json
import matplotlib.pyplot as plt

In [None]:
# This command prompts matplotlib visuals to appear in the notebook

%matplotlib inline

# 02. Import data

In [None]:
# Path to the world-countries GeoJSON file. Only the location is stored here;
# the file contents are read later (json.load below, and folium's geo_data).

country_geo = r'C:\Users\ejgor\OneDrive\Data Analytics\Data Immersion\Achievement 6 - Advanced Analytics & Dashboard Design\World Happiness Report Analysis\02 Data\Original Data\countries.geojson'

In [None]:
import json

# Load the GeoJSON into a dictionary. The context manager closes the file
# handle even if parsing fails, and the explicit UTF-8 encoding avoids
# decoding country names with the platform's locale default.
# country_geo is the path defined in the cell above.
with open(country_geo, encoding='utf-8') as f:
    data = json.load(f)

# Print the properties of every feature so the available keys
# (e.g. the country-name field used to join the data) can be inspected
for feature in data['features']:
    properties = feature['properties']
    print(properties)

In [None]:
# Root folder of the project; the prepared data file is located relative to it

path = r'C:\Users\ejgor\OneDrive\Data Analytics\Data Immersion\Achievement 6 - Advanced Analytics & Dashboard Design\World Happiness Report Analysis'

In [None]:
# Build the location of the prepared CSV first, then load it into a dataframe

whr_csv = os.path.join(path, '02 Data', 'Prepared Data', 'WHR_2015-2019.csv')
df = pd.read_csv(whr_csv, index_col=False)

In [None]:
# check if df was imported correctly

df.shape

In [None]:
df.info()

In [None]:
df.head()

# 03. Data wrangling

In [None]:
# Rename countries so they match the names used in the GeoJSON file.
# A single mapping-based replace does all renames in one pass instead of
# creating an intermediate dataframe copy per country; df5 is the only
# result the later cells use.

country_name_map = {
    'United States': 'United States of America',
    'Tanzania': 'United Republic of Tanzania',
    'Congo (Brazzaville)': 'Republic of Congo',
    'Congo (Kinshasa)': 'Democratic Republic of the Congo',
}
df5 = df.replace(country_name_map)

In [None]:
# Names of the columns kept for the analysis, in display order

columns = [
    'Country',
    'Rank',
    'Score',
    'Support',
    'GDP',
    'Health',
    'Freedom',
    'Generosity',
    'Corruption',
    'Year',
]

In [None]:
# Keep every row, but only the columns listed in `columns`

country_rec = df5.loc[:, columns]

In [None]:
country_rec.head()

# 04. Consistency checks

In [None]:
# Count missing values per column: flag each missing cell, then total the flags

missing_flags = country_rec.isnull()
missing_flags.sum()

#### No missing values

In [None]:
# check for duplicates: flag each row that repeats an earlier row
# (the result has one True/False entry per row of country_rec)

dups = country_rec.duplicated()

In [None]:
# dups.shape only reports how many rows were checked; summing the boolean
# flags gives the actual number of duplicated rows (0 means no duplicates)
dups.sum()

#### No duplicate values

In [None]:
# Extreme values check: histogram (with density curve) of the Score column

score_values = country_rec['Score']
sns.histplot(score_values, bins=25, kde=True)

#### Normal distribution

In [None]:
# Extreme values check: histogram (with density curve) of the Support column

support_values = country_rec['Support']
sns.histplot(support_values, bins=25, kde=True)

In [None]:
# Extreme values check: histogram (with density curve) of the GDP column

gdp_values = country_rec['GDP']
sns.histplot(gdp_values, bins=25, kde=True)

#### Left skewed distribution

In [None]:
# Extreme values check: histogram (with density curve) of the Health column

health_values = country_rec['Health']
sns.histplot(health_values, bins=25, kde=True)

In [None]:
# Extreme values check: histogram (with density curve) of the Freedom column

freedom_values = country_rec['Freedom']
sns.histplot(freedom_values, bins=25, kde=True)

#### Left skewed distribution

In [None]:
# Extreme values check: histogram (with density curve) of the Generosity column

generosity_values = country_rec['Generosity']
sns.histplot(generosity_values, bins=25, kde=True)

#### Right skewed distribution

In [None]:
# Extreme values check: histogram (with density curve) of the Corruption column

corruption_values = country_rec['Corruption']
sns.histplot(corruption_values, bins=25, kde=True)

#### Right skewed distribution
#### No extreme values for each variable

# 05. Plotting a choropleth

## Country and Happiness Score

In [None]:
# Keep only the rows whose Year is 2015

is_2015 = country_rec["Year"] == 2015
df_2015 = country_rec[is_2015]

In [None]:
df_2015.head()

In [None]:
# Re-inspect the properties of each GeoJSON feature before choosing the join key
for geo_feature in data['features']:
    print(geo_feature['properties'])

In [None]:
# Base world map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map1 = folium.Map(zoom_start=1.5, location=[100, 0])

# 2015 happiness-score layer: GeoJSON features are joined to df_2015 through
# their ADMIN property and coloured by the Score column

score_layer_2015 = folium.Choropleth(
    geo_data=country_geo,
    data=df_2015,
    columns=['Country', 'Score'],
    key_on='feature.properties.ADMIN',
    fill_color='YlOrBr',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Happiness Score",
    highlight=True,
    nan_fill_color='gray',
    reset=True,
)
score_layer_2015.add_to(map1)

# Add the layer toggle, then show the map as the cell output
layer_toggle_1 = folium.LayerControl()
layer_toggle_1.add_to(map1)

map1

#### In 2015, North America, Australia, parts of South America, and parts of Europe have the highest happiness score. South Asia and Africa have the lowest happiness score.

In [None]:
map1.save('plot_data1.html')

In [None]:
# Keep only the rows whose Year is 2016

is_2016 = country_rec["Year"] == 2016
df_2016 = country_rec[is_2016]

In [None]:
df_2016.head()

In [None]:
# Base world map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map2 = folium.Map(zoom_start=1.5, location=[100, 0])

# 2016 happiness-score layer: GeoJSON features are joined to df_2016 through
# their ADMIN property and coloured by the Score column

score_layer_2016 = folium.Choropleth(
    geo_data=country_geo,
    data=df_2016,
    columns=['Country', 'Score'],
    key_on='feature.properties.ADMIN',
    fill_color='YlOrBr',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Happiness Score",
    highlight=True,
    nan_fill_color='gray',
    reset=True,
)
score_layer_2016.add_to(map2)

# Add the layer toggle, then show the map as the cell output
layer_toggle_2 = folium.LayerControl()
layer_toggle_2.add_to(map2)

map2

#### In 2016, the same continents (countries) have the highest happiness score, but parts of southern Asia got a better happiness score.

In [None]:
map2.save('plot_data2.html')

In [None]:
# Keep only the rows whose Year is 2017

is_2017 = country_rec["Year"] == 2017
df_2017 = country_rec[is_2017]

In [None]:
df_2017.head()

In [None]:
# Base world map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map3 = folium.Map(zoom_start=1.5, location=[100, 0])

# 2017 happiness-score layer: GeoJSON features are joined to df_2017 through
# their ADMIN property and coloured by the Score column

score_layer_2017 = folium.Choropleth(
    geo_data=country_geo,
    data=df_2017,
    columns=['Country', 'Score'],
    key_on='feature.properties.ADMIN',
    fill_color='YlOrBr',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Happiness Score",
    highlight=True,
    nan_fill_color='gray',
    reset=True,
)
score_layer_2017.add_to(map3)

# Add the layer toggle, then show the map as the cell output
layer_toggle_3 = folium.LayerControl()
layer_toggle_3.add_to(map3)

map3

#### In 2017, the same continents (countries) have the highest happiness score. Asia's happiness score went up, but Brazil and Mexico's happiness score dropped.

In [None]:
map3.save('plot_data3.html')

In [None]:
# Keep only the rows whose Year is 2018

is_2018 = country_rec["Year"] == 2018
df_2018 = country_rec[is_2018]

In [None]:
df_2018.head()

In [None]:
# Base world map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map4 = folium.Map(zoom_start=1.5, location=[100, 0])

# 2018 happiness-score layer: GeoJSON features are joined to df_2018 through
# their ADMIN property and coloured by the Score column

score_layer_2018 = folium.Choropleth(
    geo_data=country_geo,
    data=df_2018,
    columns=['Country', 'Score'],
    key_on='feature.properties.ADMIN',
    fill_color='YlOrBr',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Happiness Score",
    highlight=True,
    nan_fill_color='gray',
    reset=True,
)
score_layer_2018.add_to(map4)

# Add the layer toggle, then show the map as the cell output
layer_toggle_4 = folium.LayerControl()
layer_toggle_4.add_to(map4)

map4

#### In 2018, the same continents (countries) have the highest happiness score, but Asia dropped in happiness score.

In [None]:
map4.save('plot_data4.html')

In [None]:
# Keep only the rows whose Year is 2019

is_2019 = country_rec["Year"] == 2019
df_2019 = country_rec[is_2019]

In [None]:
df_2019.head()

In [None]:
# Base world map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map5 = folium.Map(zoom_start=1.5, location=[100, 0])

# 2019 happiness-score layer: GeoJSON features are joined to df_2019 through
# their ADMIN property and coloured by the Score column

score_layer_2019 = folium.Choropleth(
    geo_data=country_geo,
    data=df_2019,
    columns=['Country', 'Score'],
    key_on='feature.properties.ADMIN',
    fill_color='YlOrBr',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Happiness Score",
    highlight=True,
    nan_fill_color='gray',
    reset=True,
)
score_layer_2019.add_to(map5)

# Add the layer toggle, then show the map as the cell output
layer_toggle_5 = folium.LayerControl()
layer_toggle_5.add_to(map5)

map5

#### In 2019, the same continents (countries) have the highest happiness score, but the US dropped in happiness score.

In [None]:
map5.save('plot_data5.html')

## Country and GDP

In [None]:
# set up a folium map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map6 = folium.Map(location=[100, 0], zoom_start=1.5)

# 2015 GDP layer: GeoJSON features are joined to df_2015 through their ADMIN
# property and coloured by the GDP column. The legend is labelled "GDP" to
# match the plotted column (it previously read "Happiness Score").

folium.Choropleth(
    geo_data = country_geo, 
    data = df_2015,
    columns = ['Country', 'GDP'],
    key_on = 'feature.properties.ADMIN',
    fill_color = 'YlOrBr',
    fill_opacity = 0.6,
    line_opacity = 0.1,
    legend_name = "GDP",
    highlight = True,
    nan_fill_color = 'gray',
    reset = True
).add_to(map6)

folium.LayerControl().add_to(map6)

map6

#### In 2015, the countries with high GDP were Norway followed by the US, Canada, Australia, and some of Europe and Asia.

In [None]:
map6.save('plot_data6.html')

In [None]:
# set up a folium map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map7 = folium.Map(location=[100, 0], zoom_start=1.5)

# 2016 GDP layer: GeoJSON features are joined to df_2016 through their ADMIN
# property and coloured by the GDP column. The legend is labelled "GDP" to
# match the plotted column (it previously read "Happiness Score").

folium.Choropleth(
    geo_data = country_geo, 
    data = df_2016,
    columns = ['Country', 'GDP'],
    key_on = 'feature.properties.ADMIN',
    fill_color = 'YlOrBr',
    fill_opacity = 0.6,
    line_opacity = 0.1,
    legend_name = "GDP",
    highlight = True,
    nan_fill_color = 'gray',
    reset = True
).add_to(map7)

folium.LayerControl().add_to(map7)

map7

#### In 2016, the countries with high GDP were Norway followed by the US, Canada, Australia, and some of Europe and Asia.

In [None]:
map7.save('plot_data7.html')

In [None]:
# set up a folium map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map8 = folium.Map(location=[100, 0], zoom_start=1.5)

# 2017 GDP layer: GeoJSON features are joined to df_2017 through their ADMIN
# property and coloured by the GDP column. The legend is labelled "GDP" to
# match the plotted column (it previously read "Happiness Score").

folium.Choropleth(
    geo_data = country_geo, 
    data = df_2017,
    columns = ['Country', 'GDP'],
    key_on = 'feature.properties.ADMIN',
    fill_color = 'YlOrBr',
    fill_opacity = 0.6,
    line_opacity = 0.1,
    legend_name = "GDP",
    highlight = True,
    nan_fill_color = 'gray',
    reset = True
).add_to(map8)

folium.LayerControl().add_to(map8)

map8

#### In 2017, Mexico's GDP slightly increased.

In [None]:
map8.save('plot_data8.html')

In [None]:
# set up a folium map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map9 = folium.Map(location=[100, 0], zoom_start=1.5)

# 2018 GDP layer: GeoJSON features are joined to df_2018 through their ADMIN
# property and coloured by the GDP column. The legend is labelled "GDP" to
# match the plotted column (it previously read "Happiness Score").

folium.Choropleth(
    geo_data = country_geo, 
    data = df_2018,
    columns = ['Country', 'GDP'],
    key_on = 'feature.properties.ADMIN',
    fill_color = 'YlOrBr',
    fill_opacity = 0.6,
    line_opacity = 0.1,
    legend_name = "GDP",
    highlight = True,
    nan_fill_color = 'gray',
    reset = True
).add_to(map9)

folium.LayerControl().add_to(map9)

map9

#### In 2018, the US, Saudi Arabia, and Norway went up in GDP.

In [None]:
map9.save('plot_data9.html')

In [None]:
# set up a folium map at a high-level zoom
# NOTE(review): latitude 100 is outside the valid -90..90 range — confirm the intended centre

map10 = folium.Map(location=[100, 0], zoom_start=1.5)

# 2019 GDP layer: GeoJSON features are joined to df_2019 through their ADMIN
# property and coloured by the GDP column. The legend is labelled "GDP" to
# match the plotted column (it previously read "Happiness Score").

folium.Choropleth(
    geo_data = country_geo, 
    data = df_2019,
    columns = ['Country', 'GDP'],
    key_on = 'feature.properties.ADMIN',
    fill_color = 'YlOrBr',
    fill_opacity = 0.6,
    line_opacity = 0.1,
    legend_name = "GDP",
    highlight = True,
    nan_fill_color = 'gray',
    reset = True
).add_to(map10)

folium.LayerControl().add_to(map10)

map10

#### In 2019, the US and Norway go up in GDP.

In [None]:
map10.save('plot_data10.html')

# 06. Results

#### Does the analysis answer any of your existing research questions?
    How does the distribution of Happiness Score vary across different continents or regions? Countries with the highest happiness score were usually Canada, the US, Australia, New Zealand, Norway, Sweden, Finland, Germany, the United Kingdom, Ireland, and the Netherlands. The countries with the lowest scores are in parts of Africa and South Asia. Over the years, until 2019, the US dropped in happiness score.
    
    How does the distribution of GDP vary across different continents or regions? From 2015-2017, we see the country with the highest GDP was Norway followed by the US, Canada, Australia, and some of Europe and Asia. In 2018-2019, the US and Norway are highest in GDP.
    
    How does the distribution of social support vary across different continents or regions? From 2015-2019, countries with the highest support are Canada, the US, South America, Australia, Europe and Asia.
    
    How does the distribution of high life expectancy (health) vary across different continents or regions? In 2015, countries with high life expectancy are Canada, the US, Australia, and some of Europe. In 2019, the US life expectancy drops.

#### Does the analysis lead you to any new research questions?
    How do GDP, Support, and life expectancy (Health) vary over the years for these countries?