In [4]:
import pandas as pd
import matplotlib.pyplot as plt
# import geopandas

# for extracting the population data
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize

# to read in shape file and provides high
# level interface with
# matplotlib library for making maps
import geopandas as gpd

# Load the raw tuberculosis table.
# NOTE(review): this is an absolute, machine-specific path -- adjust it (or
# move it to a configuration cell) before running the notebook elsewhere.
df = pd.read_csv('/home/fede/Documents/Learn/Datacamp/'
                 'Python/Cleaning_Data/tb.csv')

# By default pandas parses the literal string "NA" as a missing value, but
# "NA" is also the ISO alpha-2 code for Namibia. Restore it so that country is
# not silently discarded by the later dropna/merge steps.
# NOTE(review): assumes the only null 'country' values come from that code --
# confirm against the raw file.
df['country'] = df['country'].fillna('NA')
df.head()

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the numeric columns.
df.describe()

Using describe, we see that our dataset has only one year, 2000, and each row represents a different country.

In [None]:
# List the column labels of the table as loaded.
df.columns

In [None]:
# (number of rows, number of columns) of the table as loaded.
df.shape

Using shape, we see that we have 201 different countries represented in our dataset.

## Preparing Data for Analysis: Reshaping Data

This data is untidy and needs to be converted to be suitable for analysis. According to [Wickham 2014](https://vita.had.co.nz/papers/tidy-data.pdf), the standard way to organize values within a dataset is as follows:

* Rows represent individual observations.
* Columns represent separate variables.
* Each type of observational unit forms a table.

In this dataset, **columns** do not represent separate variables. 

And each **row** contains more than one observation. In the original dataset, each row represents a country-year combination.

To fix this problem, I will use the pandas function **[melt](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html)**.

In our dataset, the identifier columns are country and year. The rest of the columns hold values for each country-year combination and will be melted down.

In [None]:
# Reshape from wide to long: 'country' and 'year' stay as identifiers, every
# other column is unpivoted into a ('variable', 'cases') pair.
id_columns = ['country', 'year']
df = pd.melt(df, id_vars=id_columns, value_name='cases')
df.head()

Now, each row represents the number of cases for a single combination of country, year, and original column (stored in 'variable').

In [None]:
# (rows, columns) after melting to long format.
df.shape

## Extract the sex and age group from the variable column

Each value of the 'variable' column contains the sex and the age group. This shape will not allow us to fit a model using sex and age as independent predictors.

The sex is represented by the first character of each value. I'll create a new column called 'sex' that will be the first character of the 'variable' column. For this, I will use the 'str' attribute, to treat the column as a string, and afterwards I will slice the string to extract the first character.

In [None]:
# The first character of each 'variable' value becomes the 'sex' column.
df['sex'] = df['variable'].str.slice(stop=1)
df.head()

Regarding the age groups, I'll use the same procedure to extract the rest of the characters from the 'variable' column.

In [None]:
# Everything after the first character of 'variable' becomes 'age_group'.
df['age_group'] = df['variable'].str.slice(start=1)
df.head()

Now, I just drop the 'variable' column, since I've extracted all the information from it into two different columns: 'sex' and 'age_group'.

In [None]:
# 'variable' has been split into 'sex' and 'age_group', so it is redundant.
# Rebind instead of mutating in place so the step reads as a plain transform.
df = df.drop(columns=['variable'])

In [None]:
# Preview the first ten rows.
df.head(10)

There are NaN values in cases, let's remove them using pandas dropna.

In [None]:
# (rows, columns) before removing missing values, for comparison.
df.shape

In [None]:
# Remove every row that contains a missing value in any column, then show the
# new (rows, columns). Rebinding avoids the in-place mutation idiom.
df = df.dropna()
df.shape

In [None]:
# Preview the first ten remaining rows.
df.head(10)

In [None]:
# Inspect the Python type of the first remaining 'cases' value. Use positional
# access: after dropna the index label 0 may no longer exist, so the
# label-based df['cases'][0] can raise KeyError.
type(df['cases'].iloc[0])

The 'cases' column contains numerical data, but is represented as a string. I'll convert it to an integer using the pandas `to_numeric` function.

In [None]:
# Coerce 'cases' to a numeric dtype and then to whole numbers.
# The earlier commented-out attempt passed `int` as the second positional
# argument of pd.to_numeric, which is `errors`, not a target dtype.
# Rows with missing values were dropped above, so the integer cast is safe.
df['cases'] = pd.to_numeric(df['cases']).astype('int64')
df.head(10)

In [None]:
# Peek at the first ten country identifiers.
df['country'].head(10)

Countries are encoded using ISO alpha-2 codes. While not strictly necessary, I'll convert the codes to country names.

First, I will extract an HTML table that contains the alpha-2 codes from Wikipedia, and convert this data to a dataframe whose first columns hold the country name and its ISO codes.

In [None]:
# Scrape the ISO 3166-1 code table from Wikipedia.
# keep_default_na=False stops pandas from turning the alpha-2 code "NA"
# (Namibia) into a missing value.
# NOTE(review): the table is selected by position ([1]); this breaks if the
# page layout changes -- confirm the right table is picked.
countries = pd.read_html('https://en.wikipedia.org/wiki/ISO_3166-1',
                         keep_default_na=False)[1]

In [None]:
# Preview the scraped table.
countries.head()

In [None]:
# Keep only the first three columns of the scraped table.
countries = countries.iloc[:, :3]

In [None]:
# Preview the three retained columns.
countries.head()

In [None]:
# Give the three retained columns short, explicit labels.
iso_column_names = ['country_name', 'country-code2', 'country-code3']
countries.columns = iso_column_names

Now, I will merge this dataset with the original dataset.

In [None]:
# Attach country names and ISO codes by matching the two-letter code
# (default inner join), then discard the now-duplicated 'country' column.
df = pd.merge(df, countries, left_on='country', right_on='country-code2')
df = df.drop(columns=['country'])

In [None]:
# Preview the table after adding country names and codes.
df.head()

I will extract the population data from a json available on GitHub. I convert this data to a new dataframe, containing only two columns: country code and population.

In [None]:
# Raw GitHub gist URL of the JSON file with population data; it is fetched
# and parsed in the next cell.
population_url = 'https://gist.githubusercontent.com/gwillem/6ca8a81048e6f3721c3bafc803d44a72/raw/4fb66d18178c1a0fdf101fb6b03c4d21929472da/iso2_population.json'

In [None]:
# Download the population JSON. The context manager guarantees the HTTP
# response is closed even if reading or decoding fails.
with urlopen(population_url) as response:
    population_json_data = response.read().decode('utf-8', 'replace')

# Parse the payload and build a one-column frame indexed by the JSON keys.
population_json = json.loads(population_json_data)
population_df = pd.DataFrame.from_dict(population_json, orient='index')
population_df.columns = ['pop']

In [None]:
# Preview the population table.
population_df.head()

In [None]:
# Python type of the first population value (row 0, column 0).
type(population_df.iloc[0,0])

The population is represented as a string, so I convert it to a numeric type using to_numeric.

In [None]:
# Replace the 'pop' column with its numeric equivalent.
pop_numeric = pd.to_numeric(population_df['pop'])
population_df['pop'] = pop_numeric


In [None]:
# Preview the population table after the numeric conversion.
population_df.head()

In [None]:
# Re-check the type of the first population value after the conversion.
type(population_df.iloc[0,0])

Now, I merge the population data with the original dataset, using the ISO 2-char code columns.

In [None]:
# Preview the cases table before attaching population.
df.head()

In [None]:
# Attach 'pop' by matching the two-letter code against population_df's index
# (default inner join).
df = pd.merge(df, population_df, left_on='country-code2', right_index=True)

In [None]:
# Preview the table with the population column attached.
df.head()

# Exploratory Data Analysis

First, I will find the total number of cases per country. Because the dataset has several rows for each country, each row having data for each sex and age group combination, I will use the Pandas groupby operation to just sum all the cases for each country. 

In [None]:
# country
countries = df.groupby(['country_name','country-code3']).sum()
countries.reset_index(inplace=True)
countries.head()

The number of cases per country is not meaningful if we don't take the population into account. Now, I will find the tuberculosis rate per country. 

To do this, I divide the number of cases by the population and multiply the result by one million, in order to find the rate per million people.

Afterwards, I sort the data by the rate, to find out the countries with the highest rates of tuberculosis.

In [None]:
# Cases per million inhabitants, sorted from highest to lowest rate.
per_million = 10**6
countries['rate'] = countries['cases'].div(countries['pop']).mul(per_million)
countries = countries.sort_values('rate', ascending=False)
countries.head(5)

In [None]:
# Bar chart of the ten countries with the highest rate.
countries.head(10).plot.bar(x='country_name', y='rate')

In [None]:
# Country polygons from the Natural Earth low-resolution dataset bundled with
# geopandas.
# NOTE(review): gpd.datasets is deprecated/removed in recent geopandas
# releases -- confirm the installed version still ships it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# NOTE(review): in this dataset some countries (notably France and Norway) are
# known to carry the placeholder ISO code '-99', which would drop them from
# the merge on 'iso_a3'. Patch only rows that actually have the placeholder --
# confirm against the installed data.
iso_a3_fixes = {'France': 'FRA', 'Norway': 'NOR'}
for country_name, iso_code in iso_a3_fixes.items():
    needs_fix = (world['name'] == country_name) & (world['iso_a3'] == '-99')
    world.loc[needs_fix, 'iso_a3'] = iso_code

world = world[(world.name != "Antarctica") & (world.name != "Fr. S. Antarctic Lands")]  # exclude 2 no-man lands

In [None]:
# Join the per-country rates onto the polygons via the three-letter ISO code.
# Inner join: shapes without a matching rate row are left out.
merged = world.merge(
    countries,
    how='inner',
    left_on='iso_a3',
    right_on='country-code3',
)
merged.head()

In [None]:
# Choropleth of the tuberculosis rate for every country in `merged`.
# The colour scale runs from 0 to the highest rate; Series.max() skips NaN,
# unlike the built-in max().
vmin, vmax = 0, merged['rate'].max()
fig, ax = plt.subplots(1, figsize=(18, 8))
ax.set_title('Tuberculosis Rate - World')
ax.axis('off')
ax.annotate('Number of Cases per Million\nSource: World Health Organisation',
            xy=(0.1, .08),
            xycoords='figure fraction',
            horizontalalignment='left',
            verticalalignment='top',
            fontsize=12, color='#555555')

# Stand-alone mappable used only to draw the colorbar legend.
sm = plt.cm.ScalarMappable(cmap='binary', norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])  # required by older Matplotlib releases
# Recent Matplotlib needs to be told which Axes the colorbar belongs to.
cbar = fig.colorbar(sm, ax=ax)

# Pass vmin/vmax so the map colours use the same scale as the colorbar.
merged.plot(column='rate', cmap='binary', vmin=vmin, vmax=vmax,
            linewidth=0.8, ax=ax, edgecolor='0.8')

In [None]:
# Choropleth of the tuberculosis rate restricted to Africa.
africa = merged[merged['continent'] == "Africa"] # only africa
# Colour scale from 0 to Africa's highest rate; Series.max() skips NaN.
vmin, vmax = 0, africa['rate'].max()
fig, ax = plt.subplots(1, figsize=(18, 8))
ax.set_title('Tuberculosis Rate - Africa')
ax.axis('off')
ax.annotate('Number of Cases per Million\nSource: World Health Organisation',
            xy=(0.1, .08),
            xycoords='figure fraction',
            horizontalalignment='left',
            verticalalignment='top',
            fontsize=12,color='#555555')

# Stand-alone mappable used only to draw the colorbar legend.
sm = plt.cm.ScalarMappable(cmap='binary', norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])  # required by older Matplotlib releases
# Recent Matplotlib needs to be told which Axes the colorbar belongs to.
cbar = fig.colorbar(sm, ax=ax)

# Pass vmin/vmax so the map colours use the same scale as the colorbar.
africa.plot(column='rate', cmap='binary', vmin=vmin, vmax=vmax,
            linewidth=0.8, ax=ax, edgecolor='0.8')

In [None]:
# Choropleth of the tuberculosis rate restricted to North and South America.
americas = merged[(merged['continent'] == "South America") | (merged['continent'] == "North America")]
# Scale to this region's own maximum (the previous version reused Africa's
# maximum, a copy-paste slip that also made the cell depend on the Africa cell).
vmin, vmax = 0, americas['rate'].max()
fig, ax = plt.subplots(1, figsize=(18, 8))
ax.set_title('Tuberculosis Rate - Americas')
ax.axis('off')
ax.annotate('Number of Cases per Million\nSource: World Health Organisation',
            xy=(0.1, .08),
            xycoords='figure fraction',
            horizontalalignment='left',
            verticalalignment='top',
            fontsize=12,color='#555555')

# Stand-alone mappable used only to draw the colorbar legend.
sm = plt.cm.ScalarMappable(cmap='binary', norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])  # required by older Matplotlib releases
# Recent Matplotlib needs to be told which Axes the colorbar belongs to.
cbar = fig.colorbar(sm, ax=ax)

# Pass vmin/vmax so the map colours use the same scale as the colorbar.
americas.plot(column='rate', cmap='binary', vmin=vmin, vmax=vmax,
              linewidth=0.8, ax=ax, edgecolor='0.8')

In [None]:
# Choropleth of the tuberculosis rate restricted to Asia.
asia = merged[merged['continent'] == "Asia"]
# Scale to this region's own maximum (the previous version reused Africa's
# maximum, a copy-paste slip that also made the cell depend on the Africa cell).
vmin, vmax = 0, asia['rate'].max()
fig, ax = plt.subplots(1, figsize=(18, 8))
ax.set_title('Tuberculosis Rate - Asia')
ax.axis('off')
ax.annotate('Number of Cases per Million\nSource: World Health Organisation',
            xy=(0.1, .08),
            xycoords='figure fraction',
            horizontalalignment='left',
            verticalalignment='top',
            fontsize=12,color='#555555')

# Stand-alone mappable used only to draw the colorbar legend.
sm = plt.cm.ScalarMappable(cmap='binary', norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])  # required by older Matplotlib releases
# Recent Matplotlib needs to be told which Axes the colorbar belongs to.
cbar = fig.colorbar(sm, ax=ax)

# Pass vmin/vmax so the map colours use the same scale as the colorbar.
asia.plot(column='rate', cmap='binary', vmin=vmin, vmax=vmax,
          linewidth=0.8, ax=ax, edgecolor='0.8')