# Analyze World Population Data

## Analysis

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import statistics as st
import matplotlib.pyplot as plt
import scipy

# Load the dataset directly from Google Drive; the first CSV column is used as the index.
# Viewer link: https://drive.google.com/file/d/181fFa4h4EigLpMlyu3DXaptm41tXVrNS/view
df = pd.read_csv(
    filepath_or_buffer="https://drive.google.com/uc?id=181fFa4h4EigLpMlyu3DXaptm41tXVrNS",
    index_col=0,
)
# Cell output: (number of rows, number of columns).
df.shape

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows with the columns in reverse order (last column first).
df.iloc[:, ::-1].head()

In [17]:
# Convert 'education_expenditure_per_inhabitant' from text with thousands
# separators (e.g. "1,234") to nullable integers.
# The conversion is skipped when the column is already numeric so the cell can be
# re-run safely: the `.str` accessor raises on a non-string column.
edu_col = 'education_expenditure_per_inhabitant'
if not pd.api.types.is_numeric_dtype(df[edu_col]):
    # regex=False: the comma is a literal character, not a pattern.
    df[edu_col] = df[edu_col].str.replace(',', '', regex=False).astype('Int64')

In [None]:
# Print the frame summary (column dtypes and non-null counts) to check the conversion above.
df.info()

In [None]:
# Display descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Kendall rank-correlation heatmap across all numeric columns.
# numeric_only=True keeps any text column out of the computation: since pandas 2.0
# DataFrame.corr no longer drops such columns silently and raises instead.
# (matplotlib.pyplot is already imported as plt in the first cell.)
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(
    df.corr(method="kendall", numeric_only=True),
    annot=True,
    fmt=".3f",
    ax=ax,
);

In [None]:
# Correlation of every numeric column with IQ, largest coefficient first.
# numeric_only=True avoids the error pandas >= 2.0 raises when the frame
# contains a non-numeric column.
correlations1 = df.corr(numeric_only=True)['iq'].dropna()
print(correlations1.sort_values(ascending=False))

## Testing Hypotheses

In [22]:
# Columns used to study IQ against health-related measures.
columns_of_interest1 = [
    'iq',
    'health',
    'birth_rate',
    'male_life_expectancy',
    'female_life_expectancy',
    'death_rate',
    'male_height',
    'female_height',
    'male_weight',
    'female_weight',
    'male_bmi',
    'female_bmi',
]
# Sub-frame restricted to those columns.
df_iq_health = df.loc[:, columns_of_interest1]

In [None]:
# Kendall correlation heatmap for the IQ / health sub-frame, annotated to 3 decimals.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(
    data=df_iq_health.corr(method="kendall"),
    ax=ax,
    fmt=".3f",
    annot=True,
)


In [None]:
# Female life expectancy against IQ, with the fitted regression line drawn in red.
sns.lmplot(
    x='iq',
    y='female_life_expectancy',
    data=df_iq_health,
    line_kws=dict(color='red'),
)


In [None]:
# Male life expectancy against IQ, with the fitted regression line drawn in purple.
sns.lmplot(
    x='iq',
    y='male_life_expectancy',
    data=df_iq_health,
    line_kws=dict(color='purple'),
)

In [None]:
# Birth rate against IQ: a green line plot overlaid with a regression plot on the same axes.
sns.lineplot(x='iq', y='birth_rate', data=df_iq_health, color='green')
sns.regplot(x='iq', y='birth_rate', data=df_iq_health)
# Label the current axes in one call.
plt.gca().set(
    xlabel='IQ',
    ylabel='Birth Rate',
    title='Relationship between IQ and Birth Rate',
)
plt.show()

In [None]:
# Death rate against IQ: a black line plot overlaid with a regression plot on the same axes.
sns.lineplot(x='iq', y='death_rate', data=df_iq_health, color='black')
sns.regplot(x='iq', y='death_rate', data=df_iq_health)
# Label the current axes in one call.
plt.gca().set(
    xlabel='IQ',
    ylabel='Death Rate',
    title='Relationship between IQ and Death Rate',
)
plt.show()

In [None]:
# Mean IQ per band of female life expectancy.
# A bar plot needs a categorical x-axis. Life expectancy is continuous, so using it
# directly draws one bar per distinct value; group it into five equal-width bands
# first so the plot shows the "categories" the title refers to.
life_expectancy_band = pd.cut(df_iq_health['female_life_expectancy'], bins=5)
sns.barplot(x=life_expectancy_band, y=df_iq_health['iq'])
plt.xlabel('Female Life Expectancy (binned)')
plt.ylabel('IQ')
plt.title('Distribution of IQ Scores for Different Categories of Life Expectancy')
plt.show()

In [None]:
# Display the smallest value in the 'iq' column.
df['iq'].min()

In [None]:
# Line plot of IQ (y) against female life expectancy (x).
sns.lineplot(
    x='female_life_expectancy',
    y='iq',
    data=df_iq_health,
)

In [None]:
# Columns used to study IQ against quality-of-life indicators.
columns_of_interest2 = [
    'iq',
    'health',
    'rights',
    'daily_max_temp',
    'stability',
    'safety',
    'education_expenditure_per_inhabitant',
    'climate',
    'costs',
    'popularity',
]
df_iq_quality = df.loc[:, columns_of_interest2]

# Kendall correlation heatmap for that sub-frame, annotated to 3 decimals.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(
    data=df_iq_quality.corr(method="kendall"),
    ax=ax,
    fmt=".3f",
    annot=True,
)

In [None]:
# Safety against IQ: a red line plot overlaid with a regression plot on the same axes.
sns.lineplot(x='iq', y='safety', data=df_iq_quality, color='red')
sns.regplot(x='iq', y='safety', data=df_iq_quality)
# Label the current axes in one call.
plt.gca().set(
    xlabel='IQ',
    ylabel='Safety',
    title='Relationship between IQ and Safety',
)
plt.show()

In [None]:
# Stability against IQ: a pink line plot overlaid with a regression plot on the same axes.
sns.lineplot(x='iq', y='stability', data=df_iq_quality, color='pink')
sns.regplot(x='iq', y='stability', data=df_iq_quality)
# Label the current axes in one call.
plt.gca().set(
    xlabel='IQ',
    ylabel='Stability',
    title='Relationship between IQ and Stability',
)
plt.show()

In [None]:
# Rights against IQ: a yellow line plot overlaid with a regression plot on the same axes.
sns.lineplot(x='iq', y='rights', data=df_iq_quality, color='yellow')
sns.regplot(x='iq', y='rights', data=df_iq_quality)
# Label the current axes in one call.
plt.gca().set(
    xlabel='IQ',
    ylabel='Rights',
    title='Relationship between IQ and Rights',
)
plt.show()

In [None]:
# Correlation of every numeric column with education expenditure per inhabitant,
# largest coefficient first.
# numeric_only=True avoids the error pandas >= 2.0 raises when the frame
# contains a non-numeric column.
correlations2 = df.corr(numeric_only=True)['education_expenditure_per_inhabitant'].dropna()
print(correlations2.sort_values(ascending=False))

In [None]:
# Columns used to study education expenditure against quality-of-life indicators.
columns_of_interest3 = [
    'education_expenditure_per_inhabitant',
    'rights',
    'stability',
    'health',
    'iq',
    'daily_max_temp',
    'climate',
    'costs',
    'safety',
    'popularity',
]
df_edexp_quality = df.loc[:, columns_of_interest3]

# Annotated correlation heatmap (default method) with every tick label shown.
sns.heatmap(
    data=df_edexp_quality.corr(),
    annot=True,
    yticklabels=True,
    xticklabels=True,
);

In [None]:
# Columns used to study education expenditure against health-related measures.
# 'education_expenditure_per_inhabitant' is listed first: without it the heatmap
# only relates the health measures to each other and never shows education expenditure.
columns_of_interest4 = ['education_expenditure_per_inhabitant', 'health',
        'birth_rate', 'male_life_expectancy', 'female_life_expectancy',
        'male_height', 'female_height', 'male_weight', 'female_weight',
        'male_bmi', 'female_bmi', 'death_rate']
df_edexp_health = df[columns_of_interest4]
# Annotated correlation heatmap (default method) with every tick label shown.
sns.heatmap(df_edexp_health.corr(),
            xticklabels=True,
            yticklabels=True,
            annot=True);

In [None]:
# Correlation coefficient between health and every other numeric column,
# sorted from largest to smallest.
# numeric_only=True avoids the error pandas >= 2.0 raises when the frame
# contains a non-numeric column.
correlations3 = df.corr(numeric_only=True)['health'].dropna()
print(correlations3.sort_values(ascending=False))

In [None]:
# Rank the correlations with health by strength: take absolute values, then sort.
# numeric_only=True avoids the error pandas >= 2.0 raises when the frame
# contains a non-numeric column.
correlations3 = df.corr(numeric_only=True)['health'].dropna()
print(abs(correlations3).sort_values(ascending=False))

In [None]:
# Sub-frame holding the health score and the health-related measures.
columns_of_interest5 = [
    'health',
    'birth_rate',
    'male_life_expectancy',
    'female_life_expectancy',
    'iq',
    'male_height',
    'female_height',
    'male_weight',
    'female_weight',
    'male_bmi',
    'female_bmi',
    'death_rate',
]
df_health = df.loc[:, columns_of_interest5]

# Heatmap of Kendall's tau, a rank-based measure chosen here because it is
# less sensitive to outliers than a value-based coefficient.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(data=df_health.corr(method="kendall"), ax=ax, fmt=".3f", annot=True)
ax.set_title('Relationship between Health and Health Metrics')

In [None]:
# Same health sub-frame, this time with the default correlation method,
# drawn on a freshly created wide figure.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(
    data=df_health.corr(),
    ax=ax,
    annot=True,
    yticklabels=True,
    xticklabels=True,
);

In [None]:
# Female life expectancy against health, regression line in orange.
sns.lmplot(
    x='health',
    y='female_life_expectancy',
    data=df_health,
    line_kws=dict(color='orange'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health',
    ylabel='Female Life Expectancy',
    title='Relationship between Health and Female Life Expectancy',
)
plt.show()


In [None]:
# Male life expectancy against health, regression line in purple.
sns.lmplot(
    x='health',
    y='male_life_expectancy',
    data=df_health,
    line_kws=dict(color='purple'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health',
    ylabel='Male Life Expectancy',
    title='Relationship between Health and Male Life Expectancy',
)
plt.show()

In [None]:
# IQ against health, regression line in turquoise.
sns.lmplot(
    x='health',
    y='iq',
    data=df_health,
    line_kws=dict(color='turquoise'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health',
    ylabel='IQ',
    title='Relationship between Health and IQ',
)
plt.show()

In [None]:
# Birth rate against health, regression line in green.
sns.lmplot(
    x='health',
    y='birth_rate',
    data=df_health,
    line_kws=dict(color='green'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health',
    ylabel='Birth Rate',
    title='Relationship between Health and Birth Rate',
)
plt.show()

In [None]:
# Death rate against health, regression line in black.
sns.lmplot(
    x='health',
    y='death_rate',
    data=df_health,
    line_kws=dict(color='black'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health',
    ylabel='Death Rate',
    title='Relationship between Health and Death Rate',
)
plt.show()

In [None]:
# Female life expectancy against health: an orange line plot overlaid with a
# regression plot on the same axes.
sns.lineplot(x='health', y='female_life_expectancy', data=df_health, color='orange')
sns.regplot(x='health', y='female_life_expectancy', data=df_health)
# Label the current axes in one call.
plt.gca().set(
    xlabel='Health',
    ylabel='Female Life Expectancy',
    title='Relationship between Health and Female Life Expectancy',
)
plt.show()

In [None]:
# Line plot of IQ (y) against the health score (x).
sns.lineplot(
    x='health',
    y='iq',
    data=df_health,
)

In [None]:
# Columns used to study health against quality-of-life indicators.
columns_of_interest6 = [
    'health',
    'rights',
    'stability',
    'safety',
    'education_expenditure_per_inhabitant',
    'climate',
    'costs',
    'popularity',
]
df_health_qol = df.loc[:, columns_of_interest6]

# Annotated correlation heatmap (default method) on a 10x5 figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(
    data=df_health_qol.corr(),
    annot=True,
    yticklabels=True,
    xticklabels=True,
);

In [None]:
# Rights score against health score, regression line in blue.
sns.lmplot(
    x='health',
    y='rights',
    data=df_health_qol,
    line_kws=dict(color='blue'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health Score',
    ylabel='Rights Score',
    title='Relationship between Health and Rights',
)
plt.show()

In [None]:
# Stability score against health score, regression line in pink.
sns.lmplot(
    x='health',
    y='stability',
    data=df_health_qol,
    line_kws=dict(color='pink'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health Score',
    ylabel='Stability Score',
    title='Relationship between Health and Stability',
)
plt.show()

In [None]:
# Safety score against health score, regression line in red.
sns.lmplot(
    x='health',
    y='safety',
    data=df_health_qol,
    line_kws=dict(color='red'),
)
# Label the axes that lmplot just created (the current axes).
plt.gca().set(
    xlabel='Health Score',
    ylabel='Safety Score',
    title='Relationship between Health and Safety',
)
plt.show()

In [None]:
# Distribution plot of the 'safety' column.
sns.displot(data=df, x="safety")

In [None]:
# Display descriptive statistics for the health sub-frame.
df_health.describe()

In [None]:
# Display descriptive statistics for the 'health' column alone.
df['health'].describe()

In [None]:
# Distribution plot of the 'health' column.
sns.displot(data=df, x="health")

In [None]:
# Health scores with missing values removed, highest first.
(
    df["health"]
    .dropna()
    .sort_values(ascending=False)
)

In [None]:
# Scatter plot drawn from the full frame.
# NOTE(review): x and y are both 'health', so every point lies on the diagonal.
# Confirm whether a different y column was intended.
scatterplot1 = sns.scatterplot(
    data=df,
    x='health',
    y='health',
)

In [None]:
# Correlation coefficient between female BMI and every other numeric column.
# numeric_only=True avoids the error pandas >= 2.0 raises when the frame
# contains a non-numeric column.
correlations4 = df.corr(numeric_only=True)['female_bmi'].dropna()
# Signed coefficients, largest first.
print(correlations4.sort_values(ascending=False))
# Strength ranking: take absolute values BEFORE sorting. Applying abs() to an
# already-sorted series leaves strong negative correlations at the bottom.
print(correlations4.abs().sort_values(ascending=False))