# Inferential Stats Exploration

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bokeh.plotting as bkp
from mpl_toolkits.axes_grid1 import make_axes_locatable
import scipy
from scipy import stats
import statsmodels
import statsmodels.stats.weightstats
import seaborn as sns

%pylab inline

#wrangled data imported into a pandas dataframe
df = pd.read_csv('~/Documents/Repository/Capstone-1_WorldBank_GenderData/wrangled_data.csv')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [54]:
#test to verify that the distribution of Life Expectancy is normal
life = df['Life expectancy at birth, total (years)'].dropna()
scipy.stats.normaltest(life, axis=0)

NormaltestResult(statistic=294.1448406922247, pvalue=1.3404771135878002e-64)

In [55]:
df['Life expectancy at birth, total (years)'].isnull().sum()

246

In [56]:
#test to verify that the distribution of Contraceptive prevalence is normal
scipy.stats.normaltest(df['Contraceptive prevalence, any methods (% of women ages 15-49)'], axis=0, nan_policy='omit')

NormaltestResult(statistic=264.95699668268827, pvalue=2.9195720645692804e-58)

In [57]:
df['Contraceptive prevalence, any methods (% of women ages 15-49)'].isnull().sum()

3306

In [61]:
corr = pd.DataFrame()
corr['country'] = df['Country.Name']
corr['life'] = df['Life expectancy at birth, total (years)']
corr['bc'] = df['Contraceptive prevalence, any methods (% of women ages 15-49)']
corr['teen'] = df['Adolescent fertility rate (births per 1,000 women ages 15-19)']
corr['matdeath'] = df['Maternal mortality ratio (modeled estimate, per 100,000 live births)']
corr['eduspend'] = df['Public spending on education, total (% of GDP)']

In [62]:
#clean = corr.dropna(subset= ['life','bc'])
clnlife = corr[np.isfinite(corr['life'])]

In [60]:
scipy.stats.pearsonr(clean.bc,clean.life)

AttributeError: 'Series' object has no attribute 'bc'

In [None]:
# visualize the relationship between independent variables and Life Expectancy using scatterplots
fig, axs = plt.subplots(1, 4, sharey=True)
df.plot(kind='scatter', x='Contraceptive prevalence, any methods (% of women ages 15-49)', y='Life expectancy at birth, total (years)', ax=axs[0], figsize=(20, 8))
df.plot(kind='scatter', x='Adolescent fertility rate (births per 1,000 women ages 15-19)', y='Life expectancy at birth, total (years)', ax=axs[1])
df.plot(kind='scatter', x='Maternal mortality ratio (modeled estimate, per 100,000 live births)', y='Life expectancy at birth, total (years)', ax=axs[2])
df.plot(kind='scatter', x='Public spending on education, total (% of GDP)', y='Life expectancy at birth, total (years)', ax=axs[3])


In [None]:
import statsmodels.formula.api as smf

# create a fitted model in one line
contra = smf.ols(formula='life ~ bc', data=corr).fit()
# print the coefficients
contra.params

In [None]:
adole = smf.ols(formula='life ~ teen', data=corr).fit()
adole.params

In [None]:
momdead = smf.ols(formula='life ~ matdeath', data=corr).fit()
momdead.params

In [None]:
edu = smf.ols(formula='life ~ eduspend', data=corr).fit()
edu.params

In [None]:
mod = smf.ols(formula='life ~ bc + teen + matdeath + eduspend', data=corr)
res = mod.fit()
res.summary()

In [None]:
#maternal death vs literacy rate
death, axis = plt.subplots(1, 4, sharey=True)
df.plot(kind='scatter', x='Literacy rate, adult female (% of females ages 15 and above)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax=axis[0], figsize=(20, 8))
df.plot(kind='scatter', x='Literacy rate, adult male (% of males ages 15 and above)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax=axis[1])
df.plot(kind='scatter', x='Literacy rate, youth female (% of females ages 15-24)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax =axis[2])
df.plot(kind='scatter', x='Literacy rate, youth male (% of males ages 15-24)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax=axis[3])
axis[0].set_xlim([0, 100])
axis[0].set_ylim([0, 1000])
axis[1].set_xlim([0, 100])
axis[1].set_ylim([0, 1000])
axis[2].set_xlim([0, 100])
axis[2].set_ylim([0, 1000])
axis[3].set_xlim([0, 100])
axis[3].set_ylim([0, 1000])

In [None]:
corr['litfemale'] = df['Literacy rate, adult female (% of females ages 15 and above)']
corr['litmale'] = df['Literacy rate, adult male (% of males ages 15 and above)']
corr['youthfemale'] = df['Literacy rate, youth female (% of females ages 15-24)']
corr['youthmale'] = df['Literacy rate, youth male (% of males ages 15-24)']

femlit = smf.ols(formula='litfemale ~ matdeath', data=corr).fit()
femlit.params

In [None]:
mlit = smf.ols(formula='litmale ~ matdeath', data=corr).fit()
mlit.params

In [None]:
yfemlit = smf.ols(formula='youthfemale ~ matdeath', data=corr).fit()
yfemlit.params

In [None]:
ymlit = smf.ols(formula='youthmale ~ matdeath', data=corr).fit()
ymlit.params

In [None]:
# Scatter plots of Adolescent Fertility vs Contraceptive use and Public Education Spending
fig, axs = plt.subplots(1, 2, sharey=True)
df.plot(kind='scatter', x='Contraceptive prevalence, any methods (% of women ages 15-49)', y='Adolescent fertility rate (births per 1,000 women ages 15-19)', ax=axs[0], figsize=(20, 8))
df.plot(kind='scatter', x='Public spending on education, total (% of GDP)', y='Adolescent fertility rate (births per 1,000 women ages 15-19)', ax=axs[1])

In [None]:
#Adolescent Fertility dependence on Contraceptive Prevalence
teenbc = smf.ols(formula='teen ~ bc', data=corr).fit()
teenbc.params

In [None]:
#Adolescent Fertility dependence on Public Education Spending
teenedu = smf.ols(formula='teen ~ eduspend', data=corr).fit()
teenedu.params

In [None]:
teenfert = smf.ols(formula='teen ~ bc + eduspend', data=corr)
line = teenfert.fit()
line.summary()

In [None]:
# visualize the relationship between Maternal Mortality vs Public Education Spending, Contraceptive use, and Adolescent Fertility using scatterplots
fig, axs = plt.subplots(1, 3, sharey=True)
df.plot(kind='scatter', x='Public spending on education, total (% of GDP)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax=axs[0], figsize=(20, 8))
df.plot(kind='scatter', x='Contraceptive prevalence, any methods (% of women ages 15-49)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax=axs[1])
df.plot(kind='scatter', x='Adolescent fertility rate (births per 1,000 women ages 15-19)', y='Maternal mortality ratio (modeled estimate, per 100,000 live births)', ax=axs[2])

In [None]:
# visualize the relationship between  using scatterplots
Ed_BC = df.plot(kind='scatter', x='Public spending on education, total (% of GDP)', y='Contraceptive prevalence, any methods (% of women ages 15-49)', ax=axs[2])
Ed_BC

In [None]:
'''corr = pd.DataFrame()
corr['country'] = df['Country.Name']
corr['life'] = df['Life expectancy at birth, total (years)']
corr['bc'] = df['Contraceptive prevalence, any methods (% of women ages 15-49)']
corr['teen'] = df['Adolescent fertility rate (births per 1,000 women ages 15-19)']
corr['matdeath'] = df['Maternal mortality ratio (modeled estimate, per 100,000 live births)']
corr['eduspend'] = df['Public spending on education, total (% of GDP)']
'''
