# Inferential Stats Exploration

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing as pp
from sklearn import linear_model as lm
%pylab inline

#wrangled data imported into a pandas dataframe
df = pd.read_csv('~/Documents/Repository/Capstone-1_WorldBank_GenderData/wrangled_data.csv')

In [5]:
#create a new df with fewer variables to build models with
focus = pd.DataFrame()
focus['life'] = df['Life expectancy at birth, total (years)']
focus['bc'] = df['Contraceptive prevalence, any methods (% of women ages 15-49)']
focus['matdeath'] = df['Maternal mortality ratio (modeled estimate, per 100,000 live births)']
focus['teen'] = df['Adolescent fertility rate (births per 1,000 women ages 15-19)']
focus['gdp'] = df['GDP per capita (Current US$)']
focus['healthspend'] = df['Health expenditure, public (% of GDP)']
focus['eduspend'] = df['Public spending on education, total (% of GDP)']

focus.to_csv('finalvars.csv')

In [6]:
#visualized 'heatmap' of correlations between the variables in the focus df
corrmat= focus.corr()
sns.heatmap(corrmat, vmax=.8, square=True)

<matplotlib.axes._subplots.AxesSubplot at 0xa9403e6c>

In [7]:
#find which variable(s) has the most missing value 
#focus['abuse'] was determined to not have enough date points to be useful
focus.isnull().sum().sort_values()

life            246
gdp             301
teen            345
healthspend     422
matdeath        510
eduspend       1845
bc             3306
dtype: int64

In [8]:
#To find which variables will result in the highest number of instances when paired with Contraceptive use, do a for loop comparing each variable, one at a time to BC, doing dropna and see what df has the most rows.
dv = ['life', 'matdeath', 'teen', 'gdp', 'healthspend', 'eduspend']
for var in dv:
    compare = pd.DataFrame()
    compare['bc'] = focus['bc']
    compare[var]= focus[var]
    compare = compare.dropna()
    print compare.count()

bc      629
life    629
dtype: int64
bc          624
matdeath    624
dtype: int64
bc      628
teen    628
dtype: int64
bc     623
gdp    623
dtype: int64
bc             622
healthspend    622
dtype: int64
bc          373
eduspend    373
dtype: int64


In [9]:
#change scale of data to range of 0 to 1
focdrop= focus.copy().dropna()
scaler = pp.MinMaxScaler()
scale = pd.DataFrame(scaler.fit_transform(focdrop), columns= focdrop.columns, index = focdrop.index)
#Set up IV matrix and DV array to test model accuracy
dv = ['bc', 'life', 'matdeath', 'teen']
for var in dv:
    print(var)
    y = pd.DataFrame()
    y = scale[var]
    X = scale.copy()
    del X[var]
    print(X.head())

sgdr = lm.SGDRegressor(n_iter=1000)  # or any sklearn regressor that you like
sgdr.fit(X,y)
sgdr.score(X, y)

bc
        life  matdeath      teen       gdp  healthspend  eduspend
10  0.467943  0.219494  0.450212  0.004822     0.214419  0.276804
11  0.476183  0.201360  0.422924  0.005378     0.142159  0.241864
15  0.820981  0.015111  0.079746  0.011247     0.158779  0.175740
17  0.841356  0.012845  0.079571  0.014194     0.158584  0.156729
20  0.862745  0.010200  0.077814  0.027509     0.200539  0.171761
life
          bc  matdeath      teen       gdp  healthspend  eduspend
10  0.190840  0.219494  0.450212  0.004822     0.214419  0.276804
11  0.184297  0.201360  0.422924  0.005378     0.142159  0.241864
15  0.580153  0.015111  0.079746  0.011247     0.158779  0.175740
17  0.772083  0.012845  0.079571  0.014194     0.158584  0.156729
20  0.608506  0.010200  0.077814  0.027509     0.200539  0.171761
matdeath
        life        bc      teen       gdp  healthspend  eduspend
10  0.467943  0.190840  0.450212  0.004822     0.214419  0.276804
11  0.476183  0.184297  0.422924  0.005378     0.142159  0.

0.61232583963898635

In [10]:
sgdr.coef_

array([-0.29917306, -0.09514784,  0.50924228, -0.17920114,  0.08222745,
       -0.13432083])

In [11]:
X.shape

(368, 6)

In [12]:
#Create small data frame with only Contraceptive Use ('bc') and Life Expectancy ('life'), drop Nan values and run an SGD Regressor model on the two variables
bclife = pd.DataFrame()
bclife['bc'] = focus['bc']
bclife['life'] = focus['life']
bclife = bclife.dropna()
blmodel = lm.SGDRegressor(n_iter=100000, eta0=.001)
blmodel.fit(bclife[['bc']], bclife.life)
blmodel.score(bclife[['bc']], bclife.life)

0.55825459297696911

### There is a correlation coefficient of 0.558 between Contraceptive Use and Life Expectancy.