# Variable Correlations
### Assess correlation coefficients between variables of interest in order to look at relationships and to identify multicollinearity

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import scipy
df = pd.read_csv('~/Documents/Repository/Capstone-1_WorldBank_GenderData/wrangled_data.csv')

In [2]:
#create a new df with smaller number of variables to focus on and run correlations on
focus = pd.DataFrame()
#focus['country'] = df['Country.Name']
focus['life'] = df['Life expectancy at birth, total (years)']
focus['bc'] = df['Contraceptive prevalence, any methods (% of women ages 15-49)']
focus['matdeath'] = df['Maternal mortality ratio (modeled estimate, per 100,000 live births)']
focus['teen'] = df['Adolescent fertility rate (births per 1,000 women ages 15-19)']

focus['gdp'] = df['GDP per capita (Current US$)']
focus['healthspend'] = df['Health expenditure, public (% of GDP)']
focus['wifehealth'] = df['Decision maker about a woman\'s own health care: mainly wife (% of women age 15-49)']
focus['hushealth'] = df['Decision maker about a woman\'s own health care: mainly husband (% of women age 15-49)']
focus['bothhealth'] = df['Decision maker about a woman\'s own health care: wife and husband jointly (% of women age 15-49)']

focus['eduspend'] = df['Public spending on education, total (% of GDP)']
focus['noschool'] = df['Educational attainment, no schooling, population 25+ years, total (%)']
focus['malenoschool'] = df['Educational attainment, no schooling, population 25+ years, male (%)']
focus['malesomeschool'] = df['Educational attainment, at least competed lower secondary, population 25+, male (%) (cumulative)']
focus['malelit'] = df['Literacy rate, adult male (% of males ages 15 and above)']
focus['femalelit'] = df['Literacy rate, adult female (% of females ages 15 and above)']
focus['lit'] = df['Literacy rate, youth (ages 15-24), gender parity index (GPI)']
focus['bach'] = df['Educational attainment, completed Bachelor\'s or equivalent, population 25+ years, total (%)']
focus['fembach'] = df['Educational attainment, completed Bachelor\'s or equivalent, population 25+ years, female (%)']

focus['abuse'] = df['Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of women age 15-49)']
focus['legis'] = df['Female legislators, senior officials and managers (% of total)']

#focus[''] = df['']

focus.to_csv('focus.csv')

# None of the variables are normally distributed:

In [4]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['wifehealth'].dropna(), axis=0)

NormaltestResult(statistic=11.967032622933143, pvalue=0.0025199497668011049)

In [5]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['hushealth'].dropna(), axis=0)

NormaltestResult(statistic=12.613914561600758, pvalue=0.0018235734281981482)

In [6]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['bothhealth'].dropna(), axis=0)

NormaltestResult(statistic=8.4459857845394009, pvalue=0.014654718810611766)

In [7]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['eduspend'].dropna(), axis=0)

NormaltestResult(statistic=593.00099930141528, pvalue=1.7039981547686065e-129)

In [8]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['noschool'].dropna(), axis=0)

NormaltestResult(statistic=352.6668495824452, pvalue=2.6264348843750046e-77)

In [9]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['malenoschool'].dropna(), axis=0)

NormaltestResult(statistic=408.35078197745082, pvalue=2.1269356624700409e-89)

In [10]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['malesomeschool'].dropna(), axis=0)

NormaltestResult(statistic=57.465734343876555, pvalue=3.3225715188416346e-13)

In [11]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['malelit'].dropna(), axis=0)

NormaltestResult(statistic=171.03370355647277, pvalue=7.2527576026695477e-38)

In [12]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['femalelit'].dropna(), axis=0)

NormaltestResult(statistic=88.617408087560293, pvalue=5.7144486368490824e-20)

In [13]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['bach'].dropna(), axis=0)

NormaltestResult(statistic=31.431876974833109, pvalue=1.4950462809815258e-07)

In [14]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['fembach'].dropna(), axis=0)

NormaltestResult(statistic=37.365061806516977, pvalue=7.6962633641236659e-09)

In [15]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['abuse'].dropna(), axis=0)

NormaltestResult(statistic=14.264747119661239, pvalue=0.0007988210858759818)

In [16]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['legis'].dropna(), axis=0)

NormaltestResult(statistic=23.261710691847409, pvalue=8.8875836931924428e-06)

In [17]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['gdp'].dropna(), axis=0)

NormaltestResult(statistic=2460.338407499014, pvalue=0.0)

In [18]:
#test to verify that the variable distribution is normal
scipy.stats.normaltest(focus['healthspend'].dropna(), axis=0)

NormaltestResult(statistic=1441.4773089236526, pvalue=0.0)

## All tests and models need to be non-parametric, because all variables are non-normal

### The Spearmen's R test was chosen to calculate correlation coefficients :

In [3]:
bcmat = stats.spearmanr(focus['matdeath'], focus['bc'], nan_policy='omit') 
bcmat

SpearmanrResult(correlation=-0.7246482405235215, pvalue=masked_array(data = 1.15638871362e-102,
             mask = False,
       fill_value = 1e+20)
)

In [5]:
bcteen = stats.spearmanr(focus['teen'], focus['bc'], nan_policy='omit') 
bcteen

SpearmanrResult(correlation=-0.56159413145076997, pvalue=masked_array(data = 1.76484058854e-53,
             mask = False,
       fill_value = 1e+20)
)

In [6]:
bclife = stats.spearmanr(focus['life'], focus['bc'], nan_policy='omit') 
bclife

SpearmanrResult(correlation=0.77216872977457462, pvalue=masked_array(data = 1.35854238789e-125,
             mask = False,
       fill_value = 1e+20)
)

In [6]:
teenmat = stats.spearmanr(focus['matdeath'], focus['teen'], nan_policy='omit') 
teenmat

SpearmanrResult(correlation=-2.6860147068888858, pvalue=0.0)

### A correlation of -2.686 doesn't seem legitimate, but it's unclear what's happening here.

In [3]:
eduteen = stats.spearmanr(focus['teen'], focus['eduspend'], nan_policy='omit') 
eduteen

  denom = n*(n**2 - 1)/6.


SpearmanrResult(correlation=-297.18655014305057, pvalue=0.0)

In [10]:
gdpbc = stats.spearmanr(focus['bc'], focus['gdp'], nan_policy='omit')
gdpbc

SpearmanrResult(correlation=0.6100584199391057, pvalue=masked_array(data = 8.87905470481e-65,
             mask = False,
       fill_value = 1e+20)
)

In [33]:
# Wife as the decision maker of her healthcare vs husband as the decision maker
wifehushealth = stats.spearmanr(focus['wifehealth'], focus['hushealth'], nan_policy='omit')
wifehushealth

SpearmanrResult(correlation=-0.75600542262969594, pvalue=masked_array(data = 3.92827944538e-25,
             mask = False,
       fill_value = 1e+20)
)

In [34]:
# Wife as the decision maker of her healthcare vs both wife and husband as the decision maker
wifebothhealth = stats.spearmanr(focus['wifehealth'], focus['bothhealth'], nan_policy='omit')
wifebothhealth

SpearmanrResult(correlation=0.020224452146690552, pvalue=masked_array(data = 0.820040357358,
             mask = False,
       fill_value = 1e+20)
)

In [32]:
# Wife as the decision maker of her healthcare vs both wife and husband as the decision maker
husbothhealth = stats.spearmanr(focus['hushealth'], focus['bothhealth'], nan_policy='omit')
print(husbothhealth)

SpearmanrResult(correlation=-0.53140652951699474, pvalue=masked_array(data = 9.21271543315e-11,
             mask = False,
       fill_value = 1e+20)
)


In [35]:
# Husband as Decision Maker vs Contraceptive use
bchushealth = stats.spearmanr(focus['hushealth'], focus['bc'], nan_policy='omit')
bchushealth

SpearmanrResult(correlation=-0.68294666815742389, pvalue=masked_array(data = 4.87972567157e-19,
             mask = False,
       fill_value = 1e+20)
)

In [36]:
# Wife as Decision Maker vs Contraceptive use
bcwifehealth = stats.spearmanr(focus['wifehealth'], focus['bc'], nan_policy='omit')
bcwifehealth

SpearmanrResult(correlation=0.61782340116279078, pvalue=masked_array(data = 6.22185455129e-15,
             mask = False,
       fill_value = 1e+20)
)

In [37]:
# Husband as decision maker vs Maternal Death Rate
husmatdeath = stats.spearmanr(focus['hushealth'], focus['matdeath'], nan_policy='omit')
husmatdeath

SpearmanrResult(correlation=0.65024178220035778, pvalue=masked_array(data = 7.50172261073e-17,
             mask = False,
       fill_value = 1e+20)
)

In [38]:
# Wife as decision maker vs Maternal Death Rate
wifematdeath = stats.spearmanr(focus['wifehealth'], focus['matdeath'], nan_policy='omit')
wifematdeath 

SpearmanrResult(correlation=-0.47298747763864046, pvalue=masked_array(data = 1.51219600627e-08,
             mask = False,
       fill_value = 1e+20)
)

In [16]:
healbc = stats.spearmanr(focus['healthspend'], focus['bc'], nan_policy='omit')
healbc

SpearmanrResult(correlation=0.34695444079893445, pvalue=masked_array(data = 4.92284185255e-19,
             mask = False,
       fill_value = 1e+20)
)

# Education variables

In [13]:
fbachbc = stats.spearmanr(focus['fembach'], focus['bc'], nan_policy='omit')
fbachbc

SpearmanrResult(correlation=0.46737868872700339, pvalue=masked_array(data = 3.41370170327e-06,
             mask = False,
       fill_value = 1e+20)
)

In [14]:
fbachgdp = stats.spearmanr(focus['fembach'], focus['gdp'], nan_policy='omit')
fbachgdp

SpearmanrResult(correlation=0.66777068891136837, pvalue=masked_array(data = 7.23523392628e-80,
             mask = False,
       fill_value = 1e+20)
)

In [8]:
edubc = stats.spearmanr(focus['eduspend'], focus['bc'], nan_policy='omit')
edubc

SpearmanrResult(correlation=0.29082581079887393, pvalue=masked_array(data = 1.05230277079e-08,
             mask = False,
       fill_value = 1e+20)
)

In [39]:
#Education Spending vs Rates of No Schooling
edunoschool = stats.spearmanr(focus['eduspend'], focus['noschool'], nan_policy='omit')
edunoschool

SpearmanrResult(correlation=-0.24403943020244134, pvalue=masked_array(data = 4.23971463731e-06,
             mask = False,
       fill_value = 1e+20)
)

^ Small neg correlation between spending and rate of no education

In [40]:
#No School vs Males with No school
no = stats.spearmanr(focus['malenoschool'], focus['noschool'], nan_policy='omit')
no

SpearmanrResult(correlation=0.9774866688931052, pvalue=masked_array(data = 0.0,
             mask = False,
       fill_value = 1e+20)
)

### Pick either 'no school' or 'male no school' because they are collinear

In [41]:
#Males with No Schooling vs Education Spending
nomale = stats.spearmanr(focus['malenoschool'], focus['eduspend'], nan_policy='omit')
nomale

SpearmanrResult(correlation=-0.21839384421497177, pvalue=masked_array(data = 4.07502908426e-05,
             mask = False,
       fill_value = 1e+20)
)

In [42]:
noabuse = stats.spearmanr(focus['abuse'], focus['noschool'], nan_policy='omit')
noabuse 

SpearmanrResult(correlation=0.74725274725274726, pvalue=masked_array(data = 7.52021473844e-06,
             mask = False,
       fill_value = 1e+20)
)

In [43]:
noabuse = stats.spearmanr(focus['abuse'], focus['malenoschool'], nan_policy='omit')
noabuse 

SpearmanrResult(correlation=0.69719169719169716, pvalue=masked_array(data = 5.31890583684e-05,
             mask = False,
       fill_value = 1e+20)
)

### Drop 'MaleNoSchool' as a variable, because it is less correlated to abuse than overall no schooling

In [61]:
maleabuse = stats.spearmanr(focus['malesomeschool'], focus['abuse'], nan_policy='omit') 
maleabuse

SpearmanrResult(correlation=-0.62696267696267705, pvalue=masked_array(data = 4.28485818031e-05,
             mask = False,
       fill_value = 1e+20)
)

### And Male Some Schooling also seems redundant

In [44]:
lit = stats.spearmanr(focus['femalelit'], focus['malelit'], nan_policy='omit') 
lit

SpearmanrResult(correlation=0.9493587433858427, pvalue=masked_array(data = 2.83407233543e-288,
             mask = False,
       fill_value = 1e+20)
)

### Female and male Lit are highly correlated, so maybe just look at overall lit?

In [45]:
litbc = stats.spearmanr(focus['lit'], focus['bc'], nan_policy='omit') 
litbc

SpearmanrResult(correlation=0.63988673063639934, pvalue=masked_array(data = 3.84576173811e-17,
             mask = False,
       fill_value = 1e+20)
)

### Fairly high correlation between Literacy and Contracptive Use

In [11]:
mlitbc = stats.spearmanr(focus['malelit'], focus['bc'], nan_policy='omit') 
mlitbc

SpearmanrResult(correlation=0.71221989663951901, pvalue=masked_array(data = 2.65232522327e-27,
             mask = False,
       fill_value = 1e+20)
)

In [12]:
flitbc = stats.spearmanr(focus['femalelit'], focus['bc'], nan_policy='omit') 
flitbc

SpearmanrResult(correlation=0.71284248915524828, pvalue=masked_array(data = 2.28187680063e-27,
             mask = False,
       fill_value = 1e+20)
)

In [46]:
litmat = stats.spearmanr(focus['lit'], focus['matdeath'], nan_policy='omit') 
litmat

SpearmanrResult(correlation=-0.48183953559979376, pvalue=masked_array(data = 3.25395966694e-29,
             mask = False,
       fill_value = 1e+20)
)

### Mild negative correlation between Literacy and maternal death

In [47]:
bd = stats.spearmanr(focus['bach'], focus['fembach'], nan_policy='omit') 
bd 

SpearmanrResult(correlation=0.97338058021505169, pvalue=masked_array(data = 0.0,
             mask = False,
       fill_value = 1e+20)
)

### General Bachelor's degree achievement and Female Bachelor's degree achievement are collinear, so one must be dropped.

In [48]:
bbc = stats.spearmanr(focus['bach'], focus['bc'], nan_policy='omit') 
bbc 

SpearmanrResult(correlation=0.43880726015557481, pvalue=masked_array(data = 1.51672830316e-05,
             mask = False,
       fill_value = 1e+20)
)

In [49]:
fbbc = stats.spearmanr(focus['fembach'], focus['bc'], nan_policy='omit') 
fbbc 

SpearmanrResult(correlation=0.46737868872700339, pvalue=masked_array(data = 3.41370170327e-06,
             mask = False,
       fill_value = 1e+20)
)

### Female Bachelor's degree achievement is slighlty higher correlated with Contraceptive use than General Bachelor's degree achievement

In [50]:
edu = stats.spearmanr(focus['lit'], focus['bach'], nan_policy='omit') 
edu

SpearmanrResult(correlation=0.20364782193396536, pvalue=masked_array(data = 0.000894519198162,
             mask = False,
       fill_value = 1e+20)
)

### Literacy and Bachelors degree earning are NOT collinear

In [53]:
sch = stats.spearmanr(focus['noschool'], focus['bach'], nan_policy='omit') 
sch 

SpearmanrResult(correlation=-0.49122099054249357, pvalue=masked_array(data = 1.60301439398e-30,
             mask = False,
       fill_value = 1e+20)
)

### No School and Bachelor's degree achievement are somewhat negatively correlated (-0.49). Are they collinear?

In [54]:
nolit = stats.spearmanr(focus['lit'], focus['noschool'], nan_policy='omit') 
nolit 

SpearmanrResult(correlation=-0.34426381105624304, pvalue=masked_array(data = 6.70087713131e-09,
             mask = False,
       fill_value = 1e+20)
)

### Is -0.34 collinear?

### Include Bachelor degree achievement?

In [55]:
bachspend = stats.spearmanr(focus['bach'], focus['eduspend'], nan_policy='omit') 
bachspend 

SpearmanrResult(correlation=0.33440006567498548, pvalue=masked_array(data = 2.513857649e-12,
             mask = False,
       fill_value = 1e+20)
)

In [56]:
nobach = stats.spearmanr(focus['bach'], focus['noschool'], nan_policy='omit') 
nobach 

SpearmanrResult(correlation=-0.49122099054249357, pvalue=masked_array(data = 1.60301439398e-30,
             mask = False,
       fill_value = 1e+20)
)

In [57]:
bcbach = stats.spearmanr(focus['bach'], focus['bc'], nan_policy='omit') 
bcbach 

SpearmanrResult(correlation=0.43880726015557481, pvalue=masked_array(data = 1.51672830316e-05,
             mask = False,
       fill_value = 1e+20)
)

## ?

In [58]:
bachlegis = stats.spearmanr(focus['bach'], focus['legis'], nan_policy='omit') 
bachlegis 

SpearmanrResult(correlation=0.20309098273074699, pvalue=masked_array(data = 0.000213272738578,
             mask = False,
       fill_value = 1e+20)
)

## Bachelor degree achievement is not highly correlated with Female legislators, managers etc

In [60]:
bcabuse = stats.spearmanr(focus['bc'], focus['abuse'], nan_policy='omit') 
bcabuse

SpearmanrResult(correlation=-0.46242784992785002, pvalue=masked_array(data = 0.000378756351137,
             mask = False,
       fill_value = 1e+20)
)