In [7]:
import pandas as pd
import pickle
import os
import re
import scipy.stats as st
import numpy as np
from matplotlib import pyplot as plt  
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [8]:
#Load data into a dataframe
df = pd.read_csv('adult19.csv')
df.shape

(31997, 534)

In [9]:
#View starting Data
df.head()

Unnamed: 0,URBRRL,RATCAT_A,INCGRP_A,INCTCFLG_A,FAMINCTC_A,IMPINCFLG_A,PPSU,PSTRAT,HISPALLP_A,RACEALLP_A,...,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_QRT,RECTYPE,WTFA_A,WTIA_A,HHX,POVRATTC_A
0,4,9,3,0,60000,2,2,122,3,2,...,,,1,1,1,10,13177.008,7601.336,H048109,2.96
1,4,9,3,0,50000,0,2,122,2,1,...,,,1,1,1,10,6140.552,3344.434,H027044,2.97
2,4,12,3,0,65000,1,2,122,2,1,...,,,1,1,1,10,9191.061,6949.498,H058855,4.28
3,4,14,5,0,120000,0,2,122,1,8,...,,,1,1,1,10,7900.035,6446.327,H031993,7.13
4,1,4,1,0,30000,0,2,115,2,1,...,,,1,1,1,10,10875.772,8646.586,H007122,1.13


In [10]:
#Reduce the dataframe to what will be used
df = df[['DIFF_A', 'EDUC_A', 'FAMINCTC_A', 'PAIFRQ3M_A', 'DEPMED_A']].dropna()
df.shape

(31840, 5)

In [11]:
#Rename Colmns
df.rename({'DIFF_A': 'Diff_Walk', 'EDUC_A': 'Edu_Level','FAMINCTC_A': 'Fam_Income', 'PAIFRQ3M_A': 'Pain', 'DEPMED_A': 'Depress_Med'}, axis=1, inplace=True)
#View new dataframe
df.head()

Unnamed: 0,Diff_Walk,Edu_Level,Fam_Income,Pain,Depress_Med
0,1,7,60000,2.0,2
1,1,6,50000,1.0,2
2,1,5,65000,2.0,2
3,1,7,120000,2.0,2
4,2,5,30000,3.0,1


In [12]:
#Clean data

#DIFF_A: aka: Difficulty walking
#useful data were coded 1-4
#7 was used if refused, 8 Not Ascertained, and 9 Don't Know
df = df[df['Diff_Walk'] <= 4]

#EDUC_A: aka: Education level
#useful data were coded 1-11
#97 was used if refused, 98 Not Ascertained, and 99 Don't Know
df = df[df['Edu_Level'] <= 11]

#FAMINCTC_A: aka: Family Incomne
#All data ranges from 0-220000 
#Represents outcome in $
#220000 represents $220000 and up

#PAIFRQ3M_A: aka: How often had pain
#useful data were coded 1-4
#7 was used if refused, 8 Not Ascertained, and 9 Don't Know
df = df[df['Pain'] <= 4]

#DEPMED_A: aka: Do you take prescription medication for depression?
#useful data were coded 1-4
#7 was used if refused, 8 Not Ascertained, and 9 Don't Know
df = df[df['Depress_Med'] <= 2]

#Replace with 0 and 1 to represent True or False respectivly 
df['Depress_Med'] = (df['Depress_Med'] > 1).astype(int)




In [13]:
#view differnece
df.head()

Unnamed: 0,Diff_Walk,Edu_Level,Fam_Income,Pain,Depress_Med
0,1,7,60000,2.0,1
1,1,6,50000,1.0,1
2,1,5,65000,2.0,1
3,1,7,120000,2.0,1
4,2,5,30000,3.0,0


In [14]:
model_walk = smf.ols('Depressed ~ Diff_Walk', data=df).fit()
model_walk.summary()

PatsyError: Error evaluating factor: NameError: name 'Depressed' is not defined
    Depressed ~ Diff_Walk
    ^^^^^^^^^

In [None]:
model_edu = smf.ols('Depressed ~ Edu_Level', data=df).fit()
model_edu.summary()

0,1,2,3
Dep. Variable:,Depressed,R-squared:,0.027
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,380.6
Date:,"Wed, 13 Apr 2022",Prob (F-statistic):,1.28e-83
Time:,20:11:22,Log-Likelihood:,-19523.0
No. Observations:,13603,AIC:,39050.0
Df Residuals:,13601,BIC:,39060.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8914,0.022,129.667,0.000,2.848,2.935
Edu_Level,0.0690,0.004,19.509,0.000,0.062,0.076

0,1,2,3
Omnibus:,1865.247,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2756.808
Skew:,-1.101,Prob(JB):,0.0
Kurtosis:,2.894,Cond. No.,16.5


In [None]:
model_income = smf.ols('Depressed ~ Fam_Income', data=df).fit()
model_income.summary()

0,1,2,3
Dep. Variable:,Depressed,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.041
Method:,Least Squares,F-statistic:,584.1
Date:,"Wed, 13 Apr 2022",Prob (F-statistic):,2.1399999999999997e-126
Time:,20:11:22,Log-Likelihood:,-19424.0
No. Observations:,13603,AIC:,38850.0
Df Residuals:,13601,BIC:,38870.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0360,0.014,222.086,0.000,3.009,3.063
Fam_Income,3.857e-06,1.6e-07,24.169,0.000,3.54e-06,4.17e-06

0,1,2,3
Omnibus:,1793.714,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2600.355
Skew:,-1.068,Prob(JB):,0.0
Kurtosis:,2.838,Cond. No.,135000.0


In [None]:
model_pain = smf.ols('Depressed ~ Pain', data=df).fit()
model_pain.summary()

0,1,2,3
Dep. Variable:,Depressed,R-squared:,0.071
Model:,OLS,Adj. R-squared:,0.071
Method:,Least Squares,F-statistic:,1035.0
Date:,"Wed, 13 Apr 2022",Prob (F-statistic):,6.65e-219
Time:,20:11:22,Log-Likelihood:,-19212.0
No. Observations:,13603,AIC:,38430.0
Df Residuals:,13601,BIC:,38440.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.8740,0.020,193.703,0.000,3.835,3.913
Pain,-0.2585,0.008,-32.170,0.000,-0.274,-0.243

0,1,2,3
Omnibus:,1702.859,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2440.029
Skew:,-1.037,Prob(JB):,0.0
Kurtosis:,2.947,Cond. No.,6.64


In [None]:
#Combination
model_mix = smf.ols('Depressed ~ Diff_Walk + Edu_Level + Fam_Income + Pain', data=df).fit();
model_mix.summary()

0,1,2,3
Dep. Variable:,Depressed,R-squared:,0.109
Model:,OLS,Adj. R-squared:,0.109
Method:,Least Squares,F-statistic:,416.7
Date:,"Wed, 13 Apr 2022",Prob (F-statistic):,0.0
Time:,20:16:53,Log-Likelihood:,-18924.0
No. Observations:,13603,AIC:,37860.0
Df Residuals:,13598,BIC:,37900.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.6250,0.034,107.884,0.000,3.559,3.691
Diff_Walk,-0.1696,0.014,-12.389,0.000,-0.196,-0.143
Edu_Level,0.0270,0.004,7.189,0.000,0.020,0.034
Fam_Income,2.141e-06,1.72e-07,12.422,0.000,1.8e-06,2.48e-06
Pain,-0.1755,0.009,-19.694,0.000,-0.193,-0.158

0,1,2,3
Omnibus:,1503.081,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2064.988
Skew:,-0.954,Prob(JB):,0.0
Kurtosis:,2.92,Cond. No.,353000.0
