In [1]:
# packages for data processing
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt

# packages for linear model
import statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [2]:
# convert date into datetime type
def parse_date(string):
    month, day, year = string.split('/')
    return datetime(int(year), int(month), int(day))

# extract number of days from datetime
def stay(time):
    return time.days

def remove_HUD(string):
    return string[:-6]

In [3]:
# read and select data with parameters of interest
client_background = ["Client ID", "Client Primary Race", "Client Ethnicity", "Client Veteran Status"]
duration_interest = ['Client ID', 'Entry Date', 'Exit Date']
client = pd.read_csv("https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-Jianqiao-Wang/master/project_3/data/CLIENT_191102.tsv", sep="\t")[client_background]
duration = pd.read_csv("https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-Jianqiao-Wang/master/project_3/data/ENTRY_EXIT_191102.tsv", sep="\t")[duration_interest]

In [4]:
# join two datasets by client ID
data = pd.concat([duration, client], axis=1, join="inner")

# replace cells with NAN
data.replace('Data not collected (HUD)', np.nan, inplace=True)
data.replace("Client doesn't know (HUD)", np.nan, inplace=True)
data.replace('Client refused (HUD)', np.nan, inplace=True)

# remove missing data
data.dropna(inplace=True)

# calculate duration of people
data["Entry Date"] = data["Entry Date"].apply(parse_date)
data["Exit Date"] = data["Exit Date"].apply(parse_date)
data["duration"] = (data["Exit Date"] - data["Entry Date"]).apply(stay)

# rename some column names for later analysis
data.rename(columns={"Client Primary Race":"ClientPrimaryRace", "Client Ethnicity":"ClientEthnicity", "Client Veteran Status":"ClientVeteranStatus"}, inplace=True)

# remove HUD tail
data["ClientPrimaryRace"]=data["ClientPrimaryRace"].apply(remove_HUD)
data["ClientEthnicity"]=data["ClientEthnicity"].apply(remove_HUD)
data["ClientVeteranStatus"]=data["ClientVeteranStatus"].apply(remove_HUD)

In [5]:
# save data
data.to_csv("/Users/jianqiaowang/Documents/GitHub/bios611-projects-fall-2019-Jianqiao-Wang/project_3/scripts/duration.csv")

In [6]:
# linear model to test effect of different covariates
model = smf.ols(formula='duration~ClientEthnicity+ClientPrimaryRace+ClientVeteranStatus', 
                data=data,).fit()
model.summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.516
Date:,"Sun, 17 Nov 2019",Prob (F-statistic):,0.168
Time:,15:55:15,Log-Likelihood:,-28595.0
No. Observations:,5141,AIC:,57200.0
Df Residuals:,5134,BIC:,57250.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,30.0530,8.433,3.564,0.000,13.522,46.585
ClientEthnicity[T.Non-Hispanic/Non-Latino],11.6951,4.999,2.340,0.019,1.896,21.494
ClientPrimaryRace[T.Asian],15.5852,37.046,0.421,0.674,-57.041,88.211
ClientPrimaryRace[T.Black or African American],-0.8783,6.958,-0.126,0.900,-14.518,12.762
ClientPrimaryRace[T.Native Hawaiian or Other Pacific Islander],-10.8110,21.114,-0.512,0.609,-52.203,30.581
ClientPrimaryRace[T.White],-2.9546,7.116,-0.415,0.678,-16.905,10.996
ClientVeteranStatus[T.Yes],-2.4906,2.850,-0.874,0.382,-8.078,3.097

0,1,2,3
Omnibus:,3885.753,Durbin-Watson:,1.628
Prob(Omnibus):,0.0,Jarque-Bera (JB):,82685.428
Skew:,3.472,Prob(JB):,0.0
Kurtosis:,21.379,Cond. No.,68.0
