In [1]:
# packages for data processing
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt

# packages for linear model
import statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [2]:
def parse_date(string):
    """Turn a 'month/day/year' string into a datetime.

    The text is split on '/', each of the three pieces is converted with
    int(), and the result is a datetime for that calendar day (no time of
    day is supplied, so it falls at midnight).
    """
    fields = string.split('/')
    month, day, year = fields
    return datetime(year=int(year), month=int(month), day=int(day))

def stay(time):
    """Return the whole-day count stored on a time difference.

    `time` only needs a `.days` attribute; in this notebook it is the
    difference between an exit date and an entry date.
    """
    whole_days = time.days
    return whole_days

def remove_HUD(string):
    """Strip a trailing " (HUD)" marker from a category label.

    Labels that end in " (HUD)" are returned without that marker. Labels
    that do not end in it are returned unchanged, instead of blindly losing
    their last six characters.

    Parameters
    ----------
    string : str
        Category label, e.g. "White (HUD)".

    Returns
    -------
    str
        The label without the trailing " (HUD)" marker.
    """
    suffix = " (HUD)"
    # Slice-compare (rather than str.endswith) so non-string input still
    # fails with the same TypeError the plain slice used to raise.
    if string[-len(suffix):] == suffix:
        return string[:-len(suffix)]
    return string

In [3]:
# Columns kept from each raw table; both lists include "Client ID".
client_background = [
    "Client ID",
    "Client Age at Entry",
    "Client Gender",
    "Client Primary Race",
    "Client Ethnicity",
    "Client Veteran Status",
]
duration_interest = ['Client ID', 'Entry Date', 'Exit Date']

# Download the tab-separated client table and keep only the background columns.
client = pd.read_csv(
    "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-Jianqiao-Wang/master/project_3/data/CLIENT_191102.tsv",
    sep="\t",
)
client = client[client_background]

# Download the tab-separated entry/exit table and keep only the date columns.
duration = pd.read_csv(
    "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-Jianqiao-Wang/master/project_3/data/ENTRY_EXIT_191102.tsv",
    sep="\t",
)
duration = duration[duration_interest]

In [4]:
# Combine stay records with client demographics.
# NOTE(review): pd.concat(axis=1, join="inner") pairs rows by index label
# (i.e. row position in the two files), NOT by "Client ID". That is only
# correct if both files list the same record on the same row, so verify it
# explicitly instead of silently attaching one client's demographics to
# another client's stay.
shared_rows = duration.index.intersection(client.index)
stay_ids = duration.loc[shared_rows, "Client ID"]
demo_ids = client.loc[shared_rows, "Client ID"]
id_mismatch = stay_ids.ne(demo_ids) & ~(stay_ids.isna() & demo_ids.isna())
if id_mismatch.any():
    raise ValueError(
        "ENTRY_EXIT and CLIENT rows are not aligned: {} row(s) carry "
        "different Client IDs".format(int(id_mismatch.sum()))
    )

# Both inputs contain "Client ID", so the combined frame holds that column
# twice; it is left as-is so the saved CSV layout does not change.
data = pd.concat([duration, client], axis=1, join="inner")

# Treat the HUD "no answer" responses as missing and drop incomplete rows.
missing_responses = [
    'Data not collected (HUD)',
    "Client doesn't know (HUD)",
    'Client refused (HUD)',
]
data = data.replace(missing_responses, np.nan).dropna()

# Length of stay in whole days; keep only strictly positive stays.
data["Entry Date"] = data["Entry Date"].apply(parse_date)
data["Exit Date"] = data["Exit Date"].apply(parse_date)
data["duration"] = (data["Exit Date"] - data["Entry Date"]).apply(stay)
# Explicit copy so the column assignments below act on an independent frame
# rather than on a filtered selection of the previous one.
data = data.loc[data["duration"] > 0].copy()

# Short, formula-friendly column names for the regression.
data = data.rename(columns={"Client Primary Race": "Race",
                            "Client Ethnicity": "Ethnicity",
                            "Client Veteran Status": "VeteranStatus",
                            "Client Age at Entry": "Age",
                            "Client Gender": "Gender"})

# Remove the HUD tail from the categorical labels.
for column in ("Race", "Ethnicity", "VeteranStatus"):
    data[column] = data[column].apply(remove_HUD)

In [5]:
# Write the cleaned table (row index included) to duration.csv in the
# current working directory.
data.to_csv(path_or_buf="duration.csv")

In [6]:
# Fit a Gamma-family GLM of stay duration on the client background covariates
# and show the coefficient table as the cell output.
model = smf.glm(
    data=data,
    family=sm.families.Gamma(),
    formula='duration~Age+Gender+Ethnicity+Race+VeteranStatus',
).fit()
model.summary()



0,1,2,3
Dep. Variable:,duration,No. Observations:,5125.0
Model:,GLM,Df Residuals:,5115.0
Model Family:,Gamma,Df Model:,9.0
Link Function:,inverse_power,Scale:,2.4269
Method:,IRLS,Log-Likelihood:,-23873.0
Date:,"Mon, 18 Nov 2019",Deviance:,9584.0
Time:,00:33:12,Pearson chi2:,12400.0
No. Iterations:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0474,0.006,7.492,0.000,0.035,0.060
Gender[T.Male],0.0034,0.001,2.843,0.004,0.001,0.006
Gender[T.Trans Female (MTF or Male to Female)],0.0042,0.012,0.355,0.723,-0.019,0.028
Ethnicity[T.Non-Hispanic/Non-Latino],-0.0099,0.004,-2.240,0.025,-0.018,-0.001
Race[T.Asian],-0.0128,0.016,-0.793,0.428,-0.044,0.019
Race[T.Black or African American],0.0007,0.004,0.177,0.859,-0.007,0.009
Race[T.Native Hawaiian or Other Pacific Islander],0.0058,0.018,0.322,0.748,-0.029,0.041
Race[T.White],0.0027,0.004,0.651,0.515,-0.005,0.011
VeteranStatus[T.Yes],0.0041,0.002,2.181,0.029,0.000,0.008
