In [1]:
# packages for data processing
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt

# packages for linear model
import statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [2]:
def parse_date(string):
    """Convert a ``month/day/year`` string into a ``datetime``.

    The text is split on ``/`` and must yield exactly three integer fields
    in month, day, year order. A ``ValueError`` is raised when the field
    count is wrong, a field is not an integer, or the date does not exist.
    """
    month, day, year = (int(field) for field in string.split('/'))
    return datetime(year=year, month=month, day=day)

def stay(time):
    """Return the ``days`` attribute of a time difference (e.g. a ``timedelta``)."""
    whole_days = time.days
    return whole_days

def remove_HUD(string):
    """Strip a trailing ``" (HUD)"`` tag from a category label.

    Labels that do not end with the tag are returned unchanged. Blindly
    dropping the last six characters would corrupt such values
    (e.g. ``"Yes"`` would become ``""``).

    Parameters
    ----------
    string : str
        Category label, e.g. ``"White (HUD)"``.

    Returns
    -------
    str
        The label without the trailing ``" (HUD)"`` tag.
    """
    suffix = " (HUD)"
    if string.endswith(suffix):
        return string[:-len(suffix)]
    return string

In [3]:
# columns of interest in each table
client_background = [
    "Client ID",
    "Client Age at Entry",
    "Client Gender",
    "Client Primary Race",
    "Client Ethnicity",
    "Client Veteran Status",
]
duration_interest = ['Client ID', 'Entry Date', 'Exit Date']

# locations of the raw tab-separated files
client_url = "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-Jianqiao-Wang/master/project_3/data/CLIENT_191102.tsv"
duration_url = "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-Jianqiao-Wang/master/project_3/data/ENTRY_EXIT_191102.tsv"

# read each file in full, then keep only the selected columns
client = pd.read_csv(client_url, sep="\t").loc[:, client_background]
duration = pd.read_csv(duration_url, sep="\t").loc[:, duration_interest]

In [4]:
# Combine the entry/exit records with the client attributes, side by side.
# pd.concat(axis=1) pairs rows by index label (the default row numbering
# assigned by read_csv), NOT by the "Client ID" values, so this is only a
# valid join when both tables list the same records in the same order.
data = pd.concat([duration, client], axis=1, join="inner")

# Both inputs contribute a "Client ID" column. Check that they agree on every
# row so a misaligned join fails loudly instead of silently pairing a stay
# with another client's demographics.
paired_ids = data["Client ID"]
left_ids, right_ids = paired_ids.iloc[:, 0], paired_ids.iloc[:, 1]
ids_match = (left_ids == right_ids) | (left_ids.isna() & right_ids.isna())
if not ids_match.all():
    raise ValueError(
        "Row-wise join is misaligned: 'Client ID' differs between the "
        "entry/exit and client tables on at least one row."
    )

# treat the "no answer" categories as missing values
missing_markers = [
    'Data not collected (HUD)',
    "Client doesn't know (HUD)",
    'Client refused (HUD)',
]
data.replace(missing_markers, np.nan, inplace=True)

# remove rows with any missing value
data.dropna(inplace=True)

# length of each stay in days (exit date minus entry date)
data["Entry Date"] = data["Entry Date"].apply(parse_date)
data["Exit Date"] = data["Exit Date"].apply(parse_date)
data["duration"] = (data["Exit Date"] - data["Entry Date"]).apply(stay)

# rename columns to names without spaces so they can be used in the model formula
data.rename(columns={"Client Primary Race": "ClientPrimaryRace",
                     "Client Ethnicity": "ClientEthnicity",
                     "Client Veteran Status": "ClientVeteranStatus",
                     "Client Age at Entry": "ClientAge",
                     "Client Gender": "ClientGender"}, inplace=True)

# remove the HUD tail from the categorical labels
for column in ("ClientPrimaryRace", "ClientEthnicity", "ClientVeteranStatus"):
    data[column] = data[column].apply(remove_HUD)

In [5]:
# save data
# The output path is relative to the working directory so the notebook can run
# on any machine; the previous absolute path existed only under one user's
# home directory.
# NOTE(review): the old target was .../project_3/scripts/duration.csv - start
# the notebook from that folder to keep writing the file to the same place.
data.to_csv("duration.csv")

In [7]:
# Ordinary least squares: regress stay duration on age, gender, ethnicity,
# primary race and veteran status to test the effect of each covariate.
formula = 'duration~ClientAge+ClientGender+ClientEthnicity+ClientPrimaryRace+ClientVeteranStatus'
ols_spec = smf.ols(formula=formula, data=data)
model = ols_spec.fit()

# the last expression of the cell renders the fitted-model summary
model.summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,9.257
Date:,"Sun, 17 Nov 2019",Prob (F-statistic):,4.69e-14
Time:,18:10:11,Log-Likelihood:,-28558.0
No. Observations:,5141,AIC:,57140.0
Df Residuals:,5131,BIC:,57200.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,11.2164,9.064,1.237,0.216,-6.554,28.986
ClientGender[T.Male],-6.1574,2.081,-2.958,0.003,-10.238,-2.077
ClientGender[T.Trans Female (MTF or Male to Female)],-6.5459,15.307,-0.428,0.669,-36.555,23.463
ClientEthnicity[T.Non-Hispanic/Non-Latino],10.8619,4.980,2.181,0.029,1.100,20.624
ClientPrimaryRace[T.Asian],24.6018,36.808,0.668,0.504,-47.557,96.761
ClientPrimaryRace[T.Black or African American],-1.5116,6.911,-0.219,0.827,-15.060,12.037
ClientPrimaryRace[T.Native Hawaiian or Other Pacific Islander],-6.2437,20.976,-0.298,0.766,-47.365,34.878
ClientPrimaryRace[T.White],-4.3655,7.071,-0.617,0.537,-18.228,9.497
ClientVeteranStatus[T.Yes],-7.0531,2.948,-2.392,0.017,-12.833,-1.273

0,1,2,3
Omnibus:,3855.419,Durbin-Watson:,1.637
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81654.516
Skew:,3.432,Prob(JB):,0.0
Kurtosis:,21.277,Cond. No.,1940.0
