In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Load the tab-separated CLIENT export from the project's GitHub repository.
client_tsv_url = "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-arquinter/master/project_3/data/CLIENT_191102.tsv"
demo = pd.read_csv(client_tsv_url, sep="\t")

In [3]:
# Remove the trailing " (HUD)" tag from three demo columns.
# str.rstrip(" (HUD)") treats its argument as a *set of characters*, so it
# would also eat any trailing "H", "U", "D", "(", ")" or space that belongs to
# the value itself. An end-anchored regex removes only the literal tag.
# astype(str) is kept from the original: missing values become the text "nan".
for column in ["Client Primary Race", "Client Ethnicity", "Client Veteran Status"]:
    demo[column] = demo[column].astype(str).str.replace(r"\s*\(HUD\)\s*$", "", regex=True)

In [4]:
# Load the tab-separated DISABILITY_ENTRY export and keep only the three
# columns used below.
disability_entry_url = "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-arquinter/master/project_3/data/DISABILITY_ENTRY_191102.tsv"
disability_entry_fields = ["Client ID", "Disability Determination (Entry)", "Disability Type (Entry)"]
dis_entry = pd.read_csv(disability_entry_url, sep = "\t")[disability_entry_fields]

In [5]:
# Keep only rows whose entry determination is "Yes (HUD)" and collapse exact
# duplicate rows.
# The explicit .copy() makes dis_entry2 an independent frame: a later cell
# assigns columns on it, and doing that on a filtered selection of dis_entry
# can raise SettingWithCopyWarning.
confirmed_at_entry = dis_entry["Disability Determination (Entry)"] == "Yes (HUD)"
dis_entry2 = dis_entry.loc[confirmed_at_entry].drop_duplicates().copy()

In [6]:
# Remove the trailing " (HUD)" tag from the determination and type columns.
# str.rstrip(" (HUD)") strips a *set of characters* rather than the suffix, and
# calling it through map() raises AttributeError on a missing (NaN) value.
# The .str accessor with an end-anchored regex removes only the literal tag
# and passes missing values through unchanged.
for column in ["Disability Determination (Entry)", "Disability Type (Entry)"]:
    dis_entry2[column] = dis_entry2[column].str.replace(r"\s*\(HUD\)\s*$", "", regex=True)

In [7]:
# Reshape to one row per Client ID and one column per entry disability type.
# Cells hold the determination value; combinations with no source row are
# filled with "No".
entry_wide = dis_entry2.pivot(
    index = "Client ID",
    columns = "Disability Type (Entry)",
    values = "Disability Determination (Entry)",
)
dis_entry3 = entry_wide.fillna("No")

In [8]:
# Append "_Entry" to the column labels. A label containing "ID" is left
# untouched unless it also contains "AIDS" (which itself contains "ID" as a
# substring and therefore needs the explicit exception).
entry_labels = []
for label in dis_entry3.columns:
    if "ID" in label and "AIDS" not in label:
        entry_labels.append(label)
    else:
        entry_labels.append(str(label) + '_Entry')
dis_entry3.columns = entry_labels

In [9]:
# Attach the wide entry-disability columns to every demo row. The left join
# keeps demo rows that have no match; their new columns are NaN.
dis_entry_fin = pd.merge(demo, dis_entry3, how = "left", on = "Client ID")

In [10]:
# Replace the NaNs left in the entry disability columns by the left join
# with "No".
entry_fill_cols = [
    "Alcohol Abuse_Entry",
    "Both Alcohol and Drug Abuse_Entry",
    "Chronic Health Condition_Entry",
    "Developmental_Entry",
    "Drug Abuse_Entry",
    "HIV/AIDS_Entry",
    "Mental Health Problem_Entry",
    "Other_Entry",
    "Other: Learning_Entry",
    "Physical_Entry",
    "Physical/Medical_Entry",
    "Vision Impaired_Entry",
]
dis_entry_fin[entry_fill_cols] = dis_entry_fin[entry_fill_cols].fillna("No")

In [11]:
# Overall entry flag: "Yes" when any of the listed entry disability columns
# equals "Yes" for that row, otherwise "No".
entry_flag_cols = [
    "Alcohol Abuse_Entry",
    "Both Alcohol and Drug Abuse_Entry",
    "Chronic Health Condition_Entry",
    "Developmental_Entry",
    "Drug Abuse_Entry",
    "HIV/AIDS_Entry",
    "Mental Health Problem_Entry",
    "Other_Entry",
    "Other: Learning_Entry",
    "Physical_Entry",
    "Physical/Medical_Entry",
    "Vision Impaired_Entry",
]
any_entry_yes = dis_entry_fin[entry_flag_cols].eq("Yes").any(axis = 1)
dis_entry_fin['Disability_Entry'] = any_entry_yes.map({True: "Yes", False: "No"})

In [12]:
# Load the tab-separated DISABILITY_EXIT export and keep only the three
# columns used below.
disability_exit_url = "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-arquinter/master/project_3/data/DISABILITY_EXIT_191102.tsv"
disability_exit_fields = ["Client ID", "Disability Determination (Exit)", "Disability Type (Exit)"]
dis_exit = pd.read_csv(disability_exit_url, sep = "\t")[disability_exit_fields]

In [13]:
# Keep only rows whose exit determination is "Yes (HUD)" and collapse exact
# duplicate rows.
# The explicit .copy() makes dis_exit2 an independent frame: a later cell
# assigns columns on it, and doing that on a filtered selection of dis_exit
# can raise SettingWithCopyWarning.
confirmed_at_exit = dis_exit["Disability Determination (Exit)"] == "Yes (HUD)"
dis_exit2 = dis_exit.loc[confirmed_at_exit].drop_duplicates().copy()

In [14]:
# Remove the trailing " (HUD)" tag from the determination and type columns.
# str.rstrip(" (HUD)") strips a *set of characters* rather than the suffix, and
# calling it through map() raises AttributeError on a missing (NaN) value.
# The .str accessor with an end-anchored regex removes only the literal tag
# and passes missing values through unchanged.
for column in ["Disability Determination (Exit)", "Disability Type (Exit)"]:
    dis_exit2[column] = dis_exit2[column].str.replace(r"\s*\(HUD\)\s*$", "", regex=True)

In [15]:
# Reshape to one row per Client ID and one column per exit disability type.
# Cells hold the determination value; combinations with no source row are
# filled with "No".
exit_wide = dis_exit2.pivot(
    index = "Client ID",
    columns = "Disability Type (Exit)",
    values = "Disability Determination (Exit)",
)
dis_exit3 = exit_wide.fillna("No")

In [16]:
# Append "_Exit" to the column labels. A label containing "ID" is left
# untouched unless it also contains "AIDS" (which itself contains "ID" as a
# substring and therefore needs the explicit exception).
exit_labels = []
for label in dis_exit3.columns:
    if "ID" in label and "AIDS" not in label:
        exit_labels.append(label)
    else:
        exit_labels.append(str(label) + '_Exit')
dis_exit3.columns = exit_labels

In [17]:
# Attach the wide exit-disability columns to every demo row. The left join
# keeps demo rows that have no match; their new columns are NaN.
dis_exit_fin = pd.merge(demo, dis_exit3, how = "left", on = "Client ID")

In [18]:
# Replace the NaNs left in the exit disability columns by the left join
# with "No".
exit_fill_cols = [
    "Alcohol Abuse_Exit",
    "Both Alcohol and Drug Abuse_Exit",
    "Chronic Health Condition_Exit",
    "Developmental_Exit",
    "Drug Abuse_Exit",
    "HIV/AIDS_Exit",
    "Mental Health Problem_Exit",
    "Other_Exit",
    "Other: Learning_Exit",
    "Physical_Exit",
    "Physical/Medical_Exit",
    "Vision Impaired_Exit",
]
dis_exit_fin[exit_fill_cols] = dis_exit_fin[exit_fill_cols].fillna("No")

In [19]:
# Drop the columns that sit at positions 4-9 of the exit table.
# dis_exit_fin was built as demo.merge(...), so those positions hold the same
# labels as positions 4-9 of `demo`. Taking the labels from `demo` and using
# errors="ignore" makes this cell safe to re-run: the original positional,
# in-place drop silently removed six *more* columns on every extra execution.
# NOTE(review): presumably these are dropped so they appear only once (from
# the entry table) after the later entry/exit merge; confirm.
repeated_demo_cols = demo.columns[[4, 5, 6, 7, 8, 9]]
dis_exit_fin = dis_exit_fin.drop(columns = repeated_demo_cols, errors = "ignore")

In [20]:
# Overall exit flag: "Yes" when any of the listed exit disability columns
# equals "Yes" for that row, otherwise "No".
exit_flag_cols = [
    "Alcohol Abuse_Exit",
    "Both Alcohol and Drug Abuse_Exit",
    "Chronic Health Condition_Exit",
    "Developmental_Exit",
    "Drug Abuse_Exit",
    "HIV/AIDS_Exit",
    "Mental Health Problem_Exit",
    "Other_Exit",
    "Other: Learning_Exit",
    "Physical_Exit",
    "Physical/Medical_Exit",
    "Vision Impaired_Exit",
]
any_exit_yes = dis_exit_fin[exit_flag_cols].eq("Yes").any(axis = 1)
dis_exit_fin['Disability_Exit'] = any_exit_yes.map({True: "Yes", False: "No"})

In [21]:
# Combine the entry and exit tables, keeping only rows whose four
# identifier columns match on both sides (inner join).
enrollment_keys = ["Client ID", "EE Provider ID", "EE UID", "Client Unique ID"]
disability_data = pd.merge(dis_entry_fin, dis_exit_fin, how = "inner", on = enrollment_keys)

In [22]:
# Label missing values as "Missing". Gender still holds real NaNs, whereas the
# race column was cast to str in an earlier cell, so its missing values show
# up as the literal text "nan".
gender_labelled = disability_data["Client Gender"].fillna("Missing")
race_labelled = disability_data["Client Primary Race"].replace("nan", "Missing")
disability_data = disability_data.assign(**{
    "Client Gender": gender_labelled,
    "Client Primary Race": race_labelled,
})

In [23]:
# Write the combined disability table to the working directory; the row
# index is written too (pandas default).
disability_data.to_csv(path_or_buf="./disability_data.csv")

In [24]:
# Load the tab-separated ENTRY_EXIT export from the project's GitHub repository.
entry_exit_url = "https://raw.githubusercontent.com/datasci611/bios611-projects-fall-2019-arquinter/master/project_3/data/ENTRY_EXIT_191102.tsv"
ee_dates = pd.read_csv(entry_exit_url, sep="\t")

In [25]:
# Discard rows that have no "Exit Date"; a duration cannot be computed for them.
ee_dates = ee_dates.dropna(axis="index", how="any", subset=["Exit Date"])

In [26]:
# Parse "Entry Date" (month/day/year text) into datetimes in one vectorised
# call. Unlike the per-row strptime lambda, pd.to_datetime does not raise a
# TypeError on a missing entry date (rows are only filtered on "Exit Date");
# it yields NaT instead. Malformed strings still raise.
ee_dates["entry_num"] = pd.to_datetime(ee_dates["Entry Date"], format="%m/%d/%Y")

In [27]:
# Parse "Exit Date" (month/day/year text) into datetimes in one vectorised
# call instead of a per-row strptime lambda. Missing values would become NaT
# rather than raising a TypeError; malformed strings still raise.
ee_dates["exit_num"] = pd.to_datetime(ee_dates["Exit Date"], format="%m/%d/%Y")

In [28]:
# Duration of each record: parsed exit date minus parsed entry date.
ee_dates["dur"] = ee_dates["exit_num"].sub(ee_dates["entry_num"])

In [29]:
# Add the demo columns to each entry/exit row, matching on the four
# identifier columns; rows without a match in demo are kept (left join).
duration_keys = ['Client ID', 'EE Provider ID', 'EE UID', 'Client Unique ID']
duration_data = pd.merge(ee_dates, demo, how = 'left', on = duration_keys)

In [30]:
# Remove the "Unnamed: 6" column.
# NOTE(review): presumably an artefact of a trailing tab in the TSV header; confirm.
duration_data = duration_data.drop(columns = ["Unnamed: 6"])

In [31]:
# Convert the timedelta duration to a whole number of days.
duration_data = duration_data.assign(dur = duration_data["dur"].dt.days)

In [32]:
# Write the duration table to the working directory; the row index is
# written too (pandas default).
duration_data.to_csv(path_or_buf="./duration_data.csv")