# imports and db connection setup

In [None]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [None]:
# Read the Postgres settings from the standard libpq environment variables.
from urllib.parse import quote_plus

user = os.getenv('PGUSER')
password = os.getenv('PGPASSWORD')
host = os.getenv('PGHOST')
port = os.getenv('PGPORT')
database = os.getenv('PGDATABASE')

# Fail early with a readable message instead of building a URL that contains
# the literal string "None" for an unset variable.
required_settings = {'PGUSER': user, 'PGHOST': host, 'PGPORT': port, 'PGDATABASE': database}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError("Missing required environment variables: " + ", ".join(missing_settings))

# Percent-encode user name and password so that special characters such as
# "@", ":" or "/" cannot corrupt the connection URL. The password part is
# omitted entirely when PGPASSWORD is not set.
credentials = quote_plus(user)
if password:
    credentials += ":" + quote_plus(password)

# configure connection to postgres
engine = create_engine("postgresql://{}@{}:{}/{}".format(credentials, host, port, database))

# open a connection
db_conn = engine.connect()

# get data from db

In [None]:
# Load the complete modelling table into a DataFrame through the open connection.
participation_query = "select * from modelling.acdhs_program_participation_and_evictions2;"
df = pd.read_sql(participation_query, db_conn)

In [None]:
# Display the (rows, columns) dimensions of the loaded table.
df.shape

# predict homelessness

features to exclude:
- all starting with "nr_of_months_in_program_"
- client_hash
- hashed_mci_uniq_id
- padhs_client_hash
- date_of_last_eviction
- load_date
- city
- state

labels:
- is_currently_homeless
- is_currently_in_housing_support_program

In [None]:
# Columns that are never used as model inputs: identifiers, dates, location
# fields, both label columns and the aggregated housing-support month counter.
excluded_columns = (
    "client_hash",
    "hashed_mci_uniq_id",
    "padhs_client_hash",
    "date_of_last_eviction",
    "load_date",
    "is_currently_homeless",
    "is_currently_in_housing_support_program",
    "city",
    "state",
    "nr_of_months_in_housing_support_programs",
)
# Any column whose name contains this text is dropped as well.
excluded_name_part = "nr_of_months_in_program_"

features = []
for column_name in df.columns:
    if column_name in excluded_columns or excluded_name_part in column_name:
        continue
    features.append(column_name)

label = "is_currently_homeless"

# Encode gender numerically: 'M' -> 0, 'F' -> 1, None -> 2.
gender_codes = ['M', 'F', None]
gender_numbers = [0, 1, 2]
df['gender_cd'] = df['gender_cd'].replace(gender_codes, gender_numbers)

# Replace every remaining missing value in the frame with the sentinel -1.
df = df.fillna(-1)



In [None]:
# Split the frame into the model inputs (X) and the target column (y).
X, y = df[features], df[label]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split, and every metric computed from it, reproducible across notebook runs
# (matching the seeded classifier).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Fit a logistic regression on the training split (fixed seed, all other
# hyper-parameters left at their defaults).
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Report the classifier's score on the held-out test split.
test_score = clf.score(X_test, y_test)
print('Score:', test_score)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display from an explicit confusion matrix works on both old and
# new scikit-learn versions.
test_confusion = confusion_matrix(y_test, clf.predict(X_test), labels=clf.classes_)
ConfusionMatrixDisplay(confusion_matrix=test_confusion, display_labels=clf.classes_).plot()
plt.show()

In [None]:
# Keep, for every test row, the predicted probability in column 1, i.e. the
# probability of the second entry of clf.classes_.
class_probabilities = clf.predict_proba(X_test)
prediction = class_probabilities[:, 1]

In [None]:
# Histogram of the predicted scores over the test split, using 100 bins.
fig, ax = plt.subplots(figsize=(15, 7))
ax.hist(prediction, bins=100)
ax.set_xlabel('Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of rows in the top 1% of predictions. At least one row is kept so that
# the slice idx[-n:] never degenerates to "every row", which is what n == 0
# would produce when there are fewer than 100 predictions.
n = max(1, int(len(prediction) * 0.01))

# Partition the indices so that the last n entries of idx point at the n
# highest scores (in no particular order): prediction[idx[-n:]] yields them.
idx = np.argpartition(prediction, -n)

In [None]:
# Everyone whose predicted score exceeds the threshold counts as selected.
threshold = 0.25
selections = prediction > threshold

# Vectorised counts, each computed once, instead of iterating element by
# element with the builtin sum().
n_selected = int(selections.sum())
n_positive = y_test[selections].sum()

print("From the", n_selected, "highest ranked individuals,", n_positive, "ended up in the homelessness system within 6 months.")

# Precision is undefined when nobody is selected; report NaN instead of
# dividing by zero.
precision = n_positive / n_selected if n_selected else float("nan")
print("Precision =", precision)

In [None]:
# Print every feature name next to its fitted coefficient.
# Descriptive loop names are used so that the global label Series `y` is not
# overwritten by the loop, which would break re-running the earlier cells.
for feature_name, coefficient in zip(X.columns, clf.coef_[0]):
    print(feature_name, ":", coefficient)

In [None]:
# Display the raw coefficient array of the fitted model.
clf.coef_