In [1]:
import pandas as pd
from numpy import mean
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

import seaborn as sns

import pandas as pd
import os
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import time
import math
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import datasets, metrics, model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as rfc

In [2]:
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score
import numpy as np

# Names of the scalar metrics reported by p1_metrics, in the order they
# appear in its result dict (the optional 'confusion_matrix' entry is not listed).
metric_keys = [
    'auc',
    'f1',
    'accuracy',
    'sensitivity',
    'specificity',
    'precision',
]

def p1_metrics(y_true,y_pred,include_cm=True):
    """Compute a set of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels. The same hard predictions are passed to every
        metric, including ``roc_auc_score``.
    include_cm : bool, default True
        When true, the confusion matrix itself is added to the result under
        the ``'confusion_matrix'`` key.

    Returns
    -------
    dict
        Keys ``'auc'``, ``'f1'``, ``'accuracy'``, ``'sensitivity'``,
        ``'specificity'`` and ``'precision'`` (in that order), followed by
        ``'confusion_matrix'`` when ``include_cm`` is true.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels seen in
        ``y_true``/``y_pred`` do not form exactly two classes.
    """
    cm = confusion_matrix(y_true,y_pred)
    # Fail with an explicit message instead of an opaque tuple-unpacking error
    # when the inputs are not a two-class problem.
    if cm.shape != (2, 2):
        raise ValueError(
            "p1_metrics expects exactly two classes; got a confusion matrix "
            "of shape {}".format(cm.shape))
    # Flattened 2x2 matrix unpacks as: true neg, false pos, false neg, true pos.
    tn, fp, fn, tp = cm.ravel()

    # Build the scalar metrics once; both return variants share them.
    # Ratios with a zero denominator are left to numpy's division behaviour.
    results = {
        'auc': roc_auc_score(y_true,y_pred),
        'f1': f1_score(y_true,y_pred),
        'accuracy': (tp+tn)/np.sum(cm),
        'sensitivity': tp/(tp+fn),
        'specificity': tn/(tn+fp),
        'precision': tp/(tp+fp)}
    if include_cm:
        results['confusion_matrix'] = cm
    return results

In [3]:
# Read the 2009 SAS extract into a DataFrame, then preview its first rows.
census_2009 = pd.read_sas(
    "../../../../CTP/spm_pu_2009.sas7bdat"
)
census_2009.head(n=5)

Unnamed: 0,filedate,serialno,sporder,st,puma,wt,age,sex,mar,education,...,spm_fedtax,spm_fedtaxbc,spm_eitc,spm_fica,spm_sttax,spm_capwkccxpns,spm_wkxpns,spm_childcarexpns,spm_medxpns,spm_premium
0,20211015.0,2.0,1.0,39.0,3901500.0,76.0,60.0,2.0,1.0,3.0,...,3405.0,4205.0,0.0,1984.0,1513.85,925.65,925.65,0.0,9780.04632,3585.154188
1,20211015.0,2.0,2.0,39.0,3901500.0,78.0,61.0,1.0,1.0,2.0,...,3405.0,4205.0,0.0,1984.0,1513.85,925.65,925.65,0.0,9780.04632,3585.154188
2,20211015.0,3.0,1.0,13.0,1304200.0,51.0,60.0,1.0,2.0,2.0,...,9562.3799,9962.3799,0.0,4092.0,3375.5,1220.175,1220.175,0.0,13400.0,10000.0
3,20211015.0,4.0,1.0,36.0,3600401.0,9.0,79.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,-172.7,0.0,0.0,0.0,4985.75,3235.75
4,20211015.0,4.0,2.0,36.0,3600401.0,10.0,75.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,-172.7,0.0,0.0,0.0,4985.75,3235.75


In [4]:
# Columns retained for modelling. 'offpoor' is included here because it is
# used afterwards as the stratification column and prediction target.
features = [
    'age', 'sex', 'mar', 'education', 'race', 'hispanic',
    'offpoor',
    'moop_other', 'agi',
    'spm_numkids', 'spm_wcohabit', 'spm_totval', 'spm_capwkccxpns',
]
# Restrict the census frame to those columns, keeping every row.
df = census_2009.loc[:, features]
df.head(n=5)

Unnamed: 0,age,sex,mar,education,race,hispanic,offpoor,moop_other,agi,spm_numkids,spm_wcohabit,spm_totval,spm_capwkccxpns
0,60.0,2.0,1.0,3.0,1.0,0.0,0.0,2525.0,53300.0,0.0,0.0,53300.0,925.65
1,61.0,1.0,1.0,2.0,1.0,0.0,0.0,3669.892132,53300.0,0.0,0.0,53300.0,925.65
2,60.0,1.0,2.0,2.0,1.0,0.0,0.0,3400.0,66000.0,0.0,0.0,66000.0,1220.175
3,79.0,1.0,1.0,1.0,1.0,0.0,1.0,500.0,0.0,0.0,0.0,12900.0,0.0
4,75.0,2.0,1.0,1.0,1.0,0.0,1.0,1250.0,0.0,0.0,0.0,12900.0,0.0


In [5]:
# Count missing values in each selected column.
df.isna().sum()

age                0
sex                0
mar                0
education          0
race               0
hispanic           0
offpoor            0
moop_other         0
agi                0
spm_numkids        0
spm_wcohabit       0
spm_totval         0
spm_capwkccxpns    0
dtype: int64

In [6]:
# Number of rows that exactly repeat an earlier row across the selected columns.
# Open question: should these duplicates be dropped?
df.duplicated(keep='first').sum()

43533

In [7]:
# One stratified 75/25 shuffle split on 'offpoor', seeded for repeatability.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=12)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(df, df['offpoor']))
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

In [8]:
# Separate the 'offpoor' target from the predictor columns in each partition.
Y_train = train_set['offpoor']
X_train = train_set.drop(columns='offpoor')

Y_test = test_set['offpoor']
X_test = test_set.drop(columns='offpoor')


In [9]:
#model = GaussianNB()
#model.fit(X_train, Y_train)
# Recorded score: 0.58 (NOTE(review): metric and data split not stated — confirm)

In [10]:
# Logistic regression fitted on the training partition; the iteration cap is
# raised to 999 for the lbfgs solver.
model = LogisticRegression(max_iter=999, solver='lbfgs')
model.fit(X=X_train, y=Y_train)
# Recorded score: 0.8 (NOTE(review): metric and data split not stated — confirm)

LogisticRegression(max_iter=999)

In [11]:
#model = rfc()
#model.fit(X_train, Y_train)
# Result: the random forest overfits.

In [12]:
# Predict on the same rows the model was fitted on, so the metrics below are
# in-sample (training-set) figures.
y_pred = model.predict(X_train)
p1_metrics(y_true=Y_train, y_pred=y_pred)

{'auc': 0.8944250232253734,
 'f1': 0.8413149064971022,
 'accuracy': 0.9632935902184989,
 'sensitivity': 0.803546817164862,
 'specificity': 0.9853032292858849,
 'precision': 0.8828084285684975,
 'confusion_matrix': array([[1914657,   28559],
        [  52597,  215136]], dtype=int64)}