# Lab | Final regression model in "Health Care for All" Case

In [205]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

numerical = pd.read_csv('files_for_lab/numerical.csv')
categorical = pd.read_csv('files_for_lab/categorical.csv')
targets = pd.read_csv('files_for_lab/target.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [206]:
data.shape

(95412, 339)

In [207]:
# downsampling
from sklearn.utils import resample

category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]

category_0_undersampled = resample(category_0, replace=False, n_samples = len(category_1))
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)
data_downsampled = data_downsampled.sample(frac =1) #randomize the rows
data_downsampled = data_downsampled.reset_index(drop=True)


In [208]:
#OneHotEncoding
y = data_downsampled['TARGET_B']
X = data_downsampled.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

X = pd.concat([numericalX, encoded_categorical], axis = 1)

In [209]:
# Recursive Feature Elimination

from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=30, verbose=False)
rfe.fit(X, y)

# After we run the algorithm, it labels the top features as 1 and the rest are marked in an decreasing order of importance.
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(X).columns

#only select these 30 columns
X = X[list(df[df['Rank']==1].Column_name)]
X.head()

Unnamed: 0,POP90C4,POP90C5,ETH1,DW3,HHD5,HHD6,ETHC1,ETHC2,ETHC3,HUPA2,ANC6,RFA_2F,ODATEW_MM,LASTDATE_YR,TARGET_D,0,1,2,3,5,8,10,11,13,14,15,16,18,19,20
0,48,52,83,7,79,21,18,49,16,8,0,1,1,95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,49,51,82,0,88,12,12,41,30,0,0,1,1,95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,48,52,97,1,89,11,23,58,16,8,0,1,1,95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,40,60,93,0,61,39,3,38,52,0,0,3,1,95,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,50,50,83,0,95,5,27,54,2,0,0,1,1,95,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [210]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [211]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [212]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.6037687145069696
0.564499484004128


In [215]:
# For cross validation
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)

print(np.mean(cross_val_scores))

0.5782182212219722


In [216]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print('Model accuracy', accuracy_score(y_test, y_pred, normalize=True))
print(report)

Model accuracy 0.5588235294117647
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       949
           1       0.57      0.52      0.55       989

    accuracy                           0.56      1938
   macro avg       0.56      0.56      0.56      1938
weighted avg       0.56      0.56      0.56      1938



In [217]:
predicted_TARGET_B = pd.Series(clf.predict(X.drop(['TARGET_D'], axis = 1)),name='predicted_TARGET_B')
predicted_TARGET_B

0       0
1       0
2       0
3       1
4       0
       ..
9681    0
9682    0
9683    0
9684    0
9685    0
Name: predicted_TARGET_B, Length: 9686, dtype: int64

In [218]:
data = pd.concat([X,predicted_TARGET_B],axis=1)

In [219]:
data.predicted_TARGET_B.value_counts()

0    5216
1    4470
Name: predicted_TARGET_B, dtype: int64

In [220]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[564, 385],
       [470, 519]], dtype=int64)

In [221]:
#only selecting the data that the model has predicted will make a donation

data_regression = data[data['predicted_TARGET_B'] == 1].drop(['predicted_TARGET_B'], axis=1)
data_regression.shape

(4470, 30)

In [222]:
data_regression.head()

Unnamed: 0,POP90C4,POP90C5,ETH1,DW3,HHD5,HHD6,ETHC1,ETHC2,ETHC3,HUPA2,ANC6,RFA_2F,ODATEW_MM,LASTDATE_YR,TARGET_D,0,1,2,3,5,8,10,11,13,14,15,16,18,19,20
3,40,60,93,0,61,39,3,38,52,0,0,3,1,95,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,60,40,84,1,31,69,5,72,7,73,0,3,1,96,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,42,58,92,1,61,39,8,51,33,43,0,4,1,96,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,49,51,73,1,81,19,9,40,24,0,0,3,1,96,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13,48,52,94,0,95,5,25,58,11,0,0,2,1,95,11.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [223]:
data_regression.TARGET_D.value_counts()

0.00      1761
10.00      645
5.00       434
15.00      215
20.00      151
12.00      135
7.00       123
6.00       123
8.00       109
11.00      107
3.00        80
4.00        76
25.00       73
9.00        70
14.00       59
13.00       53
16.00       50
17.00       25
2.00        19
30.00       15
21.00       15
18.00       14
50.00       13
19.00       11
26.00       11
23.00       10
12.50       10
22.00        7
36.00        6
1.00         5
45.00        4
24.00        4
27.00        4
40.00        3
100.00       3
32.00        3
37.00        2
35.00        2
28.00        2
31.00        2
53.00        2
38.00        2
42.00        1
2.50         1
75.00        1
44.00        1
48.00        1
7.50         1
102.00       1
4.50         1
33.00        1
101.00       1
95.00        1
13.92        1
Name: TARGET_D, dtype: int64

In [224]:
X = data_regression.drop(['TARGET_D'], axis = 1)
y = data_regression['TARGET_D']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [225]:
from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor()
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
from sklearn.neighbors import KNeighborsRegressor
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_name] = mean_score
print(scores)

{'Decision Tree Regressor': -1.144123733775993, 'Linear Regression': 0.09141463322669251, 'KNN': -0.20686866339534565}
