For this lab, you will be using the CSV files provided in the files_for_lab folder. These are cleaned versions of the learningSet data from the Case Study 'Healthcare for All'.

Instructions: 

- Apply the Random Forests algorithm but this time only by upscaling the data.

- Use Feature Selections that you have learned in class to decide if you want to use all of the features (PCA, etc)

- Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the bussiness?

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
numerical=pd.read_csv('numerical.csv')
categorical=pd.read_csv('categorical.csv')
targets=pd.read_csv('target.csv')

## 1) Apply the Random Forests algorithm but this time only by upscaling the data. 

In [5]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [6]:
data.isna().sum().sum()

0

In [7]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# it is not needed for a DecisionTree or RandomForest model
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# Note: we need to do train/test split before downsampling, and then only downsample the training set - Why?
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
from sklearn.utils import resample

# for upsampling we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)

# quicker way to upsample category 1:

category_0 = trainset[trainset['TARGET_B']== 0 ]
category_1_upsampled = resample(trainset[trainset['TARGET_B']==1], replace=True, n_samples = len(category_0))


trainset_new = pd.concat([category_0, category_1_upsampled], axis = 0)
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
#data = data.reset_index(drop=True)
print(X_train.shape)

(144972, 355)


In [9]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, min_samples_split=20, min_samples_leaf =20, max_samples=0.2,random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6236997489170323
0.5897919614316407


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10696,  7387],
       [  441,   559]], dtype=int64)

## Use Feature Selections that you have learned in class to decide if you want to use all of the features (PCA, etc)

### 1) Feature Selection using K Best: Used already on Wednesday Lab therefore would go for different !!
#### Only writing the steps !!

In [None]:
# we will use MinMaxScaler for this
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical)

In [None]:
X = numerical_scaled
y = target['TARGET_B']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
kbest = SelectKBest(chi2, k=10).fit_transform(X, y)
# Here we choose 10 so that is easier to analyze results later, as we will see
selected = pd.DataFrame(kbest)
selected.head()

In [None]:
# To check the scores
model = SelectKBest(chi2, k=10).fit(X, y)
df = pd.DataFrame(data = model.scores_, columns = ['score'])
df['Column'] = numerical.columns
print(df.sort_values(by = ['score'], ascending = False).head(10))

In [None]:
cols = df.sort_values(by = ['score'], ascending = False).head(10)['Column']
cols

In [None]:
selected.columns = cols
selected.head()

In [None]:
donors = pd.concat([selected, categorical, target], axis=1)
donors

### 2) Recursive Feature Elimination
##### Only writing the steps !!

In [None]:
X = numerical_scaled
y = targets['TARGET_D']

from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=20, verbose=False)
rfe.fit(X, y)

In [None]:
df2 = pd.DataFrame(data={'Column': numerical.columns,'Ranking': rfe.ranking_} )
df2.head()

In [None]:
# After we run the algorithm, it labels the top features as 1 and the rest are marked in an increasing order of importance.
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(numerical).columns
df[df['Rank']<11]
#df['Rank'].value_counts()

## PCA - Principal Component Analysis

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
scaler = StandardScaler()
#scaler.fit(X_train)
#X_train_scaled = scaler.transform(X_train)
#same thing in 1 line
X_train_scaled = scaler.fit_transform(X_train)

In [11]:
X_test_scaled = scaler.transform(X_test)

In [14]:
from sklearn.decomposition import PCA

In [15]:
pca = PCA(0.90)
pca.fit(X_train_scaled)

In [16]:
pca.explained_variance_ratio_

array([0.11745628, 0.08770275, 0.07221561, 0.03914188, 0.03568083,
       0.02730114, 0.0225066 , 0.02089261, 0.01894096, 0.01719657,
       0.01325714, 0.01223543, 0.01131993, 0.01047225, 0.01018715,
       0.00999825, 0.00946273, 0.00875642, 0.00834929, 0.0080928 ,
       0.00762756, 0.00716111, 0.00692991, 0.00658901, 0.00624533,
       0.00596209, 0.00550632, 0.00536359, 0.00523727, 0.00514022,
       0.00510738, 0.00486138, 0.00479492, 0.00465645, 0.00457095,
       0.00448988, 0.00442626, 0.00433231, 0.004279  , 0.00421023,
       0.004146  , 0.00405051, 0.00397396, 0.00393703, 0.0039106 ,
       0.00383419, 0.00374189, 0.00370734, 0.00366294, 0.00359345,
       0.00353545, 0.00342365, 0.00339629, 0.00333324, 0.00326642,
       0.00325456, 0.00323784, 0.00320835, 0.00317515, 0.00309926,
       0.00305887, 0.00303434, 0.00299023, 0.00297024, 0.00294262,
       0.00291292, 0.00289165, 0.00285728, 0.0028549 , 0.00281458,
       0.00280639, 0.00277575, 0.00275813, 0.002751  , 0.00269

In [17]:
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, min_samples_split=20, min_samples_leaf =20, max_samples=0.2,random_state = 42)
clf.fit(X_train_pca, y_train)
print(clf.score(X_train_pca, y_train))
print(clf.score(X_test_pca, y_test))

y_pred = clf.predict(X_test_pca)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6551885881411583
0.673950636692344


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[12434,  5649],
       [  573,   427]], dtype=int64)

In [None]:
# After PCA ==> Random Forest
# 12434 nr. prediction was that there is no donation, & in actual it was also the same. 
# 5649 nr. prediction that there was donation & in actuality it was not

# Before PCA ==> Random Forest
# 10696 nr. prediction was that there is no donation, & in actual it was also the same. 
# 7387 nr. prediction that there was donation & in actuality it was not


# After PCA, results are improved when compared to without PCA !!

In [None]:
# without PCA
0.6236997489170323
0.5897919614316407
0    18083
1     1000
Name: TARGET_B, dtype: int64
array([[10696,  7387],
       [  441,   559]], dtype=int64)

In [None]:
# predicted | 0 | 1 | 
# --------------------------
# actual  0 | + |  | 
# --------------------------
#         1 |   | +

### 3) Discuss the output and its impact in the bussiness scenario.

#### Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm            or data in order to maximize the return of the bussiness?

# Lab | Final regression model in "Health Care for All" Case

## Instructions
#### 1) At this point, we have created a model to predict who will make a donation and who won't. But, what about the ammount of money that each person will give? In this lab, subset those that made a donation and use that subset to create a model to predict how much money will they give.



In [20]:
data1 = pd.concat([numerical, categorical, targets], axis = 1)
data1['TARGET_D'].value_counts()

0.00     90569
10.00      941
15.00      591
20.00      577
5.00       503
         ...  
18.25        1
10.70        1
2.50         1
16.87        1
44.21        1
Name: TARGET_D, Length: 71, dtype: int64

In [21]:
data_d= data1[data1['TARGET_B']==1]

In [22]:
data_d

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
20,2,62.000000,3,8,10,2,25,40,27,11,4,1,9,2707,672,929,99,0,0,45,55,97,1,0,2,1,0,0,0,0,1,0,0,1,0,0,0,35,42,43,37,47,50,27,34,43,23,9,20,27,16,8,6,15,15,13,37,21,14,16,8,15,23,30,47,29,11,2,55,10,13,22,190,267,89,49,1,10,10,8,8,8,0,902,960,8,8,88,12,97,3,11,42,72,60,34,79,21,8,1,7,12,21,5,21,58,17,0,1,0,0,4,35,74,99,0,1,64,63,62,14,4,2,8,0,5,1,7,0,95,96,96,96,5120.0,107.0,613.0,468,501,540,580,20328,6,8,19,23,23,11,7,1,2,4,6,13,27,25,14,8,1,1,14,2,66,1,61,39,20,74,17,2,2,0,0,0,2,4,1,75,20,21,1,67,71,82,62,80,60,80,65,99,0,1,21,20,8,11,17,1,1,10,0,5,3,2,1,0,0,3,23,5,2,4,12,7,3,2,1,14,12,7,5,11,4,1,4,70,10,0,140,5,3,24,21,9,32,7,3,23,3,16,7,1,2,0,12,25,3,40,20,27,7,0,0,2,12,0,0,3,1,4,1,0,0,0,4,0,1,58,95,1,0,4,95,69,23,4,13,2,17,35,91,99,1,12,3,93,0,5,1,0,1,99,1,97,99,99,9,2,6,3,26,65,5,12,61.00,15,10,2.00,7.0,5.0,12,4.066667,82943,1,3,3,other,12,H,F,3,L,D,A,S,1,87,1,36,1,88,1,94,4,96,3,87,1,1,4.0
30,0,61.611649,5,9,0,1,37,58,16,8,1,5,9,2147,591,640,99,0,0,49,51,94,2,0,3,5,1,0,1,1,0,0,0,3,0,0,2,29,36,37,27,38,40,37,46,37,17,8,34,34,15,6,2,1,18,21,31,19,11,5,0,3,7,23,71,47,15,6,77,5,1,17,286,334,99,99,0,0,0,0,0,0,0,1002,1166,7,5,97,3,96,4,3,63,92,88,59,97,3,4,1,3,5,4,1,30,60,4,1,2,0,4,15,50,94,99,2,0,91,70,72,20,4,0,0,0,3,0,0,0,78,83,99,99,2920.0,201.0,618.0,633,638,652,663,19703,2,5,1,17,48,19,7,0,1,3,4,0,16,51,19,7,0,1,5,0,65,3,74,26,25,89,9,0,0,0,0,0,1,2,1,82,34,42,22,70,82,98,67,97,66,68,61,99,99,0,34,23,4,10,12,0,2,9,0,4,1,1,0,0,4,2,25,5,2,4,8,8,8,0,3,10,8,9,4,8,1,5,5,76,4,1,160,0,1,12,25,10,39,13,5,33,6,25,7,1,1,0,19,37,2,58,11,16,10,0,4,1,15,0,0,3,2,0,2,0,0,0,1,0,3,48,97,1,1,1,99,91,24,3,6,6,57,85,95,95,5,0,0,75,0,20,0,0,5,96,4,90,99,99,12,4,1,6,10,24,0,2,68.00,11,6,2.00,10.0,7.0,9,6.181818,190313,1,3,14,TX,35,H,M,3,L,D,A,T,1,90,1,0,2,90,4,93,1,95,12,90,4,1,7.0
45,0,66.000000,5,9,5,0,33,24,39,6,5,1,9,2160,683,900,89,0,11,48,52,99,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,42,47,49,40,48,50,20,33,42,25,7,14,23,21,17,13,5,14,13,33,22,18,24,7,22,22,43,35,19,6,1,72,7,5,16,165,240,69,58,1,28,27,4,0,0,0,1282,1399,5,4,96,4,97,3,8,27,76,71,25,89,11,2,0,2,9,18,2,16,62,21,0,0,0,15,33,74,88,98,2,0,70,67,67,13,3,28,0,0,2,2,0,0,39,57,71,89,1360.0,173.0,637.0,550,637,607,703,26007,5,11,19,10,24,17,5,3,5,1,5,16,12,28,22,6,5,6,27,1,76,1,61,39,12,89,7,0,0,0,0,0,0,3,1,5,13,16,3,39,67,79,56,78,52,67,67,0,0,0,25,24,4,13,12,0,1,6,0,9,1,3,1,1,0,3,21,2,4,3,19,7,5,3,2,7,12,8,4,6,5,1,11,70,6,0,151,2,4,21,20,7,31,15,7,14,4,13,4,0,0,0,16,33,1,24,26,39,3,0,7,0,22,0,0,5,0,1,0,0,0,0,0,1,2,66,97,0,0,2,98,67,23,4,11,8,40,49,81,93,7,36,0,82,7,8,4,0,0,84,16,78,99,97,11,3,9,8,31,74,5,13,102.00,21,14,3.00,6.0,5.0,3,4.857143,76585,1,3,11,other,24,H,F,3,L,D,C,C,1,86,1,31,10,93,12,94,4,96,2,87,4,1,5.0
78,0,69.000000,6,9,0,0,34,20,54,2,3,1,9,13801,3736,6388,99,0,0,48,52,97,0,0,3,3,0,1,1,0,0,0,0,2,0,0,1,40,44,46,42,47,49,14,40,37,23,8,21,21,17,13,13,7,18,15,30,19,18,26,9,24,29,44,27,13,4,1,54,13,6,27,148,216,78,58,8,20,13,8,0,0,0,5000,5471,10,10,63,37,94,6,30,18,59,50,15,72,28,4,1,2,20,27,13,11,63,23,0,0,0,99,99,99,99,99,95,4,51,59,56,12,4,15,5,0,20,10,6,0,92,94,95,96,5945.0,13.0,803.0,738,963,797,959,50907,9,7,8,11,15,13,10,6,20,4,4,5,6,16,17,11,9,28,23,1,60,5,55,45,9,85,6,0,0,0,0,0,2,7,1,10,18,25,7,63,67,78,58,76,56,55,44,73,95,3,24,29,2,25,9,1,0,4,1,3,0,1,1,1,0,5,12,2,1,5,13,21,7,3,3,9,7,12,1,4,3,0,16,72,4,1,160,1,3,9,23,7,36,21,4,17,1,10,11,0,0,0,16,32,1,23,18,45,5,1,9,1,10,0,1,6,3,1,1,0,1,1,1,0,9,48,91,3,2,5,97,67,23,5,27,0,9,14,38,63,37,8,1,86,1,12,0,0,1,99,0,99,99,99,8,3,7,6,22,61,6,13,132.00,12,5,5.00,17.0,10.0,21,11.000000,156378,0,2,2,CA,13,H,F,2,L,F,A,S,1,90,1,28,7,90,1,95,3,95,11,90,1,1,13.0
93,1,73.000000,1,7,10,0,21,53,8,5,4,11,7,1673,418,462,99,0,0,49,51,7,93,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,39,42,31,40,45,30,32,41,26,18,18,29,20,9,4,3,14,13,29,26,18,14,2,10,8,21,72,50,24,13,47,15,4,34,300,358,98,98,0,0,0,0,0,0,0,685,698,5,5,89,11,98,2,0,55,90,60,36,94,6,19,2,16,7,15,2,0,3,3,24,65,4,0,0,4,32,90,0,0,84,67,68,20,5,0,0,0,10,0,0,0,69,84,98,99,520.0,197.0,524.0,409,422,436,463,12546,8,13,13,35,20,10,1,0,0,4,10,15,37,21,12,1,0,0,27,3,32,4,39,61,7,71,12,14,14,0,0,0,0,2,0,57,31,35,11,77,78,91,68,78,66,73,46,54,99,18,7,18,3,8,17,0,4,14,0,8,5,10,6,0,0,3,15,10,7,2,17,8,6,2,0,9,6,4,11,5,4,11,3,70,6,0,120,5,17,38,17,1,16,7,8,21,3,20,6,0,0,0,10,21,0,53,18,8,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,71,97,1,0,2,96,76,31,13,28,0,0,0,8,66,34,0,0,81,2,17,0,0,0,99,0,98,99,99,7,2,7,3,19,46,6,14,94.00,10,8,5.00,12.0,12.0,6,9.400000,25641,1,3,22,GA,18,H,M,3,L,E,A,S,2,92,1,24,10,92,9,95,9,95,9,92,9,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,2,3,2,2649,671,1098,0,99,1,46,54,94,1,1,1,8,0,0,0,0,0,0,0,6,0,0,1,39,50,55,42,52,55,23,45,37,18,10,17,16,11,11,14,20,18,19,33,16,14,37,17,35,33,37,30,17,7,3,49,17,16,17,147,226,79,75,6,20,14,8,6,6,0,653,752,4,4,57,43,88,12,33,28,61,44,17,72,28,11,3,9,18,27,6,18,45,30,0,1,0,2,5,14,33,83,1,10,18,43,44,12,4,16,4,1,23,11,8,0,13,38,79,94,6780.0,13.0,803.0,194,218,252,292,12177,40,21,13,17,7,1,1,1,0,26,29,16,20,6,0,1,2,0,36,14,34,17,60,40,13,79,19,0,0,0,0,0,0,2,0,31,17,26,9,47,41,54,31,50,29,44,42,35,0,20,12,16,4,8,11,0,5,16,1,17,2,1,7,0,0,19,5,5,5,2,15,7,6,5,3,11,3,8,5,9,2,3,12,68,6,0,120,11,24,30,20,7,5,2,2,19,1,15,5,0,0,0,20,45,2,28,12,37,12,2,5,2,12,0,0,5,2,1,1,0,0,1,1,0,5,35,95,3,0,2,92,42,15,4,22,0,8,12,43,71,29,0,0,65,11,20,0,0,3,94,4,17,99,95,6,2,8,8,33,81,6,13,238.07,30,16,0.07,17.0,17.0,7,7.935667,154544,0,1,52,CA,36,H,F,3,L,F,A,T,2,86,1,53,4,89,6,96,1,96,1,86,8,1,20.0
95309,0,51.000000,5,6,1,1,32,43,24,7,5,6,6,8361,2324,3112,99,0,0,50,50,90,1,1,7,9,1,1,1,3,0,0,0,6,0,0,2,30,35,37,31,41,43,28,50,36,14,10,36,24,11,8,8,4,22,20,33,15,10,17,5,14,19,36,46,27,9,3,66,12,4,18,187,266,70,68,1,22,21,19,1,0,1,1693,1692,5,6,72,28,87,13,8,42,75,64,35,85,15,7,2,5,14,15,7,22,57,11,0,1,0,23,68,91,97,99,0,4,49,56,55,14,4,14,8,7,12,3,13,0,58,80,93,97,6920.0,67.0,862.0,423,467,457,492,17493,8,13,16,28,27,6,3,0,1,5,11,13,28,32,7,2,0,1,16,6,48,4,86,14,17,83,11,0,0,0,0,0,3,2,0,59,22,26,5,67,77,86,68,82,66,76,56,92,99,1,13,18,8,16,15,0,2,8,1,13,3,2,2,1,0,10,14,4,2,3,18,9,5,2,1,8,6,6,9,7,5,6,8,70,4,0,140,2,9,23,25,12,22,6,5,22,3,17,7,1,1,2,16,32,1,43,8,24,16,1,4,1,10,1,1,6,3,1,1,1,0,1,1,0,6,58,90,3,3,3,98,71,19,1,4,36,76,81,85,86,14,1,1,68,1,31,0,0,0,99,0,99,99,98,12,2,3,4,13,36,4,10,35.00,3,2,5.00,15.0,15.0,4,11.666667,171302,1,1,20,CA,12,H,F,3,L,F,B,S,1,94,1,47,1,93,10,94,2,95,12,93,10,1,15.0
95398,0,86.000000,5,9,0,1,32,21,26,9,1,0,9,2368,651,930,99,0,0,50,50,85,12,0,3,1,1,0,1,1,0,0,0,1,0,0,1,36,42,45,37,44,47,21,34,39,27,12,21,21,19,14,9,4,16,13,28,24,19,18,5,16,26,33,41,25,10,4,61,7,4,28,172,254,69,65,0,30,30,29,0,0,0,934,975,5,5,75,25,98,2,10,29,70,64,27,85,15,3,1,2,16,17,4,14,57,14,2,9,1,1,2,37,86,99,0,3,58,61,57,14,4,7,24,0,2,1,22,0,30,94,97,97,5080.0,111.0,617.0,476,529,511,562,20261,6,9,16,21,31,12,3,1,1,2,8,17,18,34,15,4,1,1,25,2,74,1,32,68,5,87,8,0,0,0,0,0,2,2,1,18,20,22,2,67,75,80,70,78,69,83,60,89,0,3,25,9,4,13,11,0,3,7,0,14,8,3,3,1,1,3,23,2,2,5,19,5,7,1,2,8,12,6,2,9,1,0,5,76,9,0,140,4,4,27,23,8,20,14,5,20,0,15,9,0,1,0,16,32,0,21,26,26,5,0,1,0,34,1,0,1,1,0,1,0,0,0,0,0,5,72,92,1,3,4,98,72,24,9,19,0,1,13,53,89,11,12,7,79,0,18,2,0,0,99,0,99,99,99,8,4,8,5,29,71,6,16,144.00,10,4,5.00,25.0,20.0,15,14.400000,78831,0,3,3,WI,11,H,F,3,L,G,B,S,1,86,1,11,10,89,6,95,11,96,2,87,11,1,3.0
95403,0,58.000000,4,9,0,0,24,46,20,6,1,2,5,1663,450,581,0,1,99,50,50,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,40,43,33,44,47,32,37,43,20,13,24,23,13,10,8,9,13,17,33,24,12,23,11,23,20,30,51,34,15,4,68,5,6,20,205,286,81,81,2,10,9,7,0,0,0,585,606,3,2,79,21,97,3,13,45,77,70,40,90,10,6,1,4,12,12,3,28,57,15,0,0,0,0,0,8,25,62,0,1,65,64,63,15,4,7,4,8,9,3,7,2,0,2,23,82,0.0,107.0,613.0,281,342,326,376,11157,27,16,17,22,12,4,1,0,0,14,18,19,26,15,5,2,0,0,23,2,46,9,38,62,4,69,15,0,0,0,0,0,4,12,0,53,22,23,1,71,71,78,63,75,62,78,66,79,95,3,9,5,7,10,15,1,2,13,15,12,6,4,3,16,0,6,12,5,1,11,12,3,4,3,1,18,4,4,2,6,1,2,17,59,13,2,120,12,9,45,12,9,11,2,2,29,3,24,4,0,0,0,13,25,1,43,19,21,8,0,1,0,44,0,0,3,0,6,0,0,0,0,1,0,0,85,97,1,0,2,97,75,27,9,28,3,10,17,40,50,50,0,0,35,28,11,16,0,10,51,48,46,99,97,7,4,5,6,22,51,4,8,139.00,12,6,3.00,20.0,20.0,10,11.583333,84678,0,1,56,other,49,H,F,2,L,F,D,R,2,90,1,40,1,90,3,93,12,96,1,90,3,1,10.0


In [103]:
data_d['TARGET_D'].info

<bound method Series.info of 20        4.0
30        7.0
45        5.0
78       13.0
93       10.0
         ... 
95298    20.0
95309    15.0
95398     3.0
95403    10.0
95410    18.0
Name: TARGET_D, Length: 4843, dtype: float64>

In [86]:
y= data_d ['TARGET_D']
X= data_d.drop(['TARGET_B', 'TARGET_D'], axis = 1)

In [87]:
y.shape

(4843,)

In [88]:
X.shape

(4843, 337)

In [89]:
X = pd.DataFrame(X)
X.shape

(4843, 337)

In [90]:
X_numerical = X.select_dtypes(np.number)
X_categorical = X.select_dtypes(np.object)

# X_categorical.shape
X_numerical.shape

(4843, 330)

In [92]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_categorical)
encoded_categorical = encoder.transform(X_categorical).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
# encoded_categorical.shape
encoded_categorical = encoded_categorical.reset_index(drop=True)
X_numerical = X_numerical.reset_index(drop=True)
X_concat = pd.concat([X_numerical, encoded_categorical], axis = 1)
X_concat.shape

(4843, 354)

In [110]:
model1 = DecisionTreeRegressor()
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
from sklearn.neighbors import KNeighborsRegressor
model3 = KNeighborsRegressor()
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = StandardScaler()
X = scaler.fit_transform(X_concat)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)


model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[model_name] = mean_score
print(scores)

# We can use the result to choose the best performing model

{'Decision Tree Regressor': -0.062380326806291554, 'Linear Regression': 0.26404664726574784, 'KNN': 0.05137267026692192}


In [109]:
from sklearn import neighbors
clf = neighbors.KNeighborsRegressor(n_neighbors=7, weights='uniform')
clf.fit(X_train, y_train)
predictions_clf = clf.predict(X_val)
clf.score(X_val, y_val)

0.16412563993815044

#### 2) Evaluate the result of your model and estimate how much better the result are for the bussiness in comparison with the naive scenario we discuss on Monday.