In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score

In [8]:

numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')


In [9]:
X = pd.concat([numerical,categorical], axis = 1)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, target.drop('TARGET_D', axis = 1), test_size=0.2)

In [16]:
train_num  = X_train.select_dtypes(include = np.number)
train_cat = X_train.select_dtypes(include = object)

test_num  = X_test.select_dtypes(include = np.number)
test_cat = X_test.select_dtypes(include = object)

In [17]:
from sklearn.preprocessing import MinMaxScaler
# Fitting scaler
transformer = MinMaxScaler().fit(train_num)
# Scaling train and test data
train_num_scaled = pd.DataFrame(transformer.transform(train_num), columns = train_num.columns, index = train_num.index)
test_num_scaled = pd.DataFrame(transformer.transform(test_num), columns = test_num.columns, index = test_num.index)

In [18]:
from sklearn.preprocessing import OneHotEncoder
# Fit encoder
encoder = OneHotEncoder(handle_unknown='ignore').fit(train_cat)
# Getting the column names for the later selection
column_name = encoder.get_feature_names_out(train_cat.columns)
# Encode train and test
train_encoded = pd.DataFrame(encoder.transform(train_cat).toarray(),columns = column_name, index=train_cat.index)
test_encoded = pd.DataFrame(encoder.transform(test_cat).toarray(),columns = column_name, index=test_cat.index)

In [19]:
train_all = pd.concat([train_encoded, train_num_scaled, y_train['TARGET_B']], axis = 1)
X_test = pd.concat([test_encoded,test_num_scaled], axis = 1)

In [20]:
from sklearn.utils import resample
# Splitting into majority and minority class, the 'yes' donors are the minority class.
no = train_all[train_all['TARGET_B']==0]
yes = train_all[train_all['TARGET_B']==1]

In [21]:
# oversample minority
yes_oversampled = resample(yes, #<- sample from here
                                    replace=True, #<- we need replacement, since we don't have enough data otherwise
                                    n_samples = len(no),#<- make both sets the same size
                                    )

In [22]:
# Concatenate with majority class
train_oversampled = pd.concat([no,yes_oversampled],axis=0)

In [23]:
# X-y Split again
X_train_over = train_oversampled.drop('TARGET_B', axis = 1)
y_train_over = train_oversampled['TARGET_B']

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [26]:
# Defining the classifier
clf = RandomForestClassifier(max_depth=5)

In [27]:
def model_eval(model,X_train,y_train,X_test, y_test):
        # Fitting
        model.fit(X_train, y_train)
        # Predicting
        predictions = model.predict(X_test)
        # Calculating confusion matrix
        cm = confusion_matrix(y_test, predictions)
        # Printing different evaluation metrics
        print( str(model.base_estimator_)[:-2] + " score: ", model.score(X_test, y_test))
        print( str(model.base_estimator_)[:-2] + " precision: ",precision_score(y_test,predictions))
        print( str(model.base_estimator_)[:-2] + " recall: ",recall_score(y_test,predictions))
        print( str(model.base_estimator_)[:-2] + " f1: ",f1_score(y_test,predictions))
        print('\n')
        print(cm)
        return model

In [28]:
model_eval(clf,X_train_over,y_train_over,X_test, y_test)



DecisionTreeClassifier score:  0.6266834355185243
DecisionTreeClassifier precision:  0.06446629213483146
DecisionTreeClassifier recall:  0.49783080260303686
DecisionTreeClassifier f1:  0.11415070877891073


[[11500  6661]
 [  463   459]]




In [30]:
from sklearn.feature_selection import VarianceThreshold 
# set threshhold
sel = VarianceThreshold(threshold= 0.02)

In [31]:
# Fitting with our numericals
sel = sel.fit(train_num_scaled)

In [32]:
# Check wich columns have low variance
sel.get_support()
# Making it a list
var_list = list(sel.get_support())
# Creating a droplist
droplist_var = [col[0] for col in zip(train_num_scaled.columns, var_list) if col[1] == False]
print(droplist_var)
len(droplist_var)

['TCODE', 'HIT', 'MALEMILI', 'MALEVET', 'LOCALGOV', 'STATEGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C4', 'POP90C5', 'ETH3', 'ETH4', 'ETH5', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 'ETH10', 'ETH11', 'ETH12', 'ETH13', 'ETH14', 'ETH15', 'ETH16', 'AGE901', 'AGE902', 'AGE903', 'AGE904', 'AGE905', 'AGE906', 'AGE907', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'AGEC4', 'AGEC5', 'AGEC6', 'AGEC7', 'CHILC1', 'CHILC2', 'CHILC3', 'CHILC4', 'CHILC5', 'HHAGE1', 'HHAGE2', 'HHAGE3', 'HHN1', 'HHN2', 'HHN4', 'HHN5', 'HHN6', 'MARR1', 'MARR2', 'MARR3', 'MARR4', 'HHP1', 'HHP2', 'DW3', 'DW7', 'DW8', 'DW9', 'HU3', 'HU4', 'HHD1', 'HHD4', 'HHD6', 'HHD7', 'HHD8', 'HHD9', 'HHD10', 'HHD11', 'HHD12', 'ETHC1', 'ETHC3', 'ETHC4', 'ETHC5', 'ETHC6', 'HUR1', 'RHP1', 'RHP2', 'RHP3', 'RHP4', 'HUPA1', 'HUPA4', 'HUPA5', 'HUPA7', 'DMA', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC7', 'IC8', 'IC9', 'IC10', 'IC11', 'IC12', 'IC13', 'IC14', 'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23', 'HHAS1', '

240

In [33]:
# Trying our model without the low variance columns:
model_eval(clf,X_train_over.drop(droplist_var, axis = 1),y_train_over,X_test.drop(droplist_var, axis = 1), y_test)



DecisionTreeClassifier score:  0.632919352303097
DecisionTreeClassifier precision:  0.0639426523297491
DecisionTreeClassifier recall:  0.4837310195227766
DecisionTreeClassifier f1:  0.11295428643788781


[[11632  6529]
 [  476   446]]


In [34]:
# %%time

# from sklearn.feature_selection import RFE
# from sklearn import linear_model
# # Creating RFE model with Randomforest as Estimator
# rfe = RFE(clf, n_features_to_select=20, verbose=False)

# # Fitting
# rfe.fit(train_all.drop(droplist_var, axis = 1), y_train.values.ravel())

Wall time: 13min 6s


In [37]:

import pickle
pickle.dump(rfe, open('rfe.p', 'wb'))

In [38]:
rfe_l = pickle.load(open('rfe.p','rb'))

In [39]:
rfe_l

In [40]:
# Checking Ranking of columns
df = pd.DataFrame(data = rfe_l.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(train_all.drop(droplist_var, axis = 1)).columns
df = df[df['Column_name'] != 'TARGET_B']
df.sort_values(by = ['Rank'], ascending = False).head(20)

Unnamed: 0,Rank,Column_name
4,103,STATE_IN
17,102,RFA_2R_L
8,101,STATE_TX
22,100,GEOCODE2_A
2,99,STATE_GA
37,98,POP90C1
23,97,GEOCODE2_B
16,96,GENDER_other
1,95,STATE_FL
3,94,STATE_IL


In [41]:
for r in range(1,20,4):
    droplist_rfe = list(df[df['Rank'] < r]['Column_name'])
    droplist_full = droplist_var
    droplist_full.extend(droplist_rfe)
    # Creating dataframes without the eliminated columns
    temp_train = X_train_over.drop(droplist_full, axis = 1).copy()
    temp_test = X_test.drop(droplist_full, axis = 1).copy()
    
    # Checking results:
    print('\n\n\n Testing with columns ranking higher than ' +str(r))
    model_eval(clf,temp_train,y_train_over,temp_test, y_test)




 Testing with columns ranking higher than 1




DecisionTreeClassifier score:  0.6297751925797831
DecisionTreeClassifier precision:  0.06426443467158462
DecisionTreeClassifier recall:  0.4913232104121475
DecisionTreeClassifier f1:  0.11366202484004514


[[11565  6596]
 [  469   453]]



 Testing with columns ranking higher than 5




DecisionTreeClassifier score:  0.5813551328407484
DecisionTreeClassifier precision:  0.05221138005322519
DecisionTreeClassifier recall:  0.44685466377440347
DecisionTreeClassifier f1:  0.09349824123453988


[[10682  7479]
 [  510   412]]



 Testing with columns ranking higher than 9




DecisionTreeClassifier score:  0.5970759314573181
DecisionTreeClassifier precision:  0.05439220334518636
DecisionTreeClassifier recall:  0.447939262472885
DecisionTreeClassifier f1:  0.09700528479154433


[[10981  7180]
 [  509   413]]



 Testing with columns ranking higher than 13




DecisionTreeClassifier score:  0.6089713357438558
DecisionTreeClassifier precision:  0.05413144259612762
DecisionTreeClassifier recall:  0.43058568329718006
DecisionTreeClassifier f1:  0.09617248062015503


[[11224  6937]
 [  525   397]]



 Testing with columns ranking higher than 17




DecisionTreeClassifier score:  0.5846040978881727
DecisionTreeClassifier precision:  0.0545593285005723
DecisionTreeClassifier recall:  0.46529284164859
DecisionTreeClassifier f1:  0.0976664769493455


[[10727  7434]
 [  493   429]]




In [67]:
final_model = clf.fit(temp_train, y_train_over)   

In [68]:
numerical  = X.select_dtypes(include = np.number)
categorical = X.select_dtypes(include = object)

In [69]:
num_scaled = pd.DataFrame(transformer.transform(numerical), columns = train_num.columns, index = numerical.index)

In [70]:
encoded = pd.DataFrame(encoder.transform(categorical).toarray(),columns = column_name, index=categorical.index)

In [71]:
X_all = pd.concat([encoded,num_scaled], axis = 1)

In [72]:
X_all_selected = X_all.drop(droplist_full,axis = 1)

In [73]:
X['predicted_donate'] = clf.predict(X_all_selected)

In [74]:
clf.predict(X_all_selected)

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [75]:
predictions = clf.predict(X_all_selected)
# Calculating confusion matrix
cm = confusion_matrix(target['TARGET_B'], predictions)
# Printing different evaluation metrics
print( str(clf.base_estimator_)[:-2] + " score: ", clf.score(X_all_selected, target['TARGET_B']))
print( str(clf.base_estimator_)[:-2] + " precision: ",precision_score(target['TARGET_B'],predictions))
print( str(clf.base_estimator_)[:-2] + " recall: ",recall_score(target['TARGET_B'],predictions))
print( str(clf.base_estimator_)[:-2] + " f1: ",f1_score(target['TARGET_B'],predictions))
cm

ValueError: Found input variables with inconsistent numbers of samples: [95412, 4843]

In [None]:
df_all = pd.concat([X,target], axis = 1)

In [None]:
df_all

In [64]:
## Lab Case Regression

In [65]:
# Dataframe for building the model
df_regr_build = df_all[df_all['TARGET_B']==1]
# Dataframe for making predictions
df_regr_pred = df_all[df_all['predicted_donate']==1]

KeyError: 'predicted_donate'

In [76]:
X = df_regr_build.drop(['TARGET_D', 'TARGET_B', 'predicted_donate'], axis = 1)
y = df_regr_build['TARGET_D']

KeyError: "['predicted_donate'] not found in axis"

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
train_num  = X_train.select_dtypes(include = np.number)
train_cat = X_train.select_dtypes(include = object)

test_num  = X_test.select_dtypes(include = np.number)
test_cat = X_test.select_dtypes(include = object)

In [57]:
from sklearn.preprocessing import MinMaxScaler
# Fitting scaler
transformer = MinMaxScaler().fit(train_num)
# Scaling train and test data
train_num_scaled = pd.DataFrame(transformer.transform(train_num), columns = train_num.columns, index = train_num.index)
test_num_scaled = pd.DataFrame(transformer.transform(test_num), columns = test_num.columns, index = test_num.index)

In [58]:
train_encoded = pd.concat([train_encoded,train_num_scaled], axis = 1)
test_encoded = pd.concat([test_encoded,test_num_scaled], axis = 1)

In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [60]:
def model_eval_regression(model,X_train,y_train,X_test, y_test):
        # Fitting
        model.fit(X_train, y_train)
        # Predicting
        predictions = model.predict(X_test)
        # Printing different evaluation metrics
        print( str(model.base_estimator_)[:-2] + " R2-score: ", r2_score(y_test, predictions))
        print( str(model.base_estimator_)[:-2] + " MSE: ",mean_squared_error(y_test,predictions))
        print( str(model.base_estimator_)[:-2] + " RMSE: ",np.sqrt(mean_squared_error(y_test,predictions)))
        print( str(model.base_estimator_)[:-2] + " MAE: ",mean_absolute_error(y_test, predictions))
        print('\n')
        return model

In [61]:
# Defining a model
RfR = RandomForestRegressor(max_depth=5)
model_eval_regression(RfR,train_encoded,y_tra

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_14668/1940459091.py, line 3)

In [None]:
model_eval_regression(RfR,train_encoded,y_train,test_encoded, y_test)

In [None]:
# Fitting
sel_regr = VarianceThreshold(threshold= 0.01)
sel_regr = sel_regr.fit(train_encoded)

In [None]:
# Check wich columns have low variance
sel_regr.get_support()
# Making it a list
var_list = list(sel_regr.get_support())
# Creating a droplist
droplist_var_regr = [col[0] for col in zip(train_encoded.columns, var_list) if col[1] == False]
print(droplist_var_regr)
len(droplist_var_regr)

In [None]:
# Checking result:
model_eval_regression(RfR, train_encoded.drop(droplist_var_regr, axis = 1), y_train, test_encoded.drop(droplist_var_re

In [None]:
# Continuing with this
train_var = train_encoded.drop(droplist_var_regr, axis = 1).copy()
test_var = test_encoded.drop(droplist_var_regr, axis = 1).copy()

In [None]:
RfR = RandomForestRegressor()

In [None]:
max_depth_choices= [2,3,4,5,6,7,9,10,None]
criterion_choices = ['squared_error']
min_samples_split_choices = [1,2,5,20,50,100,200]
min_samples_leaf_choices = [2,5,20,50,100,200]
max_features_choices = ["auto", "sqrt", "log2", None, 10]



grid = {'max_depth': max_depth_choices,
               'criterion': criterion_choices,
               'min_samples_split': min_samples_split_choices,
               'min_samples_leaf': min_samples_leaf_choices,
               'max_features': max_features_choices,
               'bootstrap' : [True,False]}

In [None]:
# Grid search
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator = RfR, param_grid = grid, cv = 6)

In [62]:
pickle.dump(grid_search, open('grid_search.p', 'wb'))

NameError: name 'grid_search' is not defined