In [1]:
# Importing libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Importing data (excluding TARGET_D)
donors = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Downloads_Ironhack\learningSet.txt")
donors = donors.drop("TARGET_D", axis = 1)
donors

Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_B,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,ASE,1,AK,99504,,,0,0,,...,0,0,L,1,G,X,X,X,12.0,C
95408,9601,DCD,1,TX,77379,,,5001,0,,...,0,1,L,1,F,X,X,X,2.0,A
95409,9501,MBC,1,MI,48910,,,3801,0,,...,0,1,L,3,E,X,X,X,34.0,B
95410,8601,PRV,0,CA,91320,,,4005,0,X,...,1,1,L,4,F,X,X,X,11.0,A


In [3]:
# Creating a new dataframe with numerical data and dropping null values 
# I decided to perform the model with only numerical data since there are already hundreds of numerical variables
numerical = donors.select_dtypes(include=np.number)
numerical = numerical.dropna(axis = 1)
numerical

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,TARGET_B,HPHONE_D,RFA_2F
0,8901,0,3712,0,0,39,34,18,10,2,...,12.0,9402,10.0,9512,8911,7.741935,95515,0,0,4
1,9401,1,5202,16,0,15,55,11,6,2,...,25.0,9512,25.0,9512,9310,15.666667,148535,0,0,2
2,9001,1,0,2,0,20,29,33,6,8,...,16.0,9207,5.0,9512,9001,7.481481,15078,0,1,4
3,8701,0,2801,2,0,23,14,31,3,0,...,11.0,9411,10.0,9512,8702,6.812500,172556,0,1,4
4,8601,0,2001,60,1,28,9,53,26,3,...,15.0,9601,15.0,9601,7903,6.864865,7112,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,1,0,0,14,36,47,11,7,8,...,25.0,9602,25.0,9602,9602,25.000000,184568,0,0,1
95408,9601,1,5001,1,0,31,43,19,4,1,...,20.0,9603,20.0,9603,9603,20.000000,122706,0,1,1
95409,9501,1,3801,0,0,18,46,20,7,23,...,10.0,9501,10.0,9610,9410,8.285714,189641,0,1,3
95410,8601,0,4005,0,0,28,35,20,9,1,...,21.0,9608,18.0,9701,8612,12.146341,4693,1,1,4


In [4]:
# Checking null values
numerical.isna().sum().sum()

0

In [5]:
# Defining X and y variables
X = numerical.drop("TARGET_B",axis = 1)
y = numerical["TARGET_B"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

# Train dataframe
df_train = pd.concat([X_train, y_train], axis = 1)
df_train = df_train.reset_index(drop = True)
df_train

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,9601,28,0,0,0,29,36,21,7,1,...,15.0,9602,15.0,9602,9602,15.000000,90621,1,1,0
1,9001,0,3401,0,0,23,23,28,2,4,...,30.0,9211,15.0,9509,9010,14.545455,18950,0,2,0
2,8801,1,3501,2,0,25,27,33,3,4,...,10.0,9601,10.0,9601,8809,5.687500,59041,1,2,0
3,8801,2,2601,0,0,34,19,30,7,4,...,20.0,9411,10.0,9506,8808,6.888889,82274,1,1,0
4,8901,0,0,0,0,37,46,38,6,0,...,17.0,9512,17.0,9512,8910,11.857143,140742,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,9509,0,0,0,0,0,0,0,0,0,...,25.0,9509,25.0,9509,9509,25.000000,165969,0,1,0
76325,9301,1,3601,6,2,40,31,32,6,1,...,16.0,9403,14.0,9512,9301,12.500000,60296,0,2,0
76326,9101,0,5401,0,2,40,55,14,5,7,...,20.0,9412,15.0,9505,9106,11.583333,16765,0,2,0
76327,9401,0,0,0,0,30,22,47,8,4,...,15.0,9601,15.0,9601,9402,9.000000,105296,1,1,0


In [6]:
# Test dataframe
df_test = pd.concat([X_test, y_test], axis = 1)
df_test = df_test.reset_index(drop = True)
df_test

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,9601,1,5801,11,0,28,33,19,3,14,...,25.0,9509,25.0,9602,9509,25.000000,133586,1,2,0
1,9101,0,2303,0,1,20,33,34,10,3,...,10.0,9509,10.0,9509,9109,6.538462,150220,0,3,0
2,9401,0,4601,8,0,39,19,8,13,1,...,6.0,9409,4.0,9508,9401,4.700000,67579,0,4,0
3,9301,0,4401,0,0,26,20,24,15,2,...,20.0,9412,20.0,9507,9301,15.000000,147358,0,1,0
4,9101,1,4201,2,0,52,8,76,9,2,...,25.0,9502,25.0,9511,9103,15.750000,135732,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,9401,2,0,0,0,33,30,34,4,3,...,20.0,9506,7.0,9509,9402,8.500000,48008,0,3,0
19079,9401,1,3201,0,0,8,0,26,0,0,...,20.0,9508,20.0,9512,9401,13.000000,121222,0,2,0
19080,9401,0,4701,0,4,35,34,42,11,2,...,20.0,9507,20.0,9507,9404,10.666667,108280,1,1,0
19081,9601,0,5901,1,1,29,31,44,5,1,...,20.0,9509,20.0,9603,9509,20.000000,175377,0,3,0


In [7]:
# Standardizing X_train variables
scaler = StandardScaler()
X_train_stand = scaler.fit_transform(df_train.drop("TARGET_B", axis = 1))
X_train_stand = pd.DataFrame(X_train_stand, columns = df_train.drop("TARGET_B", axis = 1).columns)
X_train_stand

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MINRDATE,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F
0,1.337488,-0.027017,-1.276055,-0.361560,-0.207353,-0.125646,0.417324,-0.658816,0.034684,-0.696876,...,1.307347,-0.188132,0.927558,-0.167504,1.095727,1.452087,0.154316,-0.096287,0.998678,-0.849602
1,-0.408079,-0.056202,0.318579,-0.361560,-0.207353,-0.650032,-0.444213,-0.263431,-1.109011,-0.111252,...,-0.897900,0.375719,-1.333416,-0.167504,-0.794295,-0.390478,0.112266,-1.393723,-1.001324,0.082301
2,-0.989934,-0.055160,0.365466,-0.141287,-0.207353,-0.475237,-0.179124,0.018987,-0.880272,-0.111252,...,-0.191473,-0.376083,0.921775,-0.533228,1.075404,-1.016079,-0.707178,-0.667969,0.998678,0.082301
3,-0.989934,-0.054118,-0.056519,-0.361560,-0.207353,0.311343,-0.709301,-0.150464,0.034684,-0.111252,...,0.556068,-0.000182,-0.176908,-0.533228,-0.855263,-1.019192,-0.596038,-0.247390,0.998678,-0.849602
4,-0.699007,-0.056202,-1.276055,-0.361560,-0.207353,0.573536,1.080045,0.301405,-0.194055,-0.892084,...,-1.279146,-0.112952,0.407129,-0.021215,-0.733327,-0.701723,-0.136428,0.811036,0.998678,-0.849602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,1.069834,-0.056202,-1.276055,-0.361560,-0.207353,-2.660181,-1.968470,-1.844971,-1.566490,-0.892084,...,0.959740,0.187769,0.389781,0.563944,-0.794295,1.162630,1.079409,1.267712,-1.001324,-0.849602
76325,0.464704,-0.055160,0.412353,0.299261,0.192699,0.835730,0.085964,-0.037497,-0.194055,-0.696876,...,0.182297,-0.150542,-0.223168,-0.240649,-0.733327,0.515242,-0.076958,-0.645250,-1.001324,0.082301
76326,-0.117151,-0.056202,1.256323,-0.361560,0.192699,0.835730,1.676493,-1.054201,-0.422794,0.474372,...,-0.546555,-0.000182,-0.171125,-0.167504,-0.875586,-0.091684,-0.161758,-1.433277,-1.001324,0.082301
76327,0.755632,-0.056202,-1.276055,-0.361560,-0.207353,-0.038248,-0.510485,0.809757,0.263424,-0.111252,...,0.559806,-0.188132,0.921775,-0.167504,1.075404,0.829599,-0.400741,0.169369,0.998678,-0.849602


In [8]:
# Standardizing X_test variables
scaler = StandardScaler()
X_test_stand = scaler.fit_transform(df_test.drop("TARGET_B", axis = 1))
X_test_stand = pd.DataFrame(X_test_stand, columns = df_test.drop("TARGET_B", axis = 1).columns)
X_test_stand

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MINRDATE,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F
0,1.341479,-0.058451,1.440904,0.740300,-0.205895,-0.213755,0.218088,-0.763405,-0.854129,1.835258,...,1.300274,0.284628,0.382788,0.504873,1.090775,1.176152,1.092328,0.693450,0.999110,0.090044
1,-0.119012,-0.059525,-0.201334,-0.341991,-0.017981,-0.904366,0.218088,0.082742,0.711782,-0.304003,...,-0.162116,-0.565788,0.382788,-0.492307,-0.793582,-0.086591,-0.647829,0.993408,-1.000891,1.023401
2,0.757283,-0.059525,0.877529,0.445130,-0.205895,0.735834,-0.707161,-1.383912,1.382886,-0.692959,...,0.548508,-0.792566,-0.196566,-0.891180,-0.813844,0.835212,-0.821120,-0.496842,-1.000891,1.956758
3,0.465185,-0.059525,0.783633,-0.341991,-0.205895,-0.386408,-0.641072,-0.481356,1.830289,-0.498481,...,0.174495,0.001156,-0.179185,0.172480,-0.834106,0.519526,0.149743,0.941798,-1.000891,-0.843313
4,-0.119012,-0.058451,0.689737,-0.145211,-0.205895,1.858076,-1.434143,2.451953,0.488080,-0.498481,...,-0.169597,0.284628,0.342233,0.504873,-0.753058,-0.105532,0.220437,0.732149,0.999110,1.023401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.757283,-0.057377,-1.282545,-0.341991,-0.205895,0.217876,0.019820,0.082742,-0.630427,-0.304003,...,0.552248,0.001156,0.365408,-0.691744,-0.793582,0.838369,-0.462937,-0.849762,-1.000891,1.023401
19079,0.757283,-0.058451,0.220258,-0.341991,-0.205895,-1.940281,-1.962857,-0.368536,-1.525233,-0.887437,...,0.548508,0.001156,0.376995,0.172480,-0.732797,0.835212,-0.038774,0.470492,-1.000891,0.090044
19080,0.757283,-0.059525,0.924477,-0.341991,0.545762,0.390529,0.284177,0.534020,0.935483,-0.498481,...,0.559729,0.001156,0.371201,0.172480,-0.834106,0.844682,-0.258710,0.237112,0.999110,-0.843313
19081,1.341479,-0.059525,1.487852,-0.243601,-0.017981,-0.127429,0.085909,0.646840,-0.406726,-0.692959,...,1.304014,0.001156,0.382788,0.172480,1.111037,1.176152,0.621036,1.447060,-1.000891,1.023401


In [9]:
# Gathering all "Train" variables together again
df_train = pd.concat([X_train_stand, df_train["TARGET_B"]], axis = 1)
df_train

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,1.337488,-0.027017,-1.276055,-0.361560,-0.207353,-0.125646,0.417324,-0.658816,0.034684,-0.696876,...,-0.188132,0.927558,-0.167504,1.095727,1.452087,0.154316,-0.096287,0.998678,-0.849602,0
1,-0.408079,-0.056202,0.318579,-0.361560,-0.207353,-0.650032,-0.444213,-0.263431,-1.109011,-0.111252,...,0.375719,-1.333416,-0.167504,-0.794295,-0.390478,0.112266,-1.393723,-1.001324,0.082301,0
2,-0.989934,-0.055160,0.365466,-0.141287,-0.207353,-0.475237,-0.179124,0.018987,-0.880272,-0.111252,...,-0.376083,0.921775,-0.533228,1.075404,-1.016079,-0.707178,-0.667969,0.998678,0.082301,0
3,-0.989934,-0.054118,-0.056519,-0.361560,-0.207353,0.311343,-0.709301,-0.150464,0.034684,-0.111252,...,-0.000182,-0.176908,-0.533228,-0.855263,-1.019192,-0.596038,-0.247390,0.998678,-0.849602,0
4,-0.699007,-0.056202,-1.276055,-0.361560,-0.207353,0.573536,1.080045,0.301405,-0.194055,-0.892084,...,-0.112952,0.407129,-0.021215,-0.733327,-0.701723,-0.136428,0.811036,0.998678,-0.849602,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,1.069834,-0.056202,-1.276055,-0.361560,-0.207353,-2.660181,-1.968470,-1.844971,-1.566490,-0.892084,...,0.187769,0.389781,0.563944,-0.794295,1.162630,1.079409,1.267712,-1.001324,-0.849602,0
76325,0.464704,-0.055160,0.412353,0.299261,0.192699,0.835730,0.085964,-0.037497,-0.194055,-0.696876,...,-0.150542,-0.223168,-0.240649,-0.733327,0.515242,-0.076958,-0.645250,-1.001324,0.082301,0
76326,-0.117151,-0.056202,1.256323,-0.361560,0.192699,0.835730,1.676493,-1.054201,-0.422794,0.474372,...,-0.000182,-0.171125,-0.167504,-0.875586,-0.091684,-0.161758,-1.433277,-1.001324,0.082301,0
76327,0.755632,-0.056202,-1.276055,-0.361560,-0.207353,-0.038248,-0.510485,0.809757,0.263424,-0.111252,...,-0.188132,0.921775,-0.167504,1.075404,0.829599,-0.400741,0.169369,0.998678,-0.849602,0


In [10]:
# Gathering all "Test" variables together again
df_test = pd.concat([X_test_stand, df_test["TARGET_B"]], axis = 1)
df_test

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,1.341479,-0.058451,1.440904,0.740300,-0.205895,-0.213755,0.218088,-0.763405,-0.854129,1.835258,...,0.284628,0.382788,0.504873,1.090775,1.176152,1.092328,0.693450,0.999110,0.090044,0
1,-0.119012,-0.059525,-0.201334,-0.341991,-0.017981,-0.904366,0.218088,0.082742,0.711782,-0.304003,...,-0.565788,0.382788,-0.492307,-0.793582,-0.086591,-0.647829,0.993408,-1.000891,1.023401,0
2,0.757283,-0.059525,0.877529,0.445130,-0.205895,0.735834,-0.707161,-1.383912,1.382886,-0.692959,...,-0.792566,-0.196566,-0.891180,-0.813844,0.835212,-0.821120,-0.496842,-1.000891,1.956758,0
3,0.465185,-0.059525,0.783633,-0.341991,-0.205895,-0.386408,-0.641072,-0.481356,1.830289,-0.498481,...,0.001156,-0.179185,0.172480,-0.834106,0.519526,0.149743,0.941798,-1.000891,-0.843313,0
4,-0.119012,-0.058451,0.689737,-0.145211,-0.205895,1.858076,-1.434143,2.451953,0.488080,-0.498481,...,0.284628,0.342233,0.504873,-0.753058,-0.105532,0.220437,0.732149,0.999110,1.023401,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.757283,-0.057377,-1.282545,-0.341991,-0.205895,0.217876,0.019820,0.082742,-0.630427,-0.304003,...,0.001156,0.365408,-0.691744,-0.793582,0.838369,-0.462937,-0.849762,-1.000891,1.023401,0
19079,0.757283,-0.058451,0.220258,-0.341991,-0.205895,-1.940281,-1.962857,-0.368536,-1.525233,-0.887437,...,0.001156,0.376995,0.172480,-0.732797,0.835212,-0.038774,0.470492,-1.000891,0.090044,0
19080,0.757283,-0.059525,0.924477,-0.341991,0.545762,0.390529,0.284177,0.534020,0.935483,-0.498481,...,0.001156,0.371201,0.172480,-0.834106,0.844682,-0.258710,0.237112,0.999110,-0.843313,0
19081,1.341479,-0.059525,1.487852,-0.243601,-0.017981,-0.127429,0.085909,0.646840,-0.406726,-0.692959,...,0.001156,0.382788,0.172480,1.111037,1.176152,0.621036,1.447060,-1.000891,1.023401,0


In [11]:
# Computing the frequency of the unique values of the target variable before oversampling or undersampling the train dataset
df_train["TARGET_B"].value_counts()

0    72464
1     3865
Name: TARGET_B, dtype: int64

# Imbalanced data

In [12]:
# Building a logistic model using the imbalanced data
X_imbalanced = df_train.drop("TARGET_B", axis = 1)
y_imbalanced = df_train["TARGET_B"]

LR_1 = LogisticRegression()
LR_1.fit(X_imbalanced, y_imbalanced)
pred_1 = LR_1.predict(df_test.drop("TARGET_B", axis = 1))
print(classification_report(df_test["TARGET_B"], pred_1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18105
           1       0.00      0.00      0.00       978

    accuracy                           0.95     19083
   macro avg       0.47      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083



# Oversampling

In [13]:
# Oversampling df_train with SMOTE method
smote = SMOTE()
x_sm, y_sm = smote.fit_resample(df_train.drop("TARGET_B", axis = 1), df_train["TARGET_B"])

# Creating a balanced dataframe
data_smote = pd.concat([x_sm, y_sm], axis = 1)
data_smote

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,1.337488,-0.027017,-1.276055,-0.361560,-0.207353,-0.125646,0.417324,-0.658816,0.034684,-0.696876,...,-0.188132,0.927558,-0.167504,1.095727,1.452087,0.154316,-0.096287,0.998678,-0.849602,0
1,-0.408079,-0.056202,0.318579,-0.361560,-0.207353,-0.650032,-0.444213,-0.263431,-1.109011,-0.111252,...,0.375719,-1.333416,-0.167504,-0.794295,-0.390478,0.112266,-1.393723,-1.001324,0.082301,0
2,-0.989934,-0.055160,0.365466,-0.141287,-0.207353,-0.475237,-0.179124,0.018987,-0.880272,-0.111252,...,-0.376083,0.921775,-0.533228,1.075404,-1.016079,-0.707178,-0.667969,0.998678,0.082301,0
3,-0.989934,-0.054118,-0.056519,-0.361560,-0.207353,0.311343,-0.709301,-0.150464,0.034684,-0.111252,...,-0.000182,-0.176908,-0.533228,-0.855263,-1.019192,-0.596038,-0.247390,0.998678,-0.849602,0
4,-0.699007,-0.056202,-1.276055,-0.361560,-0.207353,0.573536,1.080045,0.301405,-0.194055,-0.892084,...,-0.112952,0.407129,-0.021215,-0.733327,-0.701723,-0.136428,0.811036,0.998678,-0.849602,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144923,1.317015,-0.055160,-0.222487,-0.254007,0.378649,0.490239,1.509750,0.028261,0.917448,0.279164,...,-0.181960,0.909703,-0.155494,1.032978,1.427146,0.159379,1.051945,-1.001324,-0.849602,1
144924,-0.618971,-0.055629,-0.553583,-0.361560,-0.207353,-0.243577,-0.152503,0.175615,0.515134,-0.413866,...,0.310057,0.377787,0.801898,-0.836450,-0.640997,-0.409231,-0.923608,0.448464,-0.080484,1
144925,-1.571790,-0.055773,0.042237,0.600544,-0.124998,0.524683,-0.678132,0.683507,-0.012404,1.657060,...,-0.164917,-2.015125,-0.778491,-0.737510,-1.641039,-0.288354,-0.572747,0.586957,1.206046,1
144926,0.616908,0.491190,-0.441798,-0.309043,-0.207353,1.040052,-0.049651,-0.166929,1.551077,0.846700,...,-0.277754,0.393342,-0.341894,0.148520,0.531381,-0.440566,1.454464,-0.047654,0.039125,1


In [14]:
# Computing the frequency of the unique values of the target variable after oversampling the train dataframe
data_smote["TARGET_B"].value_counts()

0    72464
1    72464
Name: TARGET_B, dtype: int64

In [15]:
# Building a logistic model with data_smote
X_train_smote = data_smote.drop("TARGET_B", axis = 1)
y_train_smote = data_smote["TARGET_B"]

LR_2 = LogisticRegression()
LR_2.fit(X_train_smote, y_train_smote)
pred_2 = LR_2.predict(df_test.drop("TARGET_B", axis = 1))
print(classification_report(df_test["TARGET_B"], pred_2))

              precision    recall  f1-score   support

           0       0.96      0.61      0.75     18105
           1       0.07      0.51      0.12       978

    accuracy                           0.61     19083
   macro avg       0.51      0.56      0.43     19083
weighted avg       0.91      0.61      0.72     19083



# Undersampling

In [16]:
# Undersampling (removing random rows)
category_0_under = df_train[df_train["TARGET_B"] == 0]
category_1_under = df_train[df_train["TARGET_B"] == 1]
category_0_under = category_0_under.sample(len(category_1_under))

data_under = pd.concat([category_0_under, category_1_under], axis = 0)
data_under = data_under.reset_index(drop = True)
data_under

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,0.755632,-0.056202,-1.276055,-0.361560,-0.207353,-0.912226,-1.570838,1.769978,-0.422794,-0.111252,...,-0.338493,0.389781,-0.679518,-0.733327,0.546367,-0.546113,-0.142304,0.998678,1.014205,0
1,-0.408079,-0.054118,-0.009163,1.510766,1.792906,0.835730,1.080045,-0.715300,-0.651533,-0.111252,...,-0.225722,0.921775,-0.240649,1.075404,-0.393591,-0.350281,1.544248,-1.001324,0.082301,0
2,-1.571790,-0.055160,1.260074,0.299261,-0.207353,0.748332,1.345133,-0.828267,-0.422794,0.083956,...,0.375719,-1.905887,0.563944,-0.733327,-1.324211,0.976621,1.421422,0.998678,-0.849602,0
3,-0.989934,-0.055160,-0.150293,0.519534,-0.207353,-0.125646,2.272942,-1.393103,1.178380,-0.306460,...,-0.188132,-0.766727,-0.386938,-0.753649,-1.009854,-0.175635,1.175262,-1.001324,0.082301,0
4,1.046560,-0.027017,-1.276055,-0.361560,-0.207353,-1.087021,-1.305750,-0.545849,-1.109011,0.083956,...,-0.000182,0.360869,-0.167504,1.095727,0.857611,0.154316,-1.427973,0.998678,0.082301,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7725,1.046560,-0.056202,-0.525391,-0.031150,-0.207353,0.311343,-0.576757,0.244921,1.407119,-0.111252,...,-0.188132,0.407129,-0.167504,1.095727,1.153293,0.038679,0.843838,0.998678,1.946108,1
7726,0.173777,-0.055160,0.177917,0.739808,-0.207353,-0.650032,0.814956,-0.828267,3.237033,-0.696876,...,-0.188132,0.921775,-0.167504,1.075404,0.232010,-0.292813,0.857922,0.998678,0.082301,1
7727,0.173777,0.965276,1.818970,2.061450,-0.207353,-0.125646,-1.305750,0.979208,-0.651533,-0.501668,...,-0.000182,0.383999,0.198220,-0.814618,0.207111,-0.154049,0.876151,-1.001324,0.082301,1
7728,-1.280862,-0.056202,0.834338,-0.361560,-0.207353,0.398741,-0.775573,-0.263431,0.492163,-0.892084,...,-0.225722,0.921775,-0.533228,1.116050,-1.349111,-0.473139,-0.785039,0.998678,1.946108,1


In [17]:
# Computing the frequency of the unique values of the target variable after undersampling the train dataset
data_under["TARGET_B"].value_counts()

0    3865
1    3865
Name: TARGET_B, dtype: int64

In [19]:
# Building a logistic model with data_under
X_under = data_under.drop("TARGET_B", axis = 1)
y_under = data_under["TARGET_B"]

LR_3 = LogisticRegression()
LR_3.fit(X_under, y_under)
pred_3 = LR_3.predict(df_test.drop("TARGET_B", axis = 1))
print(classification_report(df_test["TARGET_B"], pred_3))

              precision    recall  f1-score   support

           0       0.96      0.58      0.72     18105
           1       0.07      0.58      0.12       978

    accuracy                           0.58     19083
   macro avg       0.52      0.58      0.42     19083
weighted avg       0.92      0.58      0.69     19083



In [20]:
# The best model was the one made with imbalanced data.
# However, this is not a reliable model because it predicted all the results as 0.
# Since the test data was also imbalanced (with a lot of zeros), the model made the right precision in 95% of the cases.
# The results of the remainig model were really poor.
# One reason for that may be the enormous number of variables incorporated into the models.