In [1]:
# Importing libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Importing data (excluding TARGET_D)
donors = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Downloads_Ironhack\learningSet.txt")
donors = donors.drop("TARGET_D", axis = 1)
donors

Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_B,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,ASE,1,AK,99504,,,0,0,,...,0,0,L,1,G,X,X,X,12.0,C
95408,9601,DCD,1,TX,77379,,,5001,0,,...,0,1,L,1,F,X,X,X,2.0,A
95409,9501,MBC,1,MI,48910,,,3801,0,,...,0,1,L,3,E,X,X,X,34.0,B
95410,8601,PRV,0,CA,91320,,,4005,0,X,...,1,1,L,4,F,X,X,X,11.0,A


In [3]:
# Creating a new dataframe with numerical data and dropping null values 
# I decided to perform the model with only numerical data since there are already hundreds of numerical variables
numerical = donors.select_dtypes(include=np.number)
numerical = numerical.dropna(axis = 1)
numerical

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,TARGET_B,HPHONE_D,RFA_2F
0,8901,0,3712,0,0,39,34,18,10,2,...,12.0,9402,10.0,9512,8911,7.741935,95515,0,0,4
1,9401,1,5202,16,0,15,55,11,6,2,...,25.0,9512,25.0,9512,9310,15.666667,148535,0,0,2
2,9001,1,0,2,0,20,29,33,6,8,...,16.0,9207,5.0,9512,9001,7.481481,15078,0,1,4
3,8701,0,2801,2,0,23,14,31,3,0,...,11.0,9411,10.0,9512,8702,6.812500,172556,0,1,4
4,8601,0,2001,60,1,28,9,53,26,3,...,15.0,9601,15.0,9601,7903,6.864865,7112,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,1,0,0,14,36,47,11,7,8,...,25.0,9602,25.0,9602,9602,25.000000,184568,0,0,1
95408,9601,1,5001,1,0,31,43,19,4,1,...,20.0,9603,20.0,9603,9603,20.000000,122706,0,1,1
95409,9501,1,3801,0,0,18,46,20,7,23,...,10.0,9501,10.0,9610,9410,8.285714,189641,0,1,3
95410,8601,0,4005,0,0,28,35,20,9,1,...,21.0,9608,18.0,9701,8612,12.146341,4693,1,1,4


In [4]:
# Computing the frequency of the unique values from the the target variable
numerical["TARGET_B"].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

# Oversampling

In [5]:
# Oversampling with SMOTE method
smote = SMOTE()
X = numerical.drop("TARGET_B", axis = 1)
y = numerical["TARGET_B"]
x_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

# Standardizing independent variables
scaler = StandardScaler()
stand_data_over = scaler.fit_transform(x_sm)
stand_data_over = pd.DataFrame(stand_data_over, columns = x_sm.columns)

# Creating a balanced dataframe
data_smote = pd.concat([stand_data_over, y_sm], axis = 1)
data_smote

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,-0.675493,-0.061296,0.495312,-0.387301,-0.209754,0.805418,0.329545,-0.885342,0.850820,-0.497621,...,-0.335830,-0.184816,-0.479827,-0.848953,-0.678085,-0.472733,-0.028829,-0.801681,2.113660,0
1,0.885931,-0.060170,1.228469,1.478564,-0.209754,-1.445428,1.832152,-1.307672,-0.142394,-0.497621,...,0.279256,0.482803,0.660072,-0.848953,0.661066,0.292301,0.905196,-0.801681,0.103020,0
2,-0.363208,-0.060170,-1.331186,-0.154068,-0.209754,-0.976502,-0.028219,0.019652,-0.142394,0.764369,...,-0.146573,-1.368322,-0.859793,-0.848953,-0.376021,-0.497877,-1.445844,1.247379,2.113660,0
3,-1.300063,-0.061296,0.047052,-0.154068,-0.209754,-0.695146,-1.101511,-0.101014,-0.887305,-0.918284,...,-0.383144,-0.130192,-0.479827,-0.848953,-1.379545,-0.562458,1.328361,1.247379,2.113660,0
4,-1.612347,-0.061296,-0.346590,6.609691,0.020067,-0.226220,-1.459275,1.226309,4.823675,-0.287289,...,-0.193887,1.022968,-0.099861,1.045001,-4.061203,-0.557403,-1.586177,1.247379,0.103020,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181133,-1.462451,-0.059045,-0.343637,0.895481,-0.209754,-1.070287,-0.385983,0.019652,-0.142394,-0.918284,...,-0.627333,-0.894919,-0.884262,2.811273,-1.530577,-0.816627,-1.578337,-0.801681,2.113660,1
181134,1.076425,-0.061296,0.225175,-0.270684,-0.209754,-2.570851,-1.960144,-1.790335,-1.383912,-0.918284,...,0.018673,0.531357,0.241540,-0.678710,0.906073,0.175684,-0.190759,-0.801681,-0.902301,1
181135,0.483084,-0.044415,-1.331186,-0.387301,-0.209754,-0.038649,-0.099772,-0.764676,0.354213,-0.287289,...,-0.275195,0.015470,-1.011225,1.130122,0.553665,-0.377214,1.546787,-0.801681,-0.902301,1
181136,-1.334414,-0.060170,1.311134,0.079165,-0.209754,-2.477066,-1.387722,-0.221680,-0.639001,-0.707952,...,-0.088387,0.112578,-0.310380,-0.019018,-1.389614,-0.422650,1.248294,-0.801681,-0.902301,1


In [6]:
# Building a logistic model with data_smote
X = data_smote.drop("TARGET_B",axis = 1)
y = data_smote["TARGET_B"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)
LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18054
           1       1.00      0.94      0.97     18174

    accuracy                           0.97     36228
   macro avg       0.97      0.97      0.97     36228
weighted avg       0.97      0.97      0.97     36228



# Undersampling

In [17]:
# Undersampling (removing random rows)
category_0_under = numerical[numerical["TARGET_B"] == 0]
category_1_under = numerical[numerical["TARGET_B"] == 1]
category_0_under = category_0_under.sample(len(category_1_under))
data_under = pd.concat([category_0_under, category_1_under], axis = 0)
data_under = data_under.reset_index(drop = True)

# Standardizing independent variables
scaler = StandardScaler() # This is the standardization process
X_under = data_under.drop(["TARGET_B"], axis = 1)
stand_data_under = scaler.fit_transform(X_under) # Fit it to the data
stand_data_under = pd.DataFrame(stand_data_under, columns = X_under.columns)

# Creating a balanced dataframe
data_under = pd.concat([stand_data_under, data_under["TARGET_B"]], axis = 1)
data_under

Unnamed: 0,ODATEDW,TCODE,DOB,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,...,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,TARGET_B
0,-0.050497,-0.065617,0.527238,-0.384030,-0.200813,0.980796,0.753263,-0.212164,0.050550,0.291976,...,0.031195,0.394535,0.247390,-0.938617,-0.016425,-0.350508,-1.324092,-0.998762,-0.94887,0
1,1.109332,-0.064715,0.718355,0.723258,-0.200813,0.550098,-1.364485,0.892687,-0.639392,0.094526,...,-0.444097,0.422297,-0.526155,0.962235,1.225803,-0.462130,0.253962,1.001240,0.83505,0
2,1.109332,-0.040374,1.817274,-0.384030,-0.200813,-0.397437,0.025287,-0.046437,1.200453,0.094526,...,-0.170804,0.938666,-0.051480,0.981831,0.921520,-0.276758,0.744344,-0.998762,-0.05691,0
3,-0.050497,-0.064715,-0.428343,-0.162572,-0.200813,0.550098,-0.702689,0.340261,0.510511,-0.892724,...,0.031195,0.377878,-1.088732,1.138603,-0.019562,-0.254832,1.569166,1.001240,-0.94887,0
4,1.109332,-0.063814,1.435041,-0.273301,1.302629,0.205540,1.282700,-1.261773,-0.179430,-0.497824,...,-0.158922,0.944218,-0.033899,1.001428,1.213256,0.016250,-1.135898,-0.998762,-0.94887,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9681,-1.500284,-0.063814,1.245358,-0.384030,-0.200813,1.239215,-0.107072,0.229776,0.510511,-0.497824,...,-0.111392,0.933114,0.036423,0.962235,-1.591169,-0.468285,1.006842,-0.998762,-0.94887,1
9682,0.819375,-0.065617,0.957250,-0.273301,-0.012883,0.119400,0.885622,-0.488377,0.050550,0.094526,...,-0.206451,-0.171806,-0.104221,-0.781845,0.610963,-0.111318,1.302330,1.001240,-0.94887,1
9683,-1.500284,-0.065617,-0.758497,-0.384030,-0.012883,0.119400,-0.570329,-0.377892,0.510511,-0.695274,...,0.268842,0.433401,0.247390,0.981831,-1.268064,0.150196,-0.328181,-0.998762,0.83505,1
9684,-0.340454,-0.065617,0.622797,-0.384030,-0.200813,-0.569717,1.084161,-0.709347,-0.179430,-0.695274,...,0.031195,-0.671518,0.247390,0.962235,-0.352078,-0.119291,-0.225083,-0.998762,-0.94887,1


In [18]:
# Building a logistic model with data_under
X = data_under.drop("TARGET_B",axis = 1)
y = data_under["TARGET_B"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)
LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.59      0.58      0.58       985
           1       0.57      0.58      0.58       953

    accuracy                           0.58      1938
   macro avg       0.58      0.58      0.58      1938
weighted avg       0.58      0.58      0.58      1938



In [None]:
# Regarding these two tests, the oversample method was the best