In [24]:
# Import the required libraries and modules that you would need.

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Concatenate numerical and categorical back together again for your X dataframe. 
# Designate the Target as y.

numerical=pd.read_csv("learningSet_numerical_clean.csv")
categorical=pd.read_csv("learningSet_categorical_clean.csv")
target=pd.read_csv("learningSet_Y.csv")

In [3]:
target=target.drop(columns="TARGET_D")

In [4]:
numerical.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,8901,0,3712,60.0,0.0,9.0,0,0,39,34,...,9402,10.0,9512,8911,4.0,7.741935,95515,0,4,39.0
1,9401,1,5202,46.0,6.0,9.0,16,0,15,55,...,9512,25.0,9512,9310,18.0,15.666667,148535,0,2,1.0
2,9001,1,0,61.611649,3.0,1.0,2,0,20,29,...,9207,5.0,9512,9001,12.0,7.481481,15078,1,4,60.0
3,8701,0,2801,70.0,1.0,4.0,2,0,23,14,...,9411,10.0,9512,8702,9.0,6.8125,172556,1,4,41.0
4,8601,0,2001,78.0,3.0,2.0,60,1,28,9,...,9601,15.0,9601,7903,14.0,6.864865,7112,1,2,26.0


In [5]:
# Scale the features either by using normalizer or a standard scaler.

scaler = MinMaxScaler()

for col in numerical.columns:
    numerical[col] = scaler.fit_transform(numerical[[col]])

numerical

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.426523,0.000000,0.382286,0.608247,0.000000,1.000000,0.000000,0.000000,0.393939,0.343434,...,0.863139,0.010,0.045226,0.927939,0.003676,0.006465,0.498045,0.0,1.000000,0.622951
1,0.784946,0.000014,0.535736,0.463918,0.857143,1.000000,0.066390,0.000000,0.151515,0.555556,...,0.913321,0.025,0.045226,0.969489,0.016544,0.014399,0.774510,0.0,0.333333,0.000000
2,0.498208,0.000014,0.000000,0.624862,0.428571,0.111111,0.008299,0.000000,0.202020,0.292929,...,0.774179,0.005,0.045226,0.937311,0.011029,0.006204,0.078617,1.0,1.000000,0.967213
3,0.283154,0.000000,0.288465,0.711340,0.142857,0.444444,0.008299,0.000000,0.232323,0.141414,...,0.867245,0.010,0.045226,0.906175,0.008272,0.005534,0.899764,1.0,1.000000,0.655738
4,0.211470,0.000000,0.206076,0.793814,0.428571,0.222222,0.248963,0.010101,0.282828,0.090909,...,0.953923,0.015,0.492462,0.822972,0.012868,0.005586,0.037079,1.0,0.333333,0.409836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.928315,0.000014,0.000000,0.624862,0.000000,1.000000,0.000000,0.141414,0.363636,0.474747,...,0.954380,0.025,0.497487,0.999896,0.007353,0.023745,0.962399,0.0,0.000000,0.180328
95408,0.928315,0.000014,0.515036,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,...,0.954836,0.020,0.502513,1.000000,0.007353,0.018738,0.639828,1.0,0.000000,0.016393
95409,0.856631,0.000014,0.391452,0.608247,0.000000,1.000000,0.000000,0.000000,0.181818,0.464646,...,0.908303,0.010,0.537688,0.979902,0.002757,0.007009,0.988852,1.0,0.666667,0.540984
95410,0.211470,0.000000,0.412461,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,...,0.957117,0.018,0.994975,0.896803,0.003676,0.010875,0.024466,1.0,1.000000,0.163934


In [6]:
# Encode the categorical features using One-Hot Encoding or Ordinal Encoding. 

categorical.nunique()

STATE          12
ZIP         19938
CLUSTER        53
HOMEOWNR        2
DATASRCE        4
RFA_2          14
RFA_2R          1
RFA_2A          4
GEOCODE2        5
DOMAIN_A        5
DOMAIN_B        4
dtype: int64

In [7]:
categorical=categorical.drop(columns="ZIP")
categorical.nunique()

STATE       12
CLUSTER     53
HOMEOWNR     2
DATASRCE     4
RFA_2       14
RFA_2R       1
RFA_2A       4
GEOCODE2     5
DOMAIN_A     5
DOMAIN_B     4
dtype: int64

In [8]:
one_hot_names = []
for col in categorical.columns:
    col_uniques = sorted(categorical[col].astype(str).unique())
    for unique in col_uniques:
        one_hot_names.append(col+"_"+unique)
        
categorical = pd.DataFrame(OneHotEncoder().fit_transform(categorical.astype(str)).toarray())
categorical.columns = one_hot_names
categorical

Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_OTHER,STATE_TX,...,GEOCODE2_U,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_1,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
95408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95409,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95410,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# re-concatenate

data_x=pd.DataFrame(np.concatenate((numerical,categorical),axis=1)).reset_index(drop=True)
data_x.columns = list(numerical.columns) + list(categorical.columns)
data_x

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,GEOCODE2_U,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_1,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.426523,0.000000,0.382286,0.608247,0.000000,1.000000,0.000000,0.000000,0.393939,0.343434,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.784946,0.000014,0.535736,0.463918,0.857143,1.000000,0.066390,0.000000,0.151515,0.555556,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.498208,0.000014,0.000000,0.624862,0.428571,0.111111,0.008299,0.000000,0.202020,0.292929,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.283154,0.000000,0.288465,0.711340,0.142857,0.444444,0.008299,0.000000,0.232323,0.141414,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.211470,0.000000,0.206076,0.793814,0.428571,0.222222,0.248963,0.010101,0.282828,0.090909,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.928315,0.000014,0.000000,0.624862,0.000000,1.000000,0.000000,0.141414,0.363636,0.474747,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
95408,0.928315,0.000014,0.515036,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95409,0.856631,0.000014,0.391452,0.608247,0.000000,1.000000,0.000000,0.000000,0.181818,0.464646,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95410,0.211470,0.000000,0.412461,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
## apply train-test split and Logistic regression model

X_train, X_test, y_train, y_test = train_test_split(data_x, target, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
## checking the accuracy on the test data

accuracy = model.score(X_test, y_test)
accuracy

0.9487501965099827

##### Managing imbalance in the dataset

In [16]:
target.value_counts()

TARGET_B
0           90569
1            4843
dtype: int64

In [25]:
## undersampling

rus=RandomUnderSampler(random_state=0)
x_resampled,y_resampled=rus.fit_resample(data_x,target)

y_resampled.value_counts()

TARGET_B
0           4843
1           4843
dtype: int64

In [26]:
# checking accuracy
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5799793601651186

In [28]:
### oversampling

smote = SMOTE()

x_resampled,y_resampled=smote.fit_resample(data_x,target)
y_resampled.value_counts()

TARGET_B
0           90569
1           90569
dtype: int64

In [29]:
# checking accuracy
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6070718781053329