In [46]:
#Instructions
#In this lab, we will first take a look at the degree of imbalance in the data and correct it using the techniques we 
# learned in the class.
#Here is the list of steps to be followed (building a simple model without balancing the data):
#Import the required libraries and modules that you would need.
#Read that data into Python and call the dataframe donors.
#Check the datatypes of all the columns in the data.
#Check for null values in the dataframe. Replace the null values using the methods learned in class.
#Split the data into numerical and catagorical. Decide if any columns need their dtype changed.

In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [48]:
numerical = pd.read_csv('Numerical.csv')
categorical = pd.read_csv('categorical.csv') 

In [49]:
numerical.isnull().sum()

ODATEDW     0
TCODE       0
DOB         0
AGE         0
INCOME      0
           ..
AVGGIFT     0
CONTROLN    0
HPHONE_D    0
RFA_2F      0
CLUSTER2    0
Length: 322, dtype: int64

In [50]:
numerical.isna().sum()

ODATEDW     0
TCODE       0
DOB         0
AGE         0
INCOME      0
           ..
AVGGIFT     0
CONTROLN    0
HPHONE_D    0
RFA_2F      0
CLUSTER2    0
Length: 322, dtype: int64

In [51]:
numerical['WEALTH1'].isna().sum()

44732

In [52]:
numerical['WEALTH1'].value_counts()

9.0    7585
8.0    6793
7.0    6198
6.0    5825
5.0    5280
4.0    4810
3.0    4237
2.0    4085
1.0    3454
0.0    2413
Name: WEALTH1, dtype: int64

In [53]:
categorical.isna().sum()

OSOURCE       928
STATE           0
ZIP             0
CLUSTER         0
HOMEOWNR        0
GENDER       2957
DATASRCE        0
SOLIH       89212
VETERANS    84986
RFA_2           0
RFA_2R          0
GEOCODE2        0
DOMAIN_A        0
DOMAIN_B        0
dtype: int64

In [54]:
categorical.isnull().sum()

OSOURCE       928
STATE           0
ZIP             0
CLUSTER         0
HOMEOWNR        0
GENDER       2957
DATASRCE        0
SOLIH       89212
VETERANS    84986
RFA_2           0
RFA_2R          0
GEOCODE2        0
DOMAIN_A        0
DOMAIN_B        0
dtype: int64

In [55]:
categorical.drop(['SOLIH'], axis=1,inplace=True)

In [56]:
categorical.drop(['VETERANS'], axis=1,inplace=True)

In [57]:
categorical['GENDER'].fillna('U', inplace=True)

In [58]:
categorical.drop(['OSOURCE'], axis=1,inplace=True)

In [59]:
categorical.isnull().sum()

STATE       0
ZIP         0
CLUSTER     0
HOMEOWNR    0
GENDER      0
DATASRCE    0
RFA_2       0
RFA_2R      0
GEOCODE2    0
DOMAIN_A    0
DOMAIN_B    0
dtype: int64

In [60]:
numerical['WEALTH1'].fillna('0.0', inplace=True)

In [62]:
#Scale the features either by using normalizer or a standard scaler. (train_num, test_num)
scaler = MinMaxScaler()

for col in numerical.columns:
    numerical[col] = scaler.fit_transform(numerical[[col]])

numerical.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,LASTGIFT,LASTDATE,FISTDATE,NEXTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.426523,0.0,0.382286,0.608247,0.0,0.0,0.0,0.0,0.393939,0.343434,...,0.01,0.045226,0.927939,0.71939,0.003676,0.006465,0.498045,0.0,1.0,0.622951
1,0.784946,1.4e-05,0.535736,0.463918,0.857143,1.0,0.06639,0.0,0.151515,0.555556,...,0.025,0.045226,0.969489,0.920514,0.016544,0.014399,0.77451,0.0,0.333333,0.0
2,0.498208,1.4e-05,0.0,0.624862,0.428571,0.111111,0.008299,0.0,0.20202,0.292929,...,0.005,0.045226,0.937311,0.758731,0.011029,0.006204,0.078617,1.0,1.0,0.967213
3,0.283154,0.0,0.288465,0.71134,0.142857,0.444444,0.008299,0.0,0.232323,0.141414,...,0.01,0.045226,0.906175,0.602168,0.008272,0.005534,0.899764,1.0,1.0,0.655738
4,0.21147,0.0,0.206076,0.793814,0.428571,0.222222,0.248963,0.010101,0.282828,0.090909,...,0.015,0.492462,0.822972,0.318747,0.012868,0.005586,0.037079,1.0,0.333333,0.409836


In [63]:
#Encode the categorical features using One-Hot Encoding or Ordinal Encoding. (train_cat, test_cat)

In [64]:
categorical.nunique()

STATE          12
ZIP         19938
CLUSTER        53
HOMEOWNR        2
GENDER          6
DATASRCE        4
RFA_2          14
RFA_2R          1
GEOCODE2        5
DOMAIN_A        5
DOMAIN_B        4
dtype: int64

In [66]:
categorical.drop(['ZIP'], axis=1,inplace=True)

In [67]:
one_hot_names = []
for col in categorical.columns:
    col_uniques = sorted(categorical[col].astype(str).unique())
    for unique in col_uniques:
        one_hot_names.append(col+"_"+unique)
        
categorical = pd.DataFrame(OneHotEncoder().fit_transform(categorical.astype(str)).toarray())
categorical.columns = one_hot_names
categorical.head()

Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_OTHER,STATE_TX,...,GEOCODE2_U,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_1,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
#Concatenate numerical and categorical back together again for your X dataframe. Designate the Target as y.

In [68]:
targets = pd.read_csv('targets.csv')

In [69]:
targets=targets.drop(columns="TARGET_D")

In [82]:
X = pd.concat([numerical, categorical], axis=1)
y = targets

In [83]:
columns_with_nan = X.columns[X.isnull().any()]
print(columns_with_nan)

Index(['WEALTH2', 'MSA', 'ADI', 'DMA', 'NEXTDATE', 'TIMELAG'], dtype='object')


In [84]:
X.drop(['WEALTH2'], axis=1,inplace=True)
X.drop(['MSA'], axis=1,inplace=True)
X.drop(['ADI'], axis=1,inplace=True)
X.drop(['DMA'], axis=1,inplace=True)
X.drop(['NEXTDATE'], axis=1,inplace=True)
X.drop(['TIMELAG'], axis=1,inplace=True)

In [None]:
#Split the data into a training set and a test set.
#Split further into train_num and train_cat. Also test_num and test_cat.

In [85]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,GEOCODE2_U,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_1,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
85225,0.928315,0.000389,0.0,0.624862,0.0,0.0,0.0,0.0,0.292929,0.363636,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
70004,0.498208,0.0,0.350257,0.649485,0.571429,0.0,0.0,0.0,0.232323,0.232323,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
88133,0.354839,1.4e-05,0.360556,0.639175,1.0,0.666667,0.008299,0.0,0.252525,0.272727,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
79106,0.354839,2.8e-05,0.267868,0.731959,0.285714,0.0,0.0,0.0,0.343434,0.191919,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
35476,0.426523,0.0,0.0,0.624862,0.142857,0.0,0.0,0.0,0.373737,0.464646,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
#Fit a logistic regression model on the training data.
#Check the accuracy on the test data.

In [86]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [87]:
accuracy = model.score(X_test, y_test)
accuracy

0.9487501965099827

In [88]:
targets.value_counts()

TARGET_B
0           90569
1            4843
dtype: int64

In [None]:
#Managing imbalance in the dataset
#Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
#Each time fit the model and see how the accuracy of the model has changed.

In [89]:
# OVERSAMPLING
smote = SMOTE()

x_resampled,y_resampled=smote.fit_resample(X,targets)
y_resampled.value_counts()

TARGET_B
0           90569
1           90569
dtype: int64

In [90]:
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6148007066357514

In [91]:
# UNDERSAMPLING
RUS=RandomUnderSampler(random_state=0)
x_resampled,y_resampled=RUS.fit_resample(X,targets)

y_resampled.value_counts()

TARGET_B
0           4843
1           4843
dtype: int64

In [92]:
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5768833849329206

In [93]:
# TOMEKLINKS
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl=tl.fit_resample(X,y)
y_tl.value_counts()

TARGET_B
0           88834
1            4843
dtype: int64

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9450789923142613