In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score,recall_score,f1_score

# Reading clean data

In [2]:
numerical = pd.read_csv('numerical7_02.csv')
numerical.head()

Unnamed: 0.1,Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,...,LASTGIFT,LASTDATE,FISTDATE,NEXTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,8901,0,3712,60.0,5.0,6.0,0,0,39,...,10.0,9512,8911,9003.0,4.0,7.741935,95515,0,4,39.0
1,1,9401,1,5202,46.0,6.0,9.0,16,0,15,...,25.0,9512,9310,9504.0,18.0,15.666667,148535,0,2,1.0
2,2,9001,1,0,61.611649,3.0,1.0,2,0,20,...,5.0,9512,9001,9101.0,12.0,7.481481,15078,1,4,60.0
3,3,8701,0,2801,70.0,1.0,4.0,2,0,23,...,10.0,9512,8702,8711.0,9.0,6.8125,172556,1,4,41.0
4,4,8601,0,2001,78.0,3.0,2.0,60,1,28,...,15.0,9601,7903,8005.0,14.0,6.864865,7112,1,2,26.0


In [3]:
categorical = pd.read_csv('categorical7_02.csv')
categorical.head()

Unnamed: 0.1,Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,SOLIH,VETERANS,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B
0,0,IL,36,U,F,F,,,L,E,C,T,2
1,1,CA,14,H,M,M,,,L,G,A,S,1
2,2,NC,43,U,M,M,,,L,E,C,R,2
3,3,CA,44,U,F,F,,,L,E,C,R,2
4,4,FL,16,H,F,F,12.0,,L,F,A,S,2


In [4]:
y = pd.read_csv('target7_02.csv')
y = y.drop(['TARGET_D'],axis=1)
y.head()

Unnamed: 0.1,Unnamed: 0,TARGET_B
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [5]:
# 1. Read that data into Python and call the dataframe donors.

# for some reason I have more rows in categorical than numerical. (Probably because I
# deleted some nans). I will do a merge on the old index (unnamed:0) so that everything
# matches (and so i drop those same rows now from the other tables)
donors = pd.merge(left = numerical, right = categorical, how = 'inner', left_on = 'Unnamed: 0', right_on = 'Unnamed: 0')
donors = pd.merge(left = donors, right = y, how = 'inner', left_on = 'Unnamed: 0', right_on = 'Unnamed: 0')
donors = donors.drop(['Unnamed: 0'],axis=1)

donors.head()

# Dropping solih and veterans because of NANs (it raises values for logistisc regressor)
donors = donors.drop(['SOLIH','VETERANS','NEXTDATE'],axis=1)

In [6]:
donors.dtypes

ODATEDW       int64
TCODE         int64
DOB           int64
AGE         float64
INCOME      float64
             ...   
RFA_2A       object
GEOCODE2     object
DOMAIN_A     object
DOMAIN_B      int64
TARGET_B      int64
Length: 333, dtype: object

In [7]:
# Check for null values in the dataframe. Replace the null values using the methods learned in class.

# Done in previous lab

# Separate Y and X
X = donors.drop(['TARGET_B'],axis=1)
y = donors['TARGET_B']

In [8]:
# Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Split the data into numerical and catagorical. Decide if any columns need their dtype changed.
numerical = X_train.select_dtypes(np.number)
categorical = X_train.select_dtypes(object)

In [10]:
# Scale the features either by using normalizer or a standard scaler.
transformer = MinMaxScaler().fit(numerical)
X_normalized = transformer.transform(numerical)
X_normalized = pd.DataFrame(X_normalized, columns=numerical.columns)
X_normalized.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,CLUSTER,DOMAIN_B
0,0.354839,2.8e-05,0.206222,0.793814,0.333333,0.666667,0.0,0.0,0.383838,0.454545,...,0.01005,0.916484,0.005515,0.01193,0.911642,0.0,0.0,0.377049,0.307692,0.333333
1,0.21147,0.0,0.391533,0.608247,0.166667,0.666667,0.0,0.0,0.30303,0.282828,...,0.030151,0.916589,0.011029,0.00835,0.424868,1.0,0.0,0.95082,0.846154,0.333333
2,0.426523,0.0,0.473939,0.525773,0.833333,0.666667,0.0,0.0,0.313131,0.323232,...,0.020101,0.927835,0.016544,0.009426,0.130089,0.0,0.0,0.622951,0.923077,0.333333
3,0.856631,0.0,0.0,0.624862,0.666667,0.444444,0.004149,0.0,0.343434,0.171717,...,0.020101,0.990003,0.008272,0.023745,0.113131,1.0,0.0,0.606557,0.673077,0.333333
4,0.641577,5.6e-05,0.412134,0.587629,1.0,1.0,0.004149,0.0,0.424242,0.30303,...,0.994975,0.95845,0.030331,0.01248,0.983934,0.0,0.0,0.098361,0.019231,0.0


In [11]:
# Encode the categorical features using One-Hot Encoding or Ordinal Encoding
X_cat = categorical.copy()
encoder = OneHotEncoder(handle_unknown='error',drop='first').fit(X_cat)
encoded = encoder.transform(X_cat).toarray()
onehot_encoded = pd.DataFrame(encoded,columns=encoder.get_feature_names_out(X_cat.columns))
onehot_encoded.head()

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,GEOCODE2_U,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# concatenating back
X_train_treated = pd.concat([X_normalized,onehot_encoded],axis=1)
X_train_treated.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,GEOCODE2_U,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.354839,2.8e-05,0.206222,0.793814,0.333333,0.666667,0.0,0.0,0.383838,0.454545,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.21147,0.0,0.391533,0.608247,0.166667,0.666667,0.0,0.0,0.30303,0.282828,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.426523,0.0,0.473939,0.525773,0.833333,0.666667,0.0,0.0,0.313131,0.323232,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.856631,0.0,0.0,0.624862,0.666667,0.444444,0.004149,0.0,0.343434,0.171717,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.641577,5.6e-05,0.412134,0.587629,1.0,1.0,0.004149,0.0,0.424242,0.30303,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Treating X_test:
X_test_num = X_test.select_dtypes(np.number)
numT_treated = transformer.transform(X_test_num)
numT_treated = pd.DataFrame(numT_treated, columns=X_test_num.columns)

# Encode the categorical features using One-Hot Encoding or Ordinal Encoding
X_test_cat = X_test.select_dtypes(object)
encoded = encoder.transform(X_test_cat).toarray()
onehot_encoded = pd.DataFrame(encoded,columns=encoder.get_feature_names_out(X_test_cat.columns))

# concatenating back
X_test_treated = pd.concat([numT_treated,onehot_encoded],axis=1)

# Fit a logistic regression model on the training data.
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr',max_iter=1000)
classification.fit(X_train_treated, y_train)


predictions = classification.predict(X_test_treated)

confusion_matrix(y_test, predictions)

array([[18123,     0],
       [  933,     0]])

In [14]:
# Check the accuracy on the test data.
print('The accuracy is ', 18123/(18123+933))
# It's accurate becauase most people don't donate, but the model its awful because it
# doesn't give us any insight about whose customers are more prone to donate.
# We need to balance the data.

The accuracy is  0.9510390428211587


In [15]:
#Check for the imbalance.
print('We have %.1f%% rows where target = 0, and %.1f%% rows where target = 1' %(100*len(y_train[y_train==0])/len(y_train),100*len(y_train[y_train==1])/len(y_train)))

We have 94.9% rows where target = 0, and 5.1% rows where target = 1


In [16]:
# Use the resampling strategies used in class for upsampling and downsampling 
# to create a balance between the two classes. Each time fit the model and 
# see how the accuracy of the model has changed.

In [17]:
X_train_treated

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,GEOCODE2_U,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.354839,0.000028,0.206222,0.793814,0.333333,0.666667,0.000000,0.0,0.383838,0.454545,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.211470,0.000000,0.391533,0.608247,0.166667,0.666667,0.000000,0.0,0.303030,0.282828,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.426523,0.000000,0.473939,0.525773,0.833333,0.666667,0.000000,0.0,0.313131,0.323232,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.856631,0.000000,0.000000,0.624862,0.666667,0.444444,0.004149,0.0,0.343434,0.171717,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.641577,0.000056,0.412134,0.587629,1.000000,1.000000,0.004149,0.0,0.424242,0.303030,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76219,0.498208,0.000000,0.576947,0.422680,1.000000,1.000000,0.020747,0.0,0.373737,0.525253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
76220,0.784946,0.013916,0.453337,0.546392,0.833333,0.555556,0.004149,0.0,0.343434,0.353535,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
76221,0.713262,0.000000,0.000000,0.624862,0.666667,0.666667,0.000000,0.0,0.131313,0.404040,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
76222,0.641577,0.000000,0.576947,0.422680,0.000000,0.666667,0.000000,0.0,0.383838,0.222222,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Resetting index
y_train = y_train.reset_index(drop=True)

In [19]:
X_train0 = X_train_treated[y_train == 0]
X_train1 = X_train_treated[y_train == 1]

## Undersampling

In [20]:
X_train0_under = resample(X_train0, 
                                   replace=False, 
                                   n_samples = len(X_train1))

In [21]:
train_downsampled = pd.concat([X_train0_under, X_train1], axis=0).reset_index(drop=True)
y_downsample = pd.Series(len(X_train0_under)*[0]+len(X_train1)*[1])

In [22]:
# Fit a logistic regression model on the training data.
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr',max_iter=1000)
classification.fit(train_downsampled, y_downsample)

predictions = classification.predict(X_test_treated)

confusion_matrix(y_test, predictions)

array([[10916,  7207],
       [  426,   507]])

In [23]:
classification.score(X_test_treated, y_test)

0.599443744752309

# Oversampling

In [24]:
X_train1_over = resample(X_train1, 
                                   replace=True, 
                                   n_samples = len(X_train0))

In [25]:
train_oversample = pd.concat([X_train0, X_train1_over], axis=0).reset_index(drop=True)
y_oversample = pd.Series(len(X_train0)*[0]+len(X_train1_over)*[1])

In [26]:
# Fit a logistic regression model on the training data.
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr',max_iter=250)
classification.fit(train_oversample, y_oversample)

predictions = classification.predict(X_test_treated)
confusion_matrix(y_test, predictions)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[11257,  6866],
       [  427,   506]])

In [27]:
classification.score(X_test_treated, y_test)

0.6172858942065491

# SMOTE

In [28]:
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_treated,y_train)
LR = LogisticRegression(random_state=0, solver='lbfgs') 
LR.fit(X_train_SMOTE, y_train_SMOTE) 
pred = LR.predict(X_test_treated)
print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.06718834322719913
recall:  0.5337620578778135
f1:  0.11935290593169563


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
confusion_matrix(y_test, predictions)

array([[11257,  6866],
       [  427,   506]])

In [30]:
# It gives the same results as oversampling