### ETL & Logistic Regression

In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score

In [26]:
numerical = pd.read_csv('numerical.csv')
numerical = numerical.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
categorical = pd.read_csv('categorical.csv')
categorical = categorical.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
targets = pd.read_csv('target.csv')
targets = targets.drop(columns=['Unnamed: 0'])

In [27]:
numerical.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,8901,0,3712,60.0,0.0,9.0,0,0,39,34,...,9402,10.0,9512,8911,4.0,7.741935,95515,0,4,39.0
1,9401,1,5202,46.0,6.0,9.0,16,0,15,55,...,9512,25.0,9512,9310,18.0,15.666667,148535,0,2,1.0
2,9001,1,0,61.611649,3.0,1.0,2,0,20,29,...,9207,5.0,9512,9001,12.0,7.481481,15078,1,4,60.0
3,8701,0,2801,70.0,1.0,4.0,2,0,23,14,...,9411,10.0,9512,8702,9.0,6.8125,172556,1,4,41.0
4,8601,0,2001,78.0,3.0,2.0,60,1,28,9,...,9601,15.0,9601,7903,14.0,6.864865,7112,1,2,26.0


In [28]:
numerical.isnull().sum().sum()

0

In [29]:
numerical_scaler = StandardScaler()

for col in numerical.columns:
    
    numerical[col] = numerical_scaler.fit_transform(numerical[[col]])

In [30]:
numerical

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,-0.699843,-0.056847,0.463551,-1.115292e-01,-1.312604,0.717411,-0.356881,-0.206977,0.745798,0.284659,...,-0.230581,-0.523992,-0.733220,-0.701177,-0.525458,-0.520509,-0.004760,-1.001238,1.948226,0.398135
1,0.755960,-0.055799,1.162350,-1.080356e+00,1.295872,0.717411,1.362283,-0.206977,-1.346527,1.675602,...,0.405737,0.550771,-0.733220,0.544171,1.275844,0.215310,0.954282,-1.001238,0.083847,-1.628365
2,-0.408683,-0.055799,-1.277349,-4.917093e-16,-0.008366,-2.239375,-0.141985,-0.206977,-0.910626,-0.046518,...,-1.358599,-0.882247,-0.733220,-0.420271,0.503857,-0.544692,-1.459730,0.998764,1.948226,1.518043
3,-1.282165,-0.056847,0.036299,5.804901e-01,-0.877858,-1.130581,-0.141985,-0.206977,-0.649086,-1.040049,...,-0.178519,-0.523992,-0.733220,-1.353502,0.117864,-0.606808,1.388782,0.998764,1.948226,0.504793
4,-1.573325,-0.056847,-0.338895,1.134106e+00,-0.008366,-1.869777,6.089983,-0.009563,-0.213185,-1.371225,...,0.920576,-0.165738,1.074423,-3.847319,0.761186,-0.601946,-1.603822,0.998764,0.083847,-0.295141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1.338282,-0.055799,-1.277349,-4.917093e-16,-1.312604,0.717411,-0.356881,2.556819,0.484257,1.145719,...,0.926361,0.550771,1.094733,1.455554,-0.010800,1.081920,1.606058,-1.001238,-0.848342,-1.041747
95408,1.338282,-0.055799,1.068082,-9.419524e-01,1.730618,0.717411,-0.249433,-0.206977,0.048356,0.880777,...,0.932145,0.192517,1.115044,1.458675,-0.010800,0.617665,0.487079,0.998764,-0.848342,-1.575036
95409,1.047121,-0.055799,0.505291,-1.115292e-01,-1.312604,0.717411,-0.356881,-0.206977,-1.084987,1.079483,...,0.342105,-0.523992,1.257218,0.856289,-0.654122,-0.470019,1.697820,0.998764,1.016037,0.131490
95410,-1.573325,-0.056847,0.600966,-2.499331e-01,1.730618,0.717411,-0.356881,-0.206977,-0.213185,0.350894,...,0.961069,0.049215,3.105481,-1.634408,-0.525458,-0.111555,-1.647577,0.998764,1.948226,-1.095076


In [31]:
categorical.head()

Unnamed: 0,STATE,ZIP,CLUSTER,HOMEOWNR,DATASRCE,RFA_2,RFA_2R,GEOCODE2,DOMAIN_A,DOMAIN_B
0,IL,61081,36,U,1,L4E,L,C,T,2
1,CA,91326,14,H,3,L2G,L,A,S,1
2,NC,27017,43,U,3,L4E,L,C,R,2
3,CA,95953,44,U,3,L4E,L,C,R,2
4,FL,33176,16,H,3,L2F,L,A,S,2


In [32]:
categorical.isnull().sum().sum()

0

In [33]:
categorical = categorical.drop(['ZIP', 'CLUSTER', 'DOMAIN_B'], axis=1)

In [34]:
categorical['DATASRCE'] = categorical['DATASRCE'].astype(object)

In [35]:
categorical.dtypes

STATE       object
HOMEOWNR    object
DATASRCE    object
RFA_2       object
RFA_2R      object
GEOCODE2    object
DOMAIN_A    object
dtype: object

In [36]:
onehotencoder = []

for col in categorical.columns:
    
    col_uniques = sorted(categorical[col].astype(str).unique())
    
    for unique in col_uniques:
        
        onehotencoder.append(col+"_"+unique)

In [37]:
categorical = pd.DataFrame(OneHotEncoder().fit_transform(categorical.astype(str)).toarray())
categorical.columns = onehotencoder

In [38]:
categorical

Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_OTHER,STATE_TX,...,RFA_2R_L,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
95408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
95409,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
95410,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [39]:
data_concat = pd.DataFrame(np.concatenate((numerical,categorical),axis = 1)).reset_index(drop = True)
data_concat.columns = list(numerical.columns) + list(categorical.columns)

In [40]:
data_concat

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,RFA_2R_L,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,-0.699843,-0.056847,0.463551,-1.115292e-01,-1.312604,0.717411,-0.356881,-0.206977,0.745798,0.284659,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.755960,-0.055799,1.162350,-1.080356e+00,1.295872,0.717411,1.362283,-0.206977,-1.346527,1.675602,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.408683,-0.055799,-1.277349,-4.917093e-16,-0.008366,-2.239375,-0.141985,-0.206977,-0.910626,-0.046518,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-1.282165,-0.056847,0.036299,5.804901e-01,-0.877858,-1.130581,-0.141985,-0.206977,-0.649086,-1.040049,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-1.573325,-0.056847,-0.338895,1.134106e+00,-0.008366,-1.869777,6.089983,-0.009563,-0.213185,-1.371225,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1.338282,-0.055799,-1.277349,-4.917093e-16,-1.312604,0.717411,-0.356881,2.556819,0.484257,1.145719,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
95408,1.338282,-0.055799,1.068082,-9.419524e-01,1.730618,0.717411,-0.249433,-0.206977,0.048356,0.880777,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
95409,1.047121,-0.055799,0.505291,-1.115292e-01,-1.312604,0.717411,-0.356881,-0.206977,-1.084987,1.079483,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
95410,-1.573325,-0.056847,0.600966,-2.499331e-01,1.730618,0.717411,-0.356881,-0.206977,-0.213185,0.350894,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [44]:
targets

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
...,...,...
95407,0,0.0
95408,0,0.0
95409,0,0.0
95410,1,18.0


In [45]:
targets = targets.drop(columns="TARGET_D")

In [46]:
X_train, X_test, y_train, y_test = train_test_split(data_concat, targets, test_size=0.2, random_state=42)

In [47]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
model_score = model.score(X_test, y_test)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
model_score

0.9486977938479275

### Dealing with Imbalance Data

In [49]:
targets.value_counts()

TARGET_B
0           90569
1            4843
dtype: int64

### Under Sampling

In [61]:
us = RandomUnderSampler(random_state=0)

x_resampled, y_resampled = us.fit_resample(data_concat,targets)
y_resampled.value_counts()

TARGET_B
0           4843
1           4843
dtype: int64

In [67]:
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)

In [68]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
score

0.5686274509803921

### Over Sampling

In [71]:
smote = SMOTE()
x_resampled,y_resampled = smote.fit_resample(data_concat,targets)
y_resampled.value_counts()

TARGET_B
0           90569
1           90569
dtype: int64

In [72]:
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)

In [73]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
score

0.6181682676382908