
# Final Project: Campaign Response Model

Author: George Cherukara Joggy

## 1. Problem Statement
#### Use the  campaign response from a targeted group of existing consumers of a financial card issuing organization to apply on the rest of the consumer pool to predict the profile of consumers that will respond to similair campaigns

## 2. Python Code

### 1. Data Preparation

#### import all necessary libraries

In [138]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

#### import the promotional dataset and save it to a dataframe

In [139]:
promoted = pd.read_csv('./datasets/promoted.csv')
promoted.head()
#cars_training.shape

Unnamed: 0,customer_id,resp,card_tenure,risk_score,num_promoted,avg_bal,geo_group,res_type
0,SB0000024239,0,143.0,520,0,4108.0,E,TO
1,SB0000103256,0,91.0,694,0,3268.0,N,CO
2,SB0000115357,0,139.0,520,0,3785.0,E,
3,SB0000060380,0,147.0,676,0,3402.0,E,CO
4,SB0000138548,0,222.0,717,0,3580.0,SE,CO


### 2. Data cleanup and EDA

In [140]:
#Define a EDA function to include the following details:  
#total nulls, index, data types, shape, summary statistics, and the number of unique values for each column
def eda_analysis(data):
    print(data.head())
    print(data.describe(include='all'))
    print(data.isnull().sum())
    print(data.info())



In [141]:
eda_analysis(promoted)

    customer_id  resp  card_tenure  risk_score  num_promoted  avg_bal  \
0  SB0000024239     0        143.0         520             0   4108.0   
1  SB0000103256     0         91.0         694             0   3268.0   
2  SB0000115357     0        139.0         520             0   3785.0   
3  SB0000060380     0        147.0         676             0   3402.0   
4  SB0000138548     0        222.0         717             0   3580.0   

  geo_group res_type  
0         E       TO  
1         N       CO  
2         E      NaN  
3         E       CO  
4        SE       CO  
         customer_id          resp   card_tenure    risk_score  num_promoted  \
count          25000  25000.000000  24515.000000  25000.000000  25000.000000   
unique         25000           NaN           NaN           NaN           NaN   
top     SB0000014636           NaN           NaN           NaN           NaN   
freq               1           NaN           NaN           NaN           NaN   
mean             NaN   

#### Fill the null values from float variables with median values

In [142]:
promoted["avg_bal"].fillna(promoted["avg_bal"].median(), inplace=True)
promoted["card_tenure"].fillna(promoted["card_tenure"].median(), inplace=True)
promoted.dropna(axis=1,inplace=True)

In [143]:
#promoted.geo_group.value_counts()
promoted.columns

Index(['customer_id', 'resp', 'card_tenure', 'risk_score', 'num_promoted',
       'avg_bal', 'geo_group', 'res_type'],
      dtype='object')

#### convert the categorical variables to dummy variables

In [144]:
#cat_col = ['geo_group','res_type']
#promoted_nw = pd.get_dummies(promoted,columns=cat_col,drop_first=True)
#promoted_nw.columns

Index(['customer_id', 'resp', 'card_tenure', 'risk_score', 'num_promoted',
       'avg_bal', 'geo_group_N', 'geo_group_SE', 'geo_group_W', 'res_type_CO',
       'res_type_RE', 'res_type_SI', 'res_type_TO'],
      dtype='object')

In [None]:
#promoted_nw.head()
#promoted_nw.isnull().sum()
#promoted_nw.describe()

#### understand the class balance of resp variable

In [146]:
#understand class variance of the target variable
promoted_nw.resp.value_counts()

0    23284
1     1716
Name: resp, dtype: int64

resp class balance is 0(93.2%) and 1(6.8%)

#### plot the variables to understand correlation

#### fix the class imbalance of resp using SMOTEtomek

In [153]:
feature_col_nw = ['card_tenure', 'risk_score', 'avg_bal', 'geo_group_SE']
#X.columns
target_col = ['resp']

In [154]:
from imblearn.combine import SMOTETomek
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X, y)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

X_nw = pd.DataFrame(X_resampled, columns=feature_col_nw)
y_nw = pd.DataFrame(y_resampled, columns=target_col)

[(0, 22101), (1, 22101)]


### 3. Model training and fit

#### do  scaling using StandardScaler

In [155]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_trn = scaler.fit_transform(X_nw)

#### perform train test split

In [156]:
X_train, X_test, y_train, y_test=train_test_split(X_trn,y_nw, test_size=0.7, random_state=7)

#### 3.1 Run a Logistic Regression model

In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression(penalty='l1')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.62


  y = column_or_1d(y, warn=True)


In [158]:
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

[[ 8798  6648]
 [ 4998 10498]]


In [159]:
metrics.roc_auc_score(y_test,y_pred)

0.623531229521725

In [160]:
print(metrics.classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.64      0.57      0.60     15446
          1       0.61      0.68      0.64     15496

avg / total       0.62      0.62      0.62     30942



In [161]:
from sklearn.ensemble import RandomForestClassifier as RF
clf = RF()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test,y_pred))

  This is separate from the ipykernel package so we can avoid doing imports until


             precision    recall  f1-score   support

          0       0.83      0.88      0.86     15446
          1       0.87      0.83      0.85     15496

avg / total       0.85      0.85      0.85     30942



In [162]:
print(metrics.accuracy_score(y_test,y_pred))
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

0.8515286665373926
[[13545  1901]
 [ 2693 12803]]


#### Do cross validation test to see if the model is overfitting

In [163]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# Run CV sores for 5 folds
print('------------------------------------\n')
print('K: {}'.format(5))
model = RF(random_state = 7)
    
    # Perform cross-validation
scores = cross_val_score(model, X_trn, y_nw, cv=5)
print("Cross-validated scores: {}".format(scores))
    
    # Make cross-validated predictions
predictions = cross_val_predict(model, X_trn, y_nw, cv=5)
       
rocauc= metrics.roc_auc_score(y_nw, predictions)
print("ROC AUC SCORE: {}".format(rocauc))
cm = metrics.confusion_matrix(y_nw, predictions)
print(cm)

------------------------------------

K: 5


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Cross-validated scores: [0.85240896 0.93167421 0.93393665 0.92918552 0.92669683]


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ROC AUC SCORE: 0.9147776118727659
[[20507  1594]
 [ 2173 19928]]


In [164]:
#model metrics from the original RF classifier
print(metrics.accuracy_score(y_test,y_pred))
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

0.8515286665373926
[[13545  1901]
 [ 2693 12803]]


In [165]:
#cross-validation scores
print(metrics.accuracy_score(y_nw, predictions))
cm = metrics.confusion_matrix(y_nw, predictions)
print(cm)

0.914777611872766
[[20507  1594]
 [ 2173 19928]]


#### Understand the important features that drive the prediction for the randomforest classifier

In [166]:
featureImportance = clf.feature_importances_
for i in range(len(featureImportance)):
    featureImportance[i]
feat = pd.DataFrame() 
feat["features"] = feature_col_nw
feat['importance'] = featureImportance.data
feat

Unnamed: 0,features,importance
0,card_tenure,0.397592
1,risk_score,0.225052
2,avg_bal,0.3181
3,geo_group_SE,0.059256
