In [1]:
import warnings
# NOTE(review): this ignores every warning category for the rest of the session,
# so anything libraries report as a warning will not appear in the outputs below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator,TransformerMixin

from mypipes import *

In [2]:
# Locations of the training and test CSV files.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
train_file=r'C:\Users\chirag\Desktop\ML IITK\ML Intro\DATA\rg_train.csv'
test_file=r'C:\Users\chirag\Desktop\ML IITK\ML Intro\DATA\rg_test.csv'

# Read both files with the same (default) parser settings.
bd_train,bd_test=(pd.read_csv(csv_path) for csv_path in (train_file,test_file))

In [3]:
# Preview the first rows of the training data.
bd_train.head()

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment.Tax.Saving.Bond,Home.Loan,Online.Purchase.Amount,Revenue.Grid,gender,region,Investment.in.Commudity,Investment.in.Equity,Investment.in.Derivative,Portfolio.Balance
0,2148,1,45-50,Partner,Professional,Professional,Rent Privately,">=35,000",Yes,Yes,...,7.49,2.48,0.0,2,Female,South West,65.87,9.27,30.93,87.48
1,8099,1,61-65,Partner,Retired,Retired,Own Home,"<12,500, >=10,000",No,No,...,0.0,3.99,0.0,2,Female,Unknown,42.46,4.49,26.23,110.73
2,6611,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,0.0,0.0,0.0,2,Male,East Anglia,75.38,0.0,26.66,127.57
3,1950,Zero,55-60,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,2.0,0.0,0.0,2,Female,North West,34.78,6.91,29.24,33.79
4,10857,2,51-55,Partner,Manual Worker,Manual Worker,Own Home,"<27,500, >=25,000",Yes,Yes,...,0.0,0.0,0.0,2,Female,South West,48.58,9.58,20.65,56.17


In [4]:
# Column names, non-null counts and dtypes of the training data.
bd_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REF_NO                           8124 non-null   int64  
 1   children                         8124 non-null   object 
 2   age_band                         8124 non-null   object 
 3   status                           8124 non-null   object 
 4   occupation                       8124 non-null   object 
 5   occupation_partner               8124 non-null   object 
 6   home_status                      8124 non-null   object 
 7   family_income                    8124 non-null   object 
 8   self_employed                    8124 non-null   object 
 9   self_employed_partner            8124 non-null   object 
 10  year_last_moved                  8124 non-null   int64  
 11  TVarea                           8124 non-null   object 
 12  post_code           

In [5]:
# REF_NO : drop 
# children : convert 'Zero' to 0 and '4+' to 4, then convert to numeric
# age_band : 71+->71, Unknown:NA, rest: split and average
# status, occupation, occupation_partner, home_status : create dummies with freq cutoff
# family_income: string processing to remove (,<,>=), 35000:35000, 4000:4000, unknown:NA, rest: split average
# self_employed, self_employed_partner : dummies
# year_last_moved: keep as it is
# TVarea : dummy
# post_code, post_area : drop (should try grouping categories on the basis of the target variable)
# Average.Credit.Card.Transaction, Balance.Transfer, Term.Deposit, Life.Insurance, Medical.Insurance,
# Average.A.C.Balance, Personal.Loan,Investment.in.Mutual.Fund,
# Investment.Tax.Saving.Bond, Home.Loan, Online.Purchase.Amount,
# Investment.in.Commudity, Investment.in.Equity, Investment.in.Derivative, Portfolio.Balance : as it is 
# Revenue.Grid : target
# gender : dummies
# region : dummies

In [6]:
# List every column name in the training data.
bd_train.columns

Index(['REF_NO', 'children', 'age_band', 'status', 'occupation',
       'occupation_partner', 'home_status', 'family_income', 'self_employed',
       'self_employed_partner', 'year_last_moved', 'TVarea', 'post_code',
       'post_area', 'Average.Credit.Card.Transaction', 'Balance.Transfer',
       'Term.Deposit', 'Life.Insurance', 'Medical.Insurance',
       'Average.A.C.Balance', 'Personal.Loan', 'Investment.in.Mutual.Fund',
       'Investment.Tax.Saving.Bond', 'Home.Loan', 'Online.Purchase.Amount',
       'Revenue.Grid', 'gender', 'region', 'Investment.in.Commudity',
       'Investment.in.Equity', 'Investment.in.Derivative',
       'Portfolio.Balance'],
      dtype='object')

In [7]:
# Candidate numeric predictors: every column whose dtype is not `object`.
num_vars=bd_train.select_dtypes(exclude=['object']).columns.tolist()

In [8]:
# Inspect the non-object columns before any are removed.
num_vars

['REF_NO',
 'year_last_moved',
 'Average.Credit.Card.Transaction',
 'Balance.Transfer',
 'Term.Deposit',
 'Life.Insurance',
 'Medical.Insurance',
 'Average.A.C.Balance',
 'Personal.Loan',
 'Investment.in.Mutual.Fund',
 'Investment.Tax.Saving.Bond',
 'Home.Loan',
 'Online.Purchase.Amount',
 'Revenue.Grid',
 'Investment.in.Commudity',
 'Investment.in.Equity',
 'Investment.in.Derivative',
 'Portfolio.Balance']

In [9]:
# REF_NO is dropped and Revenue.Grid is the target, so neither is a predictor.
num_vars=[col for col in num_vars if col not in ('REF_NO','Revenue.Grid')]

In [10]:
# Final list of numeric predictors.
num_vars

['year_last_moved',
 'Average.Credit.Card.Transaction',
 'Balance.Transfer',
 'Term.Deposit',
 'Life.Insurance',
 'Medical.Insurance',
 'Average.A.C.Balance',
 'Personal.Loan',
 'Investment.in.Mutual.Fund',
 'Investment.Tax.Saving.Bond',
 'Home.Loan',
 'Online.Purchase.Amount',
 'Investment.in.Commudity',
 'Investment.in.Equity',
 'Investment.in.Derivative',
 'Portfolio.Balance']

In [11]:
# Candidate categorical predictors: every column stored with dtype `object`.
cat_vars=bd_train.select_dtypes(include=['object']).columns.tolist()

In [12]:
# Inspect the object-typed columns before any are removed.
cat_vars

['children',
 'age_band',
 'status',
 'occupation',
 'occupation_partner',
 'home_status',
 'family_income',
 'self_employed',
 'self_employed_partner',
 'TVarea',
 'post_code',
 'post_area',
 'gender',
 'region']

In [13]:
# post_code and post_area are dropped; children, age_band and family_income
# get their own pipelines below, so none of the five should become dummies.
cat_vars=[col for col in cat_vars
          if col not in ('children','age_band','post_code','post_area','family_income')]

In [14]:
# Final list of columns that will be dummy-encoded.
cat_vars

['status',
 'occupation',
 'occupation_partner',
 'home_status',
 'self_employed',
 'self_employed_partner',
 'TVarea',
 'gender',
 'region']

In [15]:
# One pipeline per group of columns. All building blocks come from `mypipes`;
# their exact behaviour is not visible in this notebook.

# p1: numeric columns selected as they are, followed by DataFrameImputer
# (presumably fills missing values — confirm in mypipes).
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

# p2: categorical columns turned into dummy variables.
# NOTE(review): 70 is presumably the minimum category frequency for a dummy
# (the plan cell mentions a "freq cutoff") — confirm in mypipes.
p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

# p3: age_band handled by its own transformer.
# NOTE(review): the step name 'custom_fico' looks like a leftover from another
# project; renaming it may affect anything that refers to the step by name.
p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

# p4: family_income handled by its own transformer (same step-name caveat as p3).
p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
])

# p5: children — replace 'Zero' with '0' and '4+' with '4', then convert to numeric.
p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

In [16]:
# Combine the five column pipelines into one feature matrix. The names given
# here show up as prefixes of the generated feature names (e.g. 'num__...').
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
])

In [17]:
# Fit the preprocessing on the training data and label the resulting matrix
# with the feature names generated by the pipeline.
# NOTE(review): get_feature_names is not available on FeatureUnion in newer
# scikit-learn releases — confirm against the installed version and mypipes.
x_train=pd.DataFrame(
    data_pipe.fit_transform(bd_train),
    columns=data_pipe.get_feature_names(),
)

In [18]:
# Apply the already-fitted preprocessing to the test data (transform only, no
# refit) so that train and test share the same columns.
x_test=pd.DataFrame(
    data_pipe.transform(bd_test),
    columns=data_pipe.get_feature_names(),
)

In [19]:
# Class balance of the target, including missing values if there are any.
bd_train['Revenue.Grid'].value_counts(dropna=False)

2    7261
1     863
Name: Revenue.Grid, dtype: int64

In [20]:
# Binary target: 1 where Revenue.Grid equals 1, otherwise 0.
y_train=bd_train['Revenue.Grid'].eq(1).astype(int)

In [21]:
# Rows and feature count of the training matrix.
x_train.shape

(8124, 71)

In [22]:
# Rows and feature count of the test matrix; the column count should match x_train.
x_test.shape

(2031, 71)

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Hyper-parameter grid for the logistic regression search:
# class weighting on/off, L2 penalty only, and a range of C values.
params=dict(
    class_weight=['balanced',None],
    penalty=['l2'],
    C=[0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,1,2,5],
)

In [25]:
# Base estimator for the grid search; C, penalty and class_weight come from `params`.
# NOTE(review): max_iter is left at its default and warnings are silenced at the
# top of the notebook, so a convergence warning (if any) would not be visible.
model=LogisticRegression(fit_intercept=True)

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
# 10-fold cross-validated search over `params`, scored by ROC AUC.
grid_search=GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
    verbose=20,
)

In [28]:
# Run the search: 10 folds for each of the 20 parameter combinations.
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1,
                               2, 5],
                         'class_weight': ['balanced', None],
                         'penalty': ['l2']},
             scoring='roc_auc', verbose=20)

In [29]:
# Model configuration with the best mean cross-validated score.
grid_search.best_estimator_

LogisticRegression(C=0.0005, class_weight='balanced')

In [30]:
# Keep a handle on the winning model (this is the same object, not a copy).
# NOTE(review): with GridSearchCV's default refit=True it should already be
# fitted on the full training data — confirm.
logr=grid_search.best_estimator_

In [31]:
def report(results,n_top=3):
    """Print the best-ranked candidates of a cross-validated search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexable by candidate position (as in the
        ``cv_results_`` passed to it in this notebook).
    n_top : int, default 3
        Ranks 1..n_top are printed. Candidates that share a rank are all
        printed; a rank with no candidate prints nothing.

    Returns
    -------
    None
    """
    for rank in range(1,n_top+1):
        # Positions of every candidate holding this rank (several if tied).
        for idx in np.flatnonzero(results['rank_test_score']==rank):
            mean_score=results['mean_test_score'][idx]
            std_score=results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean Validation Score: {mean_score:.6f} (std: {std_score:.6f})")
            print(f"Parameters: {results['params'][idx]}")
            print()

In [32]:
# Show the five best-ranked parameter combinations from the search.
report(grid_search.cv_results_,5)

Model with rank: 1
Mean Validation Score: 0.951928 (std: 0.013743)
Parameters: {'C': 0.0005, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 2
Mean Validation Score: 0.951387 (std: 0.013879)
Parameters: {'C': 0.05, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 3
Mean Validation Score: 0.950887 (std: 0.013145)
Parameters: {'C': 0.005, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 4
Mean Validation Score: 0.950519 (std: 0.013933)
Parameters: {'C': 0.0001, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 5
Mean Validation Score: 0.950274 (std: 0.013001)
Parameters: {'C': 2, 'class_weight': 'balanced', 'penalty': 'l2'}



In [33]:
# Fit the selected model on the full training data.
# NOTE(review): likely redundant if the grid search already refit
# best_estimator_ (GridSearchCV default refit=True) — confirm.
logr.fit(x_train,y_train)

LogisticRegression(C=0.0005, class_weight='balanced')

In [34]:
# Number of coefficients that are exactly zero.
(logr.coef_[0]==0).sum()

0

In [35]:
# Pair each feature name with its fitted coefficient.
list(zip(x_train.columns,logr.coef_[0]))

[('num__year_last_moved', -0.0010428745965502585),
 ('num__Average.Credit.Card.Transaction', 0.021084288096046972),
 ('num__Balance.Transfer', -0.007306171394717334),
 ('num__Term.Deposit', -0.018671109275666974),
 ('num__Life.Insurance', 0.010549665723859862),
 ('num__Medical.Insurance', -0.009707844462505421),
 ('num__Average.A.C.Balance', -0.010223827707548734),
 ('num__Personal.Loan', -0.037292768069571984),
 ('num__Investment.in.Mutual.Fund', -0.00275690741807666),
 ('num__Investment.Tax.Saving.Bond', 0.08484068095676131),
 ('num__Home.Loan', -0.019110033108242868),
 ('num__Online.Purchase.Amount', 0.04913562751860154),
 ('num__Investment.in.Commudity', -0.000810247290492203),
 ('num__Investment.in.Equity', 0.010764840905584286),
 ('num__Investment.in.Derivative', 0.005902415551826779),
 ('num__Portfolio.Balance', 0.004930157609549365),
 ('obj_to_dum__status_Partner', 0.00014511594945676792),
 ('obj_to_dum__status_Single/Never Married', -1.0852266405570701e-05),
 ('obj_to_dum__sta

In [36]:
# Predicted probabilities on the test data, one column per class.
logr.predict_proba(x_test)

array([[0.98748641, 0.01251359],
       [0.9535428 , 0.0464572 ],
       [0.96344557, 0.03655443],
       ...,
       [0.96315592, 0.03684408],
       [0.69238443, 0.30761557],
       [0.7856795 , 0.2143205 ]])

In [37]:
# Class labels in the order used for the predict_proba columns.
logr.classes_

array([0, 1])

In [38]:
# 99 evenly spaced candidate probability cutoffs from 0.001 to 0.99 (inclusive).
# NOTE(review): the spacing is ~0.01009, not 0.01; if round cutoffs were
# intended the start value may have been meant to be 0.01 — confirm.
cutoffs=np.linspace(start=0.001,stop=0.99,num=99)

cutoffs

array([0.001     , 0.01109184, 0.02118367, 0.03127551, 0.04136735,
       0.05145918, 0.06155102, 0.07164286, 0.08173469, 0.09182653,
       0.10191837, 0.1120102 , 0.12210204, 0.13219388, 0.14228571,
       0.15237755, 0.16246939, 0.17256122, 0.18265306, 0.1927449 ,
       0.20283673, 0.21292857, 0.22302041, 0.23311224, 0.24320408,
       0.25329592, 0.26338776, 0.27347959, 0.28357143, 0.29366327,
       0.3037551 , 0.31384694, 0.32393878, 0.33403061, 0.34412245,
       0.35421429, 0.36430612, 0.37439796, 0.3844898 , 0.39458163,
       0.40467347, 0.41476531, 0.42485714, 0.43494898, 0.44504082,
       0.45513265, 0.46522449, 0.47531633, 0.48540816, 0.4955    ,
       0.50559184, 0.51568367, 0.52577551, 0.53586735, 0.54595918,
       0.55605102, 0.56614286, 0.57623469, 0.58632653, 0.59641837,
       0.6065102 , 0.61660204, 0.62669388, 0.63678571, 0.64687755,
       0.65696939, 0.66706122, 0.67715306, 0.6872449 , 0.69733673,
       0.70742857, 0.71752041, 0.72761224, 0.73770408, 0.74779

In [39]:
# Predicted probabilities on the training data, one column per class.
logr.predict_proba(x_train)

array([[0.9885378 , 0.0114622 ],
       [0.85351196, 0.14648804],
       [0.92643243, 0.07356757],
       ...,
       [0.47197348, 0.52802652],
       [0.9455578 , 0.0544422 ],
       [0.98365497, 0.01634503]])

In [41]:
# Re-check the class order before picking column 1 as the positive-class score.
logr.classes_

array([0, 1])

In [42]:
# Second predict_proba column = probability of class 1 for each training row.
train_score=logr.predict_proba(x_train)[:,1]

In [43]:
# Alias for the true training labels, used in the cutoff search below.
real=y_train

In [44]:
# Example only: hard 0/1 predictions at an arbitrary cutoff of 0.2.
(train_score>0.2).astype(int)

array([0, 0, 0, ..., 1, 0, 0])

In [45]:
# KS statistic (true-positive rate minus false-positive rate) on the training
# data for every candidate cutoff, in the same order as `cutoffs`.
KS_all=[]

# The actual-class masks and their sizes do not depend on the cutoff.
actual_pos=(real==1)
actual_neg=(real==0)
n_pos=actual_pos.sum()   # equals TP+FN at any cutoff
n_neg=actual_neg.sum()   # equals TN+FP at any cutoff

for cutoff in cutoffs:
    # Rows predicted as class 1 at this cutoff.
    flagged=(train_score>cutoff)
    tpr=(flagged & actual_pos).sum()/n_pos
    fpr=(flagged & actual_neg).sum()/n_neg
    KS_all.append(tpr-fpr)
    

In [46]:
# Pair each cutoff with its KS value.
list(zip(cutoffs,KS_all))

[(0.001, 0.033466464674287266),
 (0.011091836734693876, 0.09824339719988517),
 (0.021183673469387754, 0.14688306853085653),
 (0.03127551020408163, 0.20097465099901168),
 (0.04136734693877551, 0.25024596077745476),
 (0.05145918367346938, 0.30466995933608065),
 (0.061551020408163265, 0.3698125016856192),
 (0.07164285714285713, 0.43627529286687416),
 (0.08173469387755102, 0.4946124815140428),
 (0.0918265306122449, 0.5373301035405106),
 (0.10191836734693877, 0.572473011340288),
 (0.11201020408163265, 0.6017508417723347),
 (0.12210204081632653, 0.6238671242082376),
 (0.1321938775510204, 0.6481062097336474),
 (0.14228571428571427, 0.6635548605440293),
 (0.15237755102040815, 0.6796113715985799),
 (0.16246938775510203, 0.6900450557056278),
 (0.1725612244897959, 0.7023830706852574),
 (0.1826530612244898, 0.7126552545121534),
 (0.19274489795918365, 0.7199213308516761),
 (0.20283673469387753, 0.7293671822174786),
 (0.2129285714285714, 0.7383191172126584),
 (0.2230204081632653, 0.7448728049646336)

In [47]:
# Pick the cutoff with the highest KS.
# np.argmax works on the plain Python list directly and returns the position of
# the first maximum, so exactly one cutoff is selected even when several tie.
best_idx=int(np.argmax(KS_all))
# Slice rather than index so mycutoff stays a one-element array: the displayed
# value and the later `test_score>mycutoff` comparisons behave as before.
mycutoff=cutoffs[best_idx:best_idx+1]
mycutoff

array([0.4955])

In [48]:
# Predicted probabilities on the test data (same call as shown earlier).
logr.predict_proba(x_test)

array([[0.98748641, 0.01251359],
       [0.9535428 , 0.0464572 ],
       [0.96344557, 0.03655443],
       ...,
       [0.96315592, 0.03684408],
       [0.69238443, 0.30761557],
       [0.7856795 , 0.2143205 ]])

In [49]:
# Second predict_proba column = probability of class 1 for each test row.
test_score=logr.predict_proba(x_test)[:,1]
test_score

array([0.01251359, 0.0464572 , 0.03655443, ..., 0.03684408, 0.30761557,
       0.2143205 ])

In [50]:
# Preview the hard 0/1 predictions at the selected cutoff (not stored here).
(test_score>mycutoff).astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [51]:
# Write the predicted probabilities (not the hard classes) to disk.
# NOTE(review): the file name has no extension and the single column gets
# pandas' default header — confirm this is the expected submission format.
pd.DataFrame(test_score).to_csv("mySubmission_logistic",index=False)

In [52]:
# Hard classes: 1 where the predicted probability exceeds the selected cutoff, else 0.
test_classes=(test_score>mycutoff).astype(int)

In [53]:
# Inspect the hard class predictions for the test data.
test_classes

array([0, 0, 0, ..., 0, 0, 0])