In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
def binary_to_int(x):
    num = 0
    m = len(x)-1
    for i in x:
        num += i*2**m
        m-=1
    return num

In [3]:
def encode(data, cols):
    encoder = LabelEncoder()
    for col in cols:
        data[col] = encoder.fit_transform(data[col])
    return data


In [4]:
def tt_split(data):
    train, test = train_test_split(data,  test_size=0.1, train_size=0.9, random_state=1)
    return train, test

In [5]:
def scale(X_train, X_test):
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

    return X_train, X_test

## Pre-processing

In [6]:
data = pd.read_csv("train.csv")
z_test = pd.read_csv("test.csv")
sample = pd.read_csv("samplesubmission.csv" )

In [7]:
rcParams["figure.figsize"] = 20,10

In [8]:
data.shape, z_test.shape, sample.shape

((29132, 29), (10000, 29), (210000, 2))

### Missing value handling

In [9]:
data.join_date = data.join_date.fillna("1/5/2018")
z_test.join_date = z_test.join_date.fillna("1/5/2018")

### Added new features

In [10]:
data["subscribed"] = binary_to_int([data['P5DA'], data['RIBP'], 
                                     data['8NN1'],data['7POT'], data['66FJ'], 
                                     data['GYSR'], data['SOP4'], data['RVSZ'], 
                                     data['PYUQ'], data['LJR9'], data['N2MW'], 
                                     data['AHXO'],data['BSTQ'], data['FM3X'], 
                                     data['K6QO'], data['QBOL'], data['JWFN'], 
                                     data['JZ9D'], data['J9JW'], data['GHYX'], data['ECY3']])

In [11]:
z_test["subscribed"] = binary_to_int([z_test['P5DA'], z_test['RIBP'], 
                                     z_test['8NN1'],z_test['7POT'], z_test['66FJ'], 
                                     z_test['GYSR'], z_test['SOP4'], z_test['RVSZ'], 
                                     z_test['PYUQ'], z_test['LJR9'], z_test['N2MW'], 
                                     z_test['AHXO'],z_test['BSTQ'], z_test['FM3X'], 
                                     z_test['K6QO'], z_test['QBOL'], z_test['JWFN'], 
                                     z_test['JZ9D'], z_test['J9JW'], z_test['GHYX'], z_test['ECY3']])

In [12]:
data['join_month'] = data['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
data['join_year'] = data['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
data.drop('join_date', axis=1, inplace=True)

z_test['join_month'] = z_test['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
z_test['join_year'] = z_test['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
z_test.drop('join_date', axis=1, inplace=True)

data['join_age'] = data['join_year'] - data['birth_year']
z_test['join_age'] = z_test['join_year'] - z_test['birth_year']

In [13]:
data=data[['ID', 'join_month','join_year','join_age', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code','subscribed', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']]

z_test=z_test[['ID', 'join_month','join_year','join_age', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code','subscribed', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']]

### Axis transformation

In [14]:
data = data.melt(id_vars=data.columns[:11], value_vars=data.columns[11:], 
                     var_name = "PCODE", value_name="Label" )
z_test = z_test.melt(id_vars=z_test.columns[:11], value_vars=z_test.columns[11:], 
                   var_name = "PCODE", value_name="Label" )
melted_test = z_test[["ID","PCODE"]]

In [15]:
data.shape, z_test.shape, sample.shape

((611772, 13), (210000, 13), (210000, 2))

### encoding

In [16]:
data = encode(data,['sex', 'marital_status', 'branch_code', 'occupation_code',
                          'occupation_category_code', 'PCODE'])
z_test = encode(z_test,['sex', 'marital_status', 'branch_code', 'occupation_code',
                        'occupation_category_code', 'PCODE'])

### Train/test split

In [17]:
train, test = tt_split(data.iloc[:,1:])

In [18]:
train.shape, test.shape

((550594, 12), (61178, 12))

### Scaling

In [None]:
train, test = scale(train, test)

In [None]:
scaler = MinMaxScaler()
z_test  = pd.DataFrame(scaler.fit_transform(z_test.iloc[:,1:-1]), columns=z_test.iloc[:,1:-1].columns)

### Target split

In [19]:
X_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1]

X_test = test.iloc[:,:-1]
y_test = test.iloc[:,-1]

## Modeling

In [20]:
'''returns an array of accuracy scores for the given model.'''
def train_model(classifier, X_train, y_train):
    accuracy=[]  
    skf = StratifiedKFold(n_splits=5, random_state=None) 
    skf.get_n_splits(X_train, y_train) 
    
    for train_index, test_index in skf.split(X_train,y_train): 
        print("TRAIN:", train_index.min(),"to",train_index.max(), "TEST:",  
              test_index.min(),"to",test_index.max())
        
        X1_train, X1_test = X_train.iloc[train_index], X_train.iloc[test_index] 
        y1_train, y1_test = y_train.iloc[train_index], y_train.iloc[test_index] 

        classifier.fit(X1_train,y1_train)
        prediction=classifier.predict(X1_test) 
        score = metrics.log_loss(prediction,y1_test) 
        accuracy.append(score) 
        
    print("\nrank loss: ",accuracy)
    print("\nAverage rank loss =", np.array(accuracy).mean())
    
    return classifier

In [21]:
def get_accuracy(model, X_test, y_test):
    prediction = model.predict(X_test)
    score = metrics.log_loss(prediction,y_test) 
    return score

In [22]:
def get_proba(model, z_test):
    proba = model.predict_proba(z_test)[:,1]
    proba = pd.DataFrame(proba)
    
    return proba


In [23]:
def get_sub(proba):
    final = melted_test
    final["ID X PCODE"] = final["ID"] + " X " + final["PCODE"]
    final["Label"] = proba
    
    return final[["ID X PCODE","Label"]]

In [24]:
def fill_ones(sub):
    ones = sample[sample.Label == 1]    
    ones_index = ones.index
    for i in range(len(sub)):
        if i in ones_index:
            sub['Label'].iloc[i] = 1.0
    return sub

In [25]:
def export(df,name):
    df.to_csv(name, index=False)

### Algorithms

#### XGBoost

In [None]:
import xgboost as xgb

xg_boost = xgb.XGBClassifier()

xg_boost = train_model(xg_boost, X_train, y_train)

In [None]:
get_accuracy(xg_boost,X_test,y_test)

In [None]:
proba = get_proba(xg_boost, z_test)

In [None]:
sub = get_sub(proba)

In [None]:
sub = fill_ones(sub)

In [None]:
sub

In [None]:
# export(sub, "sub3.csv")

#### CatBoost

In [27]:
from catboost import CatBoostClassifier

cat_boost = CatBoostClassifier()
# cat_boost = train_model(cat_boost, X_train, y_train)

cat_boost.fit(X_train, 
          y_train, 
          cat_features=['sex','marital_status','branch_code','occupation_code','occupation_category_code','subscribed'])

Learning rate set to 0.1525
0:	learn: 0.4828697	total: 2.29s	remaining: 38m 7s
1:	learn: 0.3579195	total: 4.01s	remaining: 33m 22s
2:	learn: 0.2798435	total: 4.83s	remaining: 26m 44s
3:	learn: 0.1832562	total: 6.51s	remaining: 27m
4:	learn: 0.1447278	total: 7.57s	remaining: 25m 6s
5:	learn: 0.1089717	total: 9.45s	remaining: 26m 5s
6:	learn: 0.0980574	total: 11.8s	remaining: 28m
7:	learn: 0.0918980	total: 13.4s	remaining: 27m 43s
8:	learn: 0.0843861	total: 15.1s	remaining: 27m 47s
9:	learn: 0.0777943	total: 17.3s	remaining: 28m 28s
10:	learn: 0.0742933	total: 18.5s	remaining: 27m 45s
11:	learn: 0.0720534	total: 20.1s	remaining: 27m 31s
12:	learn: 0.0685689	total: 21.6s	remaining: 27m 18s
13:	learn: 0.0618070	total: 23s	remaining: 27m
14:	learn: 0.0604839	total: 24.9s	remaining: 27m 16s
15:	learn: 0.0580875	total: 26s	remaining: 26m 39s
16:	learn: 0.0558500	total: 27.7s	remaining: 26m 40s
17:	learn: 0.0546851	total: 28.9s	remaining: 26m 17s
18:	learn: 0.0536525	total: 30.1s	remaining: 25

153:	learn: 0.0122196	total: 4m 19s	remaining: 23m 46s
154:	learn: 0.0122158	total: 4m 21s	remaining: 23m 43s
155:	learn: 0.0122105	total: 4m 23s	remaining: 23m 44s
156:	learn: 0.0122096	total: 4m 24s	remaining: 23m 41s
157:	learn: 0.0122045	total: 4m 26s	remaining: 23m 38s
158:	learn: 0.0122023	total: 4m 27s	remaining: 23m 36s
159:	learn: 0.0121842	total: 4m 29s	remaining: 23m 33s
160:	learn: 0.0121814	total: 4m 30s	remaining: 23m 28s
161:	learn: 0.0121809	total: 4m 32s	remaining: 23m 28s
162:	learn: 0.0121802	total: 4m 33s	remaining: 23m 26s
163:	learn: 0.0120064	total: 4m 35s	remaining: 23m 22s
164:	learn: 0.0120051	total: 4m 36s	remaining: 23m 18s
165:	learn: 0.0120035	total: 4m 38s	remaining: 23m 17s
166:	learn: 0.0119935	total: 4m 39s	remaining: 23m 14s
167:	learn: 0.0119895	total: 4m 40s	remaining: 23m 11s
168:	learn: 0.0119876	total: 4m 43s	remaining: 23m 13s
169:	learn: 0.0119843	total: 4m 44s	remaining: 23m 9s
170:	learn: 0.0119809	total: 4m 45s	remaining: 23m 5s
171:	learn: 

303:	learn: 0.0090749	total: 8m 8s	remaining: 18m 38s
304:	learn: 0.0090695	total: 8m 9s	remaining: 18m 35s
305:	learn: 0.0090673	total: 8m 11s	remaining: 18m 34s
306:	learn: 0.0090602	total: 8m 12s	remaining: 18m 30s
307:	learn: 0.0090580	total: 8m 13s	remaining: 18m 29s
308:	learn: 0.0090530	total: 8m 15s	remaining: 18m 27s
309:	learn: 0.0090167	total: 8m 16s	remaining: 18m 24s
310:	learn: 0.0090134	total: 8m 18s	remaining: 18m 23s
311:	learn: 0.0090042	total: 8m 21s	remaining: 18m 25s
312:	learn: 0.0089723	total: 8m 23s	remaining: 18m 24s
313:	learn: 0.0089714	total: 8m 25s	remaining: 18m 24s
314:	learn: 0.0089687	total: 8m 28s	remaining: 18m 24s
315:	learn: 0.0089661	total: 8m 29s	remaining: 18m 22s
316:	learn: 0.0089629	total: 8m 30s	remaining: 18m 20s
317:	learn: 0.0089606	total: 8m 32s	remaining: 18m 18s
318:	learn: 0.0089601	total: 8m 33s	remaining: 18m 15s
319:	learn: 0.0089540	total: 8m 35s	remaining: 18m 15s
320:	learn: 0.0089502	total: 8m 36s	remaining: 18m 12s
321:	learn: 

452:	learn: 0.0079698	total: 11m 43s	remaining: 14m 9s
453:	learn: 0.0079680	total: 11m 45s	remaining: 14m 8s
454:	learn: 0.0079680	total: 11m 46s	remaining: 14m 5s
455:	learn: 0.0079671	total: 11m 48s	remaining: 14m 5s
456:	learn: 0.0079650	total: 11m 49s	remaining: 14m 3s
457:	learn: 0.0079616	total: 11m 52s	remaining: 14m 2s
458:	learn: 0.0079612	total: 11m 53s	remaining: 14m 1s
459:	learn: 0.0079612	total: 11m 54s	remaining: 13m 59s
460:	learn: 0.0079612	total: 11m 54s	remaining: 13m 55s
461:	learn: 0.0079612	total: 11m 55s	remaining: 13m 53s
462:	learn: 0.0079605	total: 11m 56s	remaining: 13m 51s
463:	learn: 0.0079604	total: 11m 57s	remaining: 13m 48s
464:	learn: 0.0079567	total: 11m 59s	remaining: 13m 47s
465:	learn: 0.0079567	total: 12m	remaining: 13m 45s
466:	learn: 0.0079567	total: 12m	remaining: 13m 42s
467:	learn: 0.0079554	total: 12m 2s	remaining: 13m 40s
468:	learn: 0.0079554	total: 12m 2s	remaining: 13m 38s
469:	learn: 0.0079554	total: 12m 2s	remaining: 13m 34s
470:	learn

600:	learn: 0.0077806	total: 13m 53s	remaining: 9m 13s
601:	learn: 0.0077806	total: 13m 54s	remaining: 9m 11s
602:	learn: 0.0077806	total: 13m 55s	remaining: 9m 9s
603:	learn: 0.0077806	total: 13m 55s	remaining: 9m 7s
604:	learn: 0.0077806	total: 13m 56s	remaining: 9m 6s
605:	learn: 0.0077806	total: 13m 57s	remaining: 9m 4s
606:	learn: 0.0077804	total: 13m 57s	remaining: 9m 2s
607:	learn: 0.0077804	total: 13m 58s	remaining: 9m
608:	learn: 0.0077804	total: 13m 59s	remaining: 8m 58s
609:	learn: 0.0077804	total: 13m 59s	remaining: 8m 56s
610:	learn: 0.0077804	total: 14m	remaining: 8m 54s
611:	learn: 0.0077804	total: 14m 1s	remaining: 8m 53s
612:	learn: 0.0077804	total: 14m 1s	remaining: 8m 51s
613:	learn: 0.0077804	total: 14m 2s	remaining: 8m 49s
614:	learn: 0.0077804	total: 14m 3s	remaining: 8m 47s
615:	learn: 0.0077804	total: 14m 3s	remaining: 8m 46s
616:	learn: 0.0077804	total: 14m 4s	remaining: 8m 44s
617:	learn: 0.0077804	total: 14m 4s	remaining: 8m 42s
618:	learn: 0.0077804	total: 1

751:	learn: 0.0072224	total: 16m 44s	remaining: 5m 31s
752:	learn: 0.0072199	total: 16m 46s	remaining: 5m 30s
753:	learn: 0.0072122	total: 16m 47s	remaining: 5m 28s
754:	learn: 0.0072095	total: 16m 49s	remaining: 5m 27s
755:	learn: 0.0072094	total: 16m 50s	remaining: 5m 26s
756:	learn: 0.0072094	total: 16m 52s	remaining: 5m 24s
757:	learn: 0.0072072	total: 16m 53s	remaining: 5m 23s
758:	learn: 0.0072072	total: 16m 54s	remaining: 5m 22s
759:	learn: 0.0071988	total: 16m 56s	remaining: 5m 21s
760:	learn: 0.0071958	total: 16m 58s	remaining: 5m 19s
761:	learn: 0.0071930	total: 17m	remaining: 5m 18s
762:	learn: 0.0071927	total: 17m 1s	remaining: 5m 17s
763:	learn: 0.0071913	total: 17m 3s	remaining: 5m 16s
764:	learn: 0.0071880	total: 17m 4s	remaining: 5m 14s
765:	learn: 0.0071875	total: 17m 5s	remaining: 5m 13s
766:	learn: 0.0071811	total: 17m 7s	remaining: 5m 12s
767:	learn: 0.0071808	total: 17m 8s	remaining: 5m 10s
768:	learn: 0.0071556	total: 17m 9s	remaining: 5m 9s
769:	learn: 0.0071522	

901:	learn: 0.0067692	total: 20m 7s	remaining: 2m 11s
902:	learn: 0.0067671	total: 20m 9s	remaining: 2m 9s
903:	learn: 0.0067669	total: 20m 9s	remaining: 2m 8s
904:	learn: 0.0067669	total: 20m 10s	remaining: 2m 7s
905:	learn: 0.0067655	total: 20m 12s	remaining: 2m 5s
906:	learn: 0.0067632	total: 20m 13s	remaining: 2m 4s
907:	learn: 0.0067612	total: 20m 15s	remaining: 2m 3s
908:	learn: 0.0067611	total: 20m 16s	remaining: 2m 1s
909:	learn: 0.0067605	total: 20m 17s	remaining: 2m
910:	learn: 0.0067539	total: 20m 19s	remaining: 1m 59s
911:	learn: 0.0067527	total: 20m 20s	remaining: 1m 57s
912:	learn: 0.0067495	total: 20m 22s	remaining: 1m 56s
913:	learn: 0.0067495	total: 20m 23s	remaining: 1m 55s
914:	learn: 0.0067464	total: 20m 24s	remaining: 1m 53s
915:	learn: 0.0067443	total: 20m 25s	remaining: 1m 52s
916:	learn: 0.0067420	total: 20m 27s	remaining: 1m 51s
917:	learn: 0.0067411	total: 20m 29s	remaining: 1m 49s
918:	learn: 0.0067411	total: 20m 29s	remaining: 1m 48s
919:	learn: 0.0067378	to

<catboost.core.CatBoostClassifier at 0x7fef6a561490>

In [None]:

# cat_boost = CatBoostClassifier()
# cat_boost = train_model(cat_boost, X_train, y_train)


In [28]:
get_accuracy(cat_boost,X_test,y_test)

0.051940648588492885

In [30]:
proba = get_proba(cat_boost, z_test.iloc[:,1:-1])
sub = get_sub(proba)
sub = fill_ones(sub)
sub

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentati

Unnamed: 0,ID X PCODE,Label
0,F86J5PC X P5DA,0.034804
1,H6141K3 X P5DA,0.000079
2,RBAYUXZ X P5DA,0.000104
3,KCBILBQ X P5DA,0.029814
4,LSEC1ZJ X P5DA,0.046195
...,...,...
209995,0GMU5UH X ECY3,0.001310
209996,I9W11CD X ECY3,0.072085
209997,42WTEGT X ECY3,0.023528
209998,8EKC4O9 X ECY3,0.060294


In [31]:
export(sub, "sub5.csv")

In [None]:
#Very poor on test data - CatBoost

In [None]:
X_train