In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
import time
import os
from dateutil.parser import parse
import pandas_profiling
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from pandas.core.dtypes.common import (is_numeric_dtype, is_datetime64_dtype, is_bool_dtype)
import optuna   
from sklearn.model_selection import train_test_split
import sklearn
import xgboost as xgb 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from scipy import stats as s

In [None]:
# Raise pandas' display limits so long/wide frames are rendered without truncation.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
def import_data(filename):
    """Load a tabular data file into a DataFrame, picking the reader by extension.

    :param filename: path to a ``.csv``, ``.json``, ``.xlsx`` or ``.xls`` file;
                     the extension is matched case-insensitively
    :return: pandas DataFrame with the file contents
    :raises ValueError: if the extension is not one of the supported formats
    """
    ext = os.path.splitext(filename)[1].lower()
    if ext == '.csv':
        # parse_dates=True asks pandas to try parsing the index as dates.
        # No custom date_parser / infer_datetime_format is passed: both are
        # deprecated in current pandas, and pd.datetime no longer exists.
        return pd.read_csv(filename, low_memory=False, parse_dates=True)
    if ext == '.json':
        return pd.read_json(filename)
    if ext in ('.xlsx', '.xls'):
        return pd.read_excel(filename)
    # Fail loudly instead of reaching an unbound local variable.
    raise ValueError("Unsupported file extension %r for %r" % (ext, filename))

In [None]:
# Load the raw training and test sets through import_data.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where these exact files exist.
train = import_data('/home/divyanka/Downloads/train_s3TEQDk.csv')
test = import_data('/home/divyanka/Downloads/test_mSzZ8RL.csv')

In [None]:
from dateutil.parser import parse

def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: True if ``dateutil.parser.parse`` accepts the value, else False
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError, TypeError):
        # ValueError: not a recognisable date; OverflowError: a parsed component
        # is too large; TypeError: the input is not a string or character stream.
        return False
    return True
    
def d_type(df):
    """Convert object columns whose first value looks like a date to datetime64.

    Only the value in the first row of each object-dtype column is inspected.
    When it parses as a date, the whole column is passed to ``pd.to_datetime``.
    Columns that fail to convert are left untouched.

    :param df: pandas DataFrame; modified in place
    :return: the same DataFrame object
    """
    if len(df) == 0:
        # No first row to inspect, so nothing can be inferred.
        return df
    for col in df.columns:
        if df[col].dtypes != object:
            continue
        try:
            first_value = str(df[col].iloc[0])
            if is_date(first_value):
                df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError):
            # Best effort: the first value looked like a date but the column
            # as a whole could not be converted, so keep it as it was.
            continue
    return df
# Run the same date detection on both frames so their column dtypes stay aligned.
train = d_type(train)
test = d_type(test)

# DataFrame

In [None]:
train.head()

In [None]:
test.head()

# Data Information

In [None]:
train.info()

In [None]:
test.info()

In [None]:
train.nunique()

In [None]:
test.nunique()

# Missing And Zero Percentage

In [None]:
def missing_zero_values_table(data):
    """Summarise zero and missing values for every column of ``data``.

    Prints the frame's dimensions and the number of columns containing
    missing values, then returns a table limited to those columns. The table
    is sorted by percentage of missing values (descending) and rounded to one
    decimal place.

    :param data: pandas DataFrame to inspect
    :return: DataFrame indexed by column name with zero/missing counts,
             percentages and the column dtype
    """
    n_rows = len(data)
    null_counts = data.isnull().sum()
    summary = pd.concat(
        [(data == 0).sum(axis=0), null_counts, 100 * null_counts / n_rows],
        axis=1,
        keys=['Zero Values', 'Missing Values', '% of Total Values'],
    )
    summary['Total Zero Missing Values'] = summary['Zero Values'] + summary['Missing Values']
    summary['% Total Zero Missing Values'] = 100 * summary['Total Zero Missing Values'] / n_rows
    summary['Data Type'] = data.dtypes

    # Keep only the columns that actually have missing entries.
    with_missing = summary[summary.iloc[:, 1] != 0]
    summary = with_missing.sort_values('% of Total Values', ascending=False).round(1)

    message = ("Your selected dataframe has " + str(data.shape[1]) + " columns and "
               + str(data.shape[0]) + " Rows.\n"
               + "There are " + str(summary.shape[0])
               + " columns that have missing values. \n")
    print(message)
    return summary

# Build the missing/zero-value summary for the training frame and print it as plain text.
missing = missing_zero_values_table(train)
print(missing)

# Non Duplicate, Normal, Balanced

In [None]:
def check_non_unique_and_dup(df, n=2):
    """Report duplicate rows and low-cardinality columns.

    Prints the number of fully duplicated rows, followed by the number and
    names of the columns that have at most ``n`` distinct values. Nothing is
    dropped from ``df`` and nothing is returned. Any exception raised along
    the way is printed instead of propagated.

    :param df: pandas DataFrame to inspect
    :param n: a column is reported when its number of unique values is <= n
    """
    try:
        duplicate_rows = len(df) - len(df.drop_duplicates())
        print("Number of duplicate records : ", duplicate_rows)
        low_cardinality = [name for name in df.columns if df[name].nunique() <= n]
        print("Number of non unique and duplicate columns : ", len(low_cardinality))
        print("Column Names:")
        print(low_cardinality)
    except Exception as err:
        print(err)

def check_norm(df,alpha=0.5):
    """Return the numeric columns for which normality is rejected.

    Each numeric column is tested with ``scipy.stats.normaltest`` after
    dropping missing values. A column is reported when its p-value is not
    greater than ``alpha`` (a NaN p-value therefore also reports the column).

    :param df: pandas DataFrame to inspect
    :param alpha: significance threshold compared against the test's p-value
    :return: list of column names whose data is not considered normal
    """
    # Imported here because the file does not import normaltest under this name.
    from scipy.stats import normaltest

    col_list = []
    for column in df.columns:
        if not is_numeric_dtype(df[column]):
            continue
        values = df[column].dropna().astype(float)
        if len(values) < 8:
            # The underlying skewness test is not valid below 8 observations,
            # so such columns cannot be judged and are skipped.
            continue
        stat, p = normaltest(values)
        if not p > alpha:
            col_list.append(column)
    return col_list
        
def check_balanced(df):
    """Returns the list of column features which are unbalanced.

    A column with ``k`` distinct non-missing values is balanced when every
    value accounts for ``100/k`` percent of the rows, within a relative
    tolerance of 20%. Percentages are taken over all rows, so a column that
    contains any missing value is always reported. Columns with no
    non-missing values are skipped because their balance is undefined.

    :param df: pandas DataFrame to inspect
    :return: list of unbalanced column names, each listed once
    """
    unbalanced = []
    total = len(df)
    for column in df.columns:
        n_categories = df[column].nunique()
        if n_categories == 0:
            # Empty or all-missing column: avoid dividing by zero.
            continue
        nun = 100 / n_categories
        val_high, val_low = nun + (nun * 0.2), nun - (nun * 0.2)
        # One pass over the column instead of one full scan per distinct value.
        shares = df[column].value_counts() / total * 100
        has_missing = bool(df[column].isna().any())
        if has_missing or bool((shares > val_high).any()) or bool((shares < val_low).any()):
            unbalanced.append(column)
    return unbalanced

**List of non unique and duplicate columns**

In [None]:
check_non_unique_and_dup(train)

**List of columns whose data is not normally distributed**

In [None]:
check_norm(train)

**List of columns with unbalanced value distributions**

In [None]:
check_balanced(train)

# Numeric Data

In [None]:
data_num = train.select_dtypes(include=[np.number])

In [None]:
data_num.describe()

In [None]:
def describe_by_num_type(dataframe):
    """Print descriptive statistics for the numeric columns of ``dataframe``.

    Prints skewness, kurtosis, sum, median, variance and mean absolute
    deviation, each under its own banner. Non-numeric columns are ignored.

    :param dataframe: pandas DataFrame to describe
    :return: None
    """
    # Use the argument rather than a notebook-level global.
    numeric = dataframe.select_dtypes(include='number')
    print('***********Skewness************\n')
    print(numeric.skew())
    print('\n***********Kurtosis************\n')
    print(numeric.kurtosis())
    print('\n***********Sum*************\n')
    print(numeric.sum())
    print('\n***********Median**********\n')
    print(numeric.median())
    print('\n***********Variance********\n')
    print(numeric.var())
    print('\n***********Mean Absolute Deviation***********\n')
    # DataFrame.mad() was removed in pandas 2.0, so compute the mean absolute
    # deviation around the mean explicitly.
    print((numeric - numeric.mean()).abs().mean())

In [None]:
describe_by_num_type(data_num)

In [None]:
print('\n**********Correlation**********\n')
data_num.corr()

In [None]:
print('\n************Co-Variance*************\n')
data_num.cov()

# Bool/Object Data

In [None]:
# np.object was removed in NumPy 1.24; the 'object' alias selects the same object-dtype columns.
data_bool = train.select_dtypes(include=['object'])

In [None]:
data_bool.describe()

# Datetime Data

In [None]:
data_datetime = train.select_dtypes(include=[np.datetime64])

In [None]:
data_datetime.describe()

# Value Counts for each column

In [None]:
def category_counts(dataframe):
    """
    Print the ten most frequent values of every column.

    :param dataframe: a pandas DataFrame
    :return: None
    """
    for column in dataframe.columns:
        # Numeric, datetime and all other columns are reported the same way,
        # so no dtype dispatch is needed.
        print(dataframe[column].value_counts().nlargest(10))
        print('\n')

In [None]:
category_counts(train)

In [None]:
pfr = pandas_profiling.ProfileReport(train)
pfr

In [None]:
import sweetviz as sv

report = sv.analyze(train)
report.show_notebook()

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
data = AV.AutoViz('/home/divyanka/Downloads/train_s3TEQDk.csv')

In [None]:
# Two stacked panels in one figure: a box plot (row 1) and a histogram (row 2) of Age.
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=train['Age'],name='Age Box Plot',boxmean=True)
tr2=go.Histogram(x=train['Age'],name='Age Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Ages")
fig.show()

In [None]:
# Two stacked panels in one figure: a box plot (row 1) and a histogram (row 2)
# of Avg_Account_Balance.
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=train['Avg_Account_Balance'],name='Avg Account Balance Box Plot',boxmean=True)
tr2=go.Histogram(x=train['Avg_Account_Balance'],name='Avg Account Balance Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Avg Account Balance")
fig.show()

In [None]:
# Two stacked panels in one figure: a box plot (row 1) and a histogram (row 2) of Vintage.
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=train['Vintage'],name='Vintage Box Plot',boxmean=True)
tr2=go.Histogram(x=train['Vintage'],name='Vintage Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Vintage")
fig.show()

In [None]:
# One Age distribution panel per distinct Is_Lead value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot when upgrading.
g = sns.FacetGrid(train, col = "Is_Lead")
g.map(sns.distplot, "Age", bins = 20)
plt.show()

In [None]:
# Labels are derived from each value_counts() index so every slice is named
# after the category whose count it shows. value_counts() sorts by frequency,
# so a hard-coded label order can silently mislabel the slices.
gender_counts = train.Gender.value_counts()
active_gender_counts = train.query('Is_Active=="Yes"').Gender.value_counts()
inactive_gender_counts = train.query('Is_Active=="No"').Gender.value_counts()

fig = make_subplots(
    rows=2, cols=2,subplot_titles=('','<b>Is Active','<b>Not Active<b>','Residuals'),
    vertical_spacing=0.09,
    # The left pie spans both rows; the right column holds one pie per activity status.
    specs=[[{"type": "pie","rowspan": 2}, {"type": "pie"}],
           [None, {"type": "pie"}]]
)

fig.add_trace(
    go.Pie(
        values=gender_counts.values,
        labels=[f'<b>{gender}</b>' for gender in gender_counts.index],
        hole=0.3,
        pull=[0,0.3]
    ),
    row=1, col=1
)

fig.add_trace(
    go.Pie(
        labels=[f'Active {gender}' for gender in active_gender_counts.index],
        values=active_gender_counts.values,
        pull=[0,0.05,0.5],
        hole=0.3
    ),
    row=1, col=2
)

fig.add_trace(
    go.Pie(
        labels=[f'{gender} Not Active' for gender in inactive_gender_counts.index],
        values=inactive_gender_counts.values,
        pull=[0,0.2,0.5],
        hole=0.3
    ),
    row=2, col=2
)

fig.update_layout(
    height=800,
    showlegend=True,
    title_text="<b>Distribution Of Gender And Active Statuses<b>",
)

fig.show()

In [None]:
train.columns

In [None]:
# Labels are derived from each value_counts() index so every slice is named
# after the category whose count it shows. value_counts() sorts by frequency,
# so a hard-coded label order can silently mislabel the slices.
gender_counts = train.Gender.value_counts()
with_credit_counts = train.query('Credit_Product=="Yes"').Gender.value_counts()
without_credit_counts = train.query('Credit_Product=="No"').Gender.value_counts()

fig = make_subplots(
    rows=2, cols=2,subplot_titles=('','<b> Credit Product','<b>No Credit Product<b>','Residuals'),
    vertical_spacing=0.09,
    # The left pie spans both rows; the right column holds one pie per credit-product status.
    specs=[[{"type": "pie","rowspan": 2}, {"type": "pie"}],
           [None, {"type": "pie"}]]
)

fig.add_trace(
    go.Pie(
        values=gender_counts.values,
        labels=[f'<b>{gender}</b>' for gender in gender_counts.index],
        hole=0.3,
        pull=[0,0.3]
    ),
    row=1, col=1
)

fig.add_trace(
    go.Pie(
        labels=[f'{gender} With Credit Product' for gender in with_credit_counts.index],
        values=with_credit_counts.values,
        pull=[0,0.05,0.5],
        hole=0.3
    ),
    row=1, col=2
)

fig.add_trace(
    go.Pie(
        labels=[f'{gender} Not With Credit Product' for gender in without_credit_counts.index],
        values=without_credit_counts.values,
        pull=[0,0.2,0.5],
        hole=0.3
    ),
    row=2, col=2
)

fig.update_layout(
    height=800,
    showlegend=True,
    # This figure is about credit products, not activity status.
    title_text="<b>Distribution Of Gender And Credit Product</b>",
)

fig.show()

__Findings__

__1. Training data is positively skewed.__

__2. Binary Class Imbalance.__

# Filling Missing Values

In [None]:
# Impute missing Credit_Product with the most frequent observed category.
# Series.mode() ignores missing values, so no placeholder value is needed
# (a placeholder could itself become the mode and then stay in the column).
m = train['Credit_Product'].mode()
train['Credit_Product'] = train['Credit_Product'].fillna(m.iloc[0])

In [None]:
# Impute missing Credit_Product in the test set with its most frequent observed
# category. Series.mode() ignores missing values, so no placeholder is needed.
m = test['Credit_Product'].mode()
test['Credit_Product'] = test['Credit_Product'].fillna(m.iloc[0])

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

# Feature Engineering

In [None]:
# Drop the ID column from both train and test: it is an identifier, not a feature.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
train = train.drop(columns="ID", errors="ignore")
test = test.drop(columns="ID", errors="ignore")

In [None]:
# Interaction features: combine the Is_Active flag with occupation, gender,
# credit product and channel into one string-valued column each.
# The same four columns are added to both frames.
for frame in (train, test):
    # activity status x occupation
    frame["Active_occ"] = frame["Is_Active"] + "_" + frame["Occupation"]
    # gender x activity status
    frame["Active_gen"] = frame["Gender"] + "_" + frame["Is_Active"]
    # credit product x activity status
    frame["Active_credit"] = frame["Credit_Product"] + "_" + frame["Is_Active"]
    # activity status x acquisition channel
    frame["Active_channel"] = frame["Is_Active"] + "_" + frame["Channel_Code"]

In [None]:
#Taking the log of Avg Account Balance
# np.log is the natural logarithm. It yields -inf for 0 and NaN for negative inputs —
# presumably all balances are strictly positive; TODO confirm.
train["log_Balance"]=np.log(train["Avg_Account_Balance"])
test["log_Balance"]=np.log(test["Avg_Account_Balance"])

In [None]:
#binning the age to create the age groups feature
# pd.cut uses right-closed intervals by default: (15,30], (30,45], (45,60], (60,75], (75,90].
# Ages outside (15, 90] fall in no bin and become NaN.
bins = [15,30,45,60,75,90]
labels = ['young_adult','adult','middle-aged','old','very_old']
train['Age_group'] = pd.cut(train['Age'],bins = bins, labels = labels)
test['Age_group'] = pd.cut(test['Age'],bins = bins, labels = labels)

In [None]:
(train['Vintage']%365)/7

In [None]:
#feature to convert vintage days to weeks
# Computes (Vintage mod 365) / 7 truncated to an integer, i.e. whole multiples of
# 365 are discarded before converting to weeks.
# NOTE(review): confirm the unit of Vintage and that discarding whole years is intended.
train['Vintage_weeks'] = ((train['Vintage']%365)/7).astype(int)
test['Vintage_weeks'] = ((test['Vintage']%365)/7).astype(int)

In [None]:
#te = TargetEncoder()
#train['Region_Code_encoding'] = te.fit_transform(train['Region_Code'].astype(str), train['Is_Lead'])
#test['Region_Code_encoding'] = te.transform(test['Region_Code'].astype(str))

In [None]:
#selecting the categorical features
# Age_group is cast to object in both frames so that it is picked up by the
# object-dtype filter below.
train['Age_group'] = train['Age_group'].astype(object)
test['Age_group'] = test['Age_group'].astype(object)
# Every object-dtype column of train counts as a categorical feature.
categoricals_features = [name for name in train if train[name].dtypes == "O"]

In [None]:
train.shape

In [None]:
categoricals_features

In [None]:
#Label Encoding the Categorical variables
# Each categorical column is imputed, cast to str and replaced by integer codes
# starting at 1, in place, in both train and test.

print('Transform all String features to category.\n')
for usecol in categoricals_features:
    # Despite its name, colcount holds the most frequent value of the column in
    # train; it is used to fill missing values in both frames.
    colcount = train[usecol].value_counts().index[0]
    train[usecol] = train[usecol].fillna(colcount)
    test[usecol]  = test[usecol].fillna(colcount)
    
    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')
    
    #Fit LabelEncoder
    # The encoder is fitted on the union of train and test values so that
    # transform() never meets an unseen label.
    le = LabelEncoder().fit(
            np.unique(train[usecol].unique().tolist()+
                      test[usecol].unique().tolist()))

    #At the end 0 will be used for dropped values
    # Codes are shifted by +1, so no encoded value is 0.
    train[usecol] = le.transform(train[usecol])+1
    test[usecol]  = le.transform(test[usecol])+1
    
    # NOTE(review): the transformed values are integers, so the NaN replacement
    # below looks like a no-op — confirm before removing.
    train[usecol] = train[usecol].replace(np.nan, -1).astype('int')
    test[usecol]  = test[usecol].replace(np.nan , -1).astype('int')


In [None]:
train.head()

In [None]:
#Separating the target variable
# Boolean column masks are used, so both X and y are DataFrames
# (y is a single-column frame, not a Series).
X= train.loc[:, train.columns != "Is_Lead"]
y = train.loc[:, train.columns == "Is_Lead"]

# Balancing the Dataset

In [None]:
#Upsampling using the SMOTE
#print("Before UpSampling, counts of label '1': {}".format(sum(y.Is_Lead==1)))
#print("Before UpSampling, counts of label '0': {} \n".format(sum(y.Is_Lead==0)))

# NOTE(review): resampling happens before any train/validation split, so synthetic
# points derived from a row can land in a different split than that row; validation
# and cross-validation scores computed downstream may be optimistic — confirm intent.
# y is overwritten here with the resampled target.
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1)   
X_smote, y = sm.fit_resample(X, y)

# Scaling the Dataset

In [None]:
#scaling using standard scaler
#sc = StandardScaler()
#X_smote = sc.fit_transform(X_smote)
#test = sc.fit_transform(test)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then reuse the learned min/max
# for the test set so both are mapped with the same transformation.
sc = MinMaxScaler(feature_range=(0, 1))
X_smote = sc.fit_transform(X_smote)
test = sc.transform(test)

In [None]:
X_smote = pd.DataFrame(X_smote,columns=X.columns)
test=pd.DataFrame(test,columns=X.columns)

In [None]:
y = pd.DataFrame(y,columns=['Is_Lead'])

In [None]:
X_smote.head()

In [None]:
test.head()

# Model Building

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(X_smote, y, test_size=0.2, random_state = 42)

In [None]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

In [None]:
def objective(trial):
    """Optuna objective: train an XGBoost booster and score it on the validation split.

    Reads the notebook-level ``dtrain``, ``dvalid`` and ``y_valid``.

    :param trial: optuna trial used to sample the hyperparameters
    :return: accuracy of the rounded predictions on the validation set
    """
    # xgb.train does not read 'n_estimators' from params; the number of boosting
    # rounds is set through num_boost_round, so it is sampled separately.
    n_estimators = trial.suggest_int('n_estimators', 150, 3000)

    # params specifies the XGBoost hyperparameters to be tuned
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, .1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'objective': 'binary:logistic'
    }

    bst = xgb.train(params, dtrain, num_boost_round=n_estimators)
    preds = bst.predict(dvalid)

    # binary:logistic outputs probabilities; round them to 0/1 labels
    pred_labels = np.rint(preds)
    # trials are evaluated on their accuracy on the validation set
    accuracy = sklearn.metrics.accuracy_score(y_valid, pred_labels)
    return accuracy

In [None]:
#hyperparameter tuning using optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective,n_trials=10) 

In [None]:
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

In [None]:
best_params = study.best_trial.params
best_params['objective'] = 'binary:logistic'

In [None]:
#training the model in 5 fold and 2 repeats
N_FOLDS = 5
N_REPEAT = 2

def training(n_repeat = 1, n_folds = 5):
    """Fit one XGBoost classifier per stratified fold of (X_smote, y).

    Reads the notebook-level ``X_smote``, ``y`` and ``best_params``.

    :param n_repeat: index of the current repeat; used in the log output and
                     as the seed for shuffling the folds
    :param n_folds: number of stratified folds
    :return: tuple (list of fitted models, mean ROC AUC over the folds)
    """
    models = []
    auc_scores = []
    # Seeding with the repeat index keeps runs reproducible while still giving
    # every repeat a different shuffle.
    kfold = StratifiedKFold(n_folds, shuffle = True, random_state = n_repeat)

    for fold, (train_index, test_index) in enumerate(kfold.split(X_smote,y), 1):
        print('-'*85)
        print(f'Repeat {n_repeat}, Fold {fold}')

        X_train = X_smote.values[train_index]
        y_train = y.values[train_index].ravel()
        X_test = X_smote.values[test_index]
        y_test = y.values[test_index].ravel()

        model = xgb.XGBClassifier(**best_params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # The fold score is ROC AUC computed from the positive-class probability.
        auc = roc_auc_score(y_test,model.predict_proba(X_test)[:,1])
        print(f'AUC: {auc}')
        print(classification_report(y_test, y_pred, labels=[0,1]))

        models.append(model)
        auc_scores.append(auc)
    return models, np.mean(auc_scores)

models = []
# Holds the mean AUC of each repeat; the name is kept for compatibility.
mean_f1s = []

for i in range(1, N_REPEAT+1):
    m, f = training(i, N_FOLDS)
    print('-'*85)
    models = models + m
    mean_f1s.append(f)

In [None]:
X_smote.shape,test.shape

In [None]:
#prediction of every trained fold model, averaged per test row
# Works for any number of models (N_FOLDS * N_REPEAT), not just ten.
test_values = test.values
fold_predictions = [model.predict_proba(test_values)[:, 1] for model in models]
pred = np.mean(fold_predictions, axis=0)

In [None]:
#creating the submission file
# The raw test file is re-read only to recover the ID column.
# NOTE(review): absolute, machine-specific path.
test_new = pd.read_csv('/home/divyanka/Downloads/test_mSzZ8RL.csv')
submission_df = pd.DataFrame({'ID': test_new['ID'].values,'Is_Lead': pred})
# NOTE(review): Is_Lead holds averaged probabilities here, so value_counts()
# is unlikely to be informative — confirm what this was meant to check.
submission_df.Is_Lead.value_counts()

In [None]:
submission_df.to_csv('/home/divyanka/Downloads/Final_submission.csv',index=False)

# Final Attempt and Final Code

In [None]:
X = train.drop('Is_Lead',axis=1)
# The models below are trained on X_smote, so the target has to be the
# SMOTE-resampled labels with the same number of rows. Rebuilding y from
# `train` would leave X_smote and y with different lengths.
y = pd.Series(np.ravel(y), name='Is_Lead')

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

def cross_val(X, y, model, params, folds=3):
    """Train ``model(**params)`` on each stratified fold and print its ROC AUC.

    :param X: feature table; indexed positionally with ``.iloc``
    :param y: target; indexed positionally with ``.iloc``
    :param model: estimator class, instantiated once per fold as ``model(**params)``
    :param params: keyword arguments for the estimator constructor
    :param folds: number of stratified folds
    :return: the estimator fitted on the LAST fold only; earlier fold models are discarded
    """

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        alg = model(**params)
        # The held-out fold doubles as the early-stopping evaluation set.
        # NOTE(review): passing early_stopping_rounds/verbose to fit() depends on the
        # installed library version — confirm against the versions in use.
        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                early_stopping_rounds=100,
                verbose=400)

        pred = alg.predict_proba(x_test)[:, 1]
        roc_score = roc_auc_score(y_test, pred)
        print(f"roc_auc_score: {roc_score}")
        print("-"*50)
    
    return alg

In [None]:
lgb_params= {'learning_rate': 0.094, 
             'n_estimators': 20000, 
             'max_bin': 94,
             'num_leaves': 12, 
             'max_depth': 27, 
             'reg_alpha': 8.457, 
             'reg_lambda': 6.853, 
             'subsample': 0.749}

In [None]:
from lightgbm import LGBMClassifier
lgb_model = cross_val(X_smote, y, LGBMClassifier, lgb_params)

In [None]:
xgb_params= {'n_estimators': 20000, 
             'max_depth': 6, 
             'learning_rate': 0.0201, 
             'reg_lambda': 29.326, 
             'subsample': 0.818, 
             'colsample_bytree': 0.235, 
             'colsample_bynode': 0.820, 
             'colsample_bylevel': 0.453}

In [None]:
from xgboost import XGBClassifier
xgb_model = cross_val(X_smote, y, XGBClassifier, xgb_params)

In [None]:
pred_test_lgb = lgb_model.predict_proba(test)[:,1]
pred_test_xgb = xgb_model.predict_proba(test)[:,1]

In [None]:
#creating the submission file
test_new = pd.read_csv('/home/divyanka/Downloads/test_mSzZ8RL.csv')
submission_df = pd.DataFrame({'ID': test_new['ID'].values,'Is_Lead': pred_test_lgb})
submission_df.to_csv('/home/divyanka/Downloads/Final_submission_lgb.csv',index=False)

In [None]:
#creating the submission file
test_new = pd.read_csv('/home/divyanka/Downloads/test_mSzZ8RL.csv')
submission_df = pd.DataFrame({'ID': test_new['ID'].values,'Is_Lead': pred_test_xgb})
submission_df.to_csv('/home/divyanka/Downloads/Final_submission_xgb.csv',index=False)

In [None]:
cat_params= {'n_estimators': 20000, 
                  'depth': 4, 
                  'learning_rate': 0.023, 
                  'colsample_bylevel': 0.655, 
                  'bagging_temperature': 0.921, 
                  'l2_leaf_reg': 10.133}

In [None]:
from catboost import CatBoostClassifier
cat_model = cross_val(X_smote, y, CatBoostClassifier, cat_params)

In [None]:
pred_test_cat = cat_model.predict_proba(test)[:,1]

In [None]:
#creating the submission file
test_new = pd.read_csv('/home/divyanka/Downloads/test_mSzZ8RL.csv')
submission_df = pd.DataFrame({'ID': test_new['ID'].values,'Is_Lead': pred_test_cat})
submission_df.to_csv('/home/divyanka/Downloads/Final_submission_cat.csv',index=False)

In [None]:
#prediction = (pred_test_lgb + pred_test_cat+pred_test_xgb)/3
#creating the submission file
#test_new = pd.read_csv('/home/divyanka/Downloads/test_mSzZ8RL.csv')
#submission_df = pd.DataFrame({'ID': test_new['ID'].values,'Is_Lead': prediction})
#submission_df.to_csv('/home/divyanka/Downloads/Final_submission_lgbm.csv',index=False)