In [1]:
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sb
from pylab import rcParams
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
#rcParams['figure.figsize'] = 12,5
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Lift pandas' display limits so all columns and all rows are rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Read the four input files in one pass: training data, test data,
# the submission template and the column-definition table.
train, test, sub, var = map(
    pd.read_csv,
    ('Train.csv', 'Test.csv', 'SampleSubmission.csv', 'VariableDefinitions.csv'),
)

In [3]:
# Display the table loaded from VariableDefinitions.csv.
var

Unnamed: 0,Column,Definition
0,ID,Unique person identification number
1,gender,gender
2,age,Individual age
3,race,Individual race
4,dwelling,The dwelling/house the person occupies
5,dwelling_type,"Is the dwelling formal or informal(Yes-formal,..."
6,province_code,Province code
7,metro_code,Metro code
8,psu,Primary sampling unit of the survey
9,nationality,Individual Nationality


In [4]:
# List the column names of the training frame.
train.columns

Index(['ID', 'gender', 'age', 'race', 'dwelling', 'dwelling_type',
       'province_code', 'metro_code', 'psu', 'nationality', 'RTH',
       'marital_st', 'Lang_inside', 'Lang_outside', 'Education', 'lw_work',
       'lw_business', 'help_on_household', 'job_or_business', 'nature_of_work',
       'target'],
      dtype='object')

In [5]:
# Stack train and test into one frame so each preprocessing step is applied to both.
# ntrain marks where the training rows end, so the frame can be split again later.
ntrain = len(train)

all_data = pd.concat([train, test], ignore_index=True)
print(f'The shape of the combined dataframe is: {all_data.shape}')

The shape of the combined dataframe is: (11052, 21)


In [6]:
# Preview the first rows of the combined frame.
all_data.head()

Unnamed: 0,ID,gender,age,race,dwelling,dwelling_type,province_code,metro_code,psu,nationality,RTH,marital_st,Lang_inside,Lang_outside,Education,lw_work,lw_business,help_on_household,job_or_business,nature_of_work,target
0,ID_00008683,Female,20,Black African,Dwelling/house or brick/concrete block structu...,Formal,North West,NW - Non Metro,66710095,South Africa,Son/daughter/stepchild/adopted child of person 01,Single and have never been married/never lived...,Xitsonga,Setswana,Grade 11/Standard 9/Form 4,No,No,No,No,Not applicable,0.0
1,ID_00061163,Female,42,Black African,Dwelling/house or brick/concrete block structu...,Formal,Limpopo,LP - Non Metro,98510279,South Africa,Other relative (e.g. in-laws or aunt/uncle) of...,Living together like husband and wife,Sepedi,Sepedi,Grade 12/Standard 10/Form 5/Matric (No Exemption),Yes,No,No,Not applicable,Permanent,0.0
2,ID_00071403,Female,86,Black African,Dwelling/house or brick/concrete block structu...,Formal,Eastern Cape,EC - Non Metro,29210321,South Africa,Father/mother/stepfather/stepmother of person 01,Widowed,IsiXhosa,IsiXhosa,Unspecified,No,No,No,No,Not applicable,0.0
3,ID_00077803,Female,20,Black African,Town house (semi-detached house in complex),Formal,KwaZulu-Natal,KZN - eThekwini,59913860,South Africa,Son/daughter/stepchild/adopted child of person 01,Single and have never been married/never lived...,IsiZulu,IsiZulu,Grade 12/Standard 10/Form 5/Matric (No Exemption),No,No,No,No,Not applicable,0.0
4,ID_00086763,Female,26,Black African,Town house (semi-detached house in complex),Formal,Gauteng,GP - City of Johannesburg,79813828,South Africa,Son/daughter/stepchild/adopted child of person 01,Single and have never been married/never lived...,Setswana,English,Grade 12/Standard 10/Form 5/Matric (No Exemption),No,No,No,No,Not applicable,0.0


In [7]:
# Row count per race category, shown as a one-column frame.
all_data.race.value_counts().to_frame()

Unnamed: 0,race
Black African,9014
Coloured,1003
White,835
Indian/Asian,200


In [8]:
# Distinct values present in the RTH column.
all_data.RTH.unique()

array(['Son/daughter/stepchild/adopted child of person 01',
       'Other relative (e.g. in-laws or aunt/uncle) of person 01',
       'Father/mother/stepfather/stepmother of person 01',
       'Head/acting head', 'Husband/wife/partner of person 01',
       'Brother/sister/stepbrother/stepsister of person 01',
       'Grandchild/great grandchild of person 01', 'Non-related persons',
       'Grandparent/great grandparent of person 01', 'Unspecified'],
      dtype=object)

In [9]:
# Distinct values present in the metro_code column.
all_data.metro_code.unique()

array(['NW - Non Metro', 'LP - Non Metro', 'EC - Non Metro',
       'KZN - eThekwini', 'GP - City of Johannesburg', 'KZN - Non Metro',
       'GP - Ekurhuleni', 'FS - Mangaung', 'MP - Non Metro',
       'WC - City of Cape Town', 'GP - Non Metro',
       'EC - Nelson Mandela Bay', 'GP - City of Tshwane',
       'WC - Non Metro', 'NC - Non Metro', 'FS - Non Metro',
       'EC - Buffalo City'], dtype=object)

In [10]:
# Distinct values present in the province_code column.
all_data.province_code.unique()

array(['North West', 'Limpopo', 'Eastern Cape', 'KwaZulu-Natal',
       'Gauteng', 'Free State', 'Mpumalanga', 'Western Cape',
       'Northern Cape'], dtype=object)

In [11]:
# Distinct values present in the dwelling column.
all_data.dwelling.unique()

array(['Dwelling/house or brick/concrete block structure on a separate stand or yard or farm',
       'Town house (semi-detached house in complex)',
       'Room/flat let on a property or a larger dwelling/servants quarters/granny flat',
       'Informal dwelling/shack in backyard',
       'Flat or apartment in a block of flats',
       'Traditional dwelling/hut/structure made of traditional materials',
       'Dwelling/house/flat/room in backyard',
       'Informal dwelling/shack not in backyard, e.g in an informal/squatter settlement or on a farm.',
       'Unspecified', 'Cluster house in complex', 'Other (specify)',
       'Semi-Detached house', 'Caravan/tent'], dtype=object)

In [12]:
# Distinct values present in the marital_st column (before any recoding).
all_data.marital_st.unique()

array(['Single and have never been married/never lived together as husband/wife before',
       'Living together like husband and wife', 'Widowed', 'Married',
       'Divorced',
       'Single; but have been living together with someone as husband/wife before',
       'Separated; but still legally married', 'Unspecified'],
      dtype=object)

In [13]:
# Treat the non-informative nationality answers as missing values.
rep = {'Unspecified': np.nan, "Other": np.nan}
# Assign the result back to the column instead of calling replace(inplace=True)
# on the attribute-accessed Series: that form is chained assignment, which stops
# updating all_data under pandas Copy-on-Write, and this notebook silences the
# warning that would reveal it.
all_data['nationality'] = all_data['nationality'].replace(rep)
# NOTE(review): the encoded frame later contains a single nationality dummy
# ('nationality_South Africa'), so after mode imputation this column is constant.
# Confirm that mapping "Other" to missing is intended.
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st              0
Lang_inside             0
Lang_outside            0
Education               0
lw_work                 0
lw_business             0
help_on_household       0
job_or_business         0
nature_of_work          0
target               3316
dtype: int64

In [14]:
# Distinct values present in the lw_work column.
all_data.lw_work.unique()

array(['No', 'Yes', 'Unspecified', 'Do not know'], dtype=object)

In [15]:
# Collapse the verbose marital-status answers into short labels and mark
# "Unspecified" as missing. Values not listed here (e.g. 'Married', 'Widowed',
# 'Divorced') are left unchanged.
mar_rep = {
    'Single and have never been married/never lived together as husband/wife before': "Single",
    "Living together like husband and wife": "Living",
    "Single; but have been living together with someone as husband/wife before": "Living",
    "Separated; but still legally married": "Married",
    "Unspecified": np.nan,
}
# Assign back rather than using replace(inplace=True) on the attribute-accessed
# column: the chained in-place form does not update all_data under pandas
# Copy-on-Write.
all_data['marital_st'] = all_data['marital_st'].replace(mar_rep)
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st             27
Lang_inside             0
Lang_outside            0
Education               0
lw_work                 0
lw_business             0
help_on_household       0
job_or_business         0
nature_of_work          0
target               3316
dtype: int64

In [16]:
# Mark the non-answers in lw_work as missing.
rep = {'Unspecified': np.nan, "Do not know": np.nan}
# Assign back rather than using replace(inplace=True) on the attribute-accessed
# column: the chained in-place form does not update all_data under pandas
# Copy-on-Write.
all_data['lw_work'] = all_data['lw_work'].replace(rep)
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st             27
Lang_inside             0
Lang_outside            0
Education               0
lw_work                14
lw_business             0
help_on_household       0
job_or_business         0
nature_of_work          0
target               3316
dtype: int64

In [17]:
# Distinct values present in the lw_business column.
all_data.lw_business.unique()

array(['No', 'Yes', 'Unspecified', 'Do not know'], dtype=object)

In [18]:
# Mark the non-answers in lw_business as missing.
lwb_rep = {'Unspecified': np.nan, "Do not know": np.nan}
# Assign back rather than using replace(inplace=True) on the attribute-accessed
# column: the chained in-place form does not update all_data under pandas
# Copy-on-Write.
all_data['lw_business'] = all_data['lw_business'].replace(lwb_rep)
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st             27
Lang_inside             0
Lang_outside            0
Education               0
lw_work                14
lw_business            14
help_on_household       0
job_or_business         0
nature_of_work          0
target               3316
dtype: int64

In [19]:
# Distinct values present in the help_on_household column.
all_data.help_on_household.unique()

array(['No', 'Unspecified', 'Yes', 'Do not know'], dtype=object)

In [20]:
# Mark the non-answers in help_on_household as missing.
hh_rep = {'Unspecified': np.nan, "Do not know": np.nan}
# Assign back rather than using replace(inplace=True) on the attribute-accessed
# column: the chained in-place form does not update all_data under pandas
# Copy-on-Write.
all_data['help_on_household'] = all_data['help_on_household'].replace(hh_rep)
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st             27
Lang_inside             0
Lang_outside            0
Education               0
lw_work                14
lw_business            14
help_on_household      28
job_or_business         0
nature_of_work          0
target               3316
dtype: int64

In [21]:
# Distinct values present in the job_or_business column.
all_data.job_or_business.unique()

array(['No', 'Not applicable', 'Yes', 'Unspecified', 'Do not know'],
      dtype=object)

In [22]:
# Mark the non-answers in job_or_business as missing; "Not applicable" is
# treated as missing too.
jb_rep = {'Unspecified': np.nan, "Do not know": np.nan, "Not applicable": np.nan}
# Assign back rather than using replace(inplace=True) on the attribute-accessed
# column: the chained in-place form does not update all_data under pandas
# Copy-on-Write.
all_data['job_or_business'] = all_data['job_or_business'].replace(jb_rep)
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st             27
Lang_inside             0
Lang_outside            0
Education               0
lw_work                14
lw_business            14
help_on_household      28
job_or_business      4578
nature_of_work          0
target               3316
dtype: int64

In [23]:
# Distinct values present in the nature_of_work column.
all_data.nature_of_work.unique()

array(['Not applicable', 'Permanent', 'Unspecified',
       'A fixed period contract', 'Temporary', 'Casual', 'Seasonal',
       'Do not know'], dtype=object)

In [24]:
# Mark the non-answers in nature_of_work as missing; "Not applicable" is
# treated as missing too.
jb_rep = {'Unspecified': np.nan, "Do not know": np.nan, "Not applicable": np.nan}
# Assign back rather than using replace(inplace=True) on the attribute-accessed
# column: the chained in-place form does not update all_data under pandas
# Copy-on-Write.
all_data['nature_of_work'] = all_data['nature_of_work'].replace(jb_rep)
all_data.isnull().sum()

ID                      0
gender                  0
age                     0
race                    0
dwelling                0
dwelling_type           0
province_code           0
metro_code              0
psu                     0
nationality           544
RTH                     0
marital_st             27
Lang_inside             0
Lang_outside            0
Education               0
lw_work                14
lw_business            14
help_on_household      28
job_or_business      4578
nature_of_work       6714
target               3316
dtype: int64

In [25]:
# Remove the ID and gender columns from the modelling frame, then preview it.
all_data = all_data.drop(columns=['ID', 'gender'])
all_data.head()

Unnamed: 0,age,race,dwelling,dwelling_type,province_code,metro_code,psu,nationality,RTH,marital_st,Lang_inside,Lang_outside,Education,lw_work,lw_business,help_on_household,job_or_business,nature_of_work,target
0,20,Black African,Dwelling/house or brick/concrete block structu...,Formal,North West,NW - Non Metro,66710095,South Africa,Son/daughter/stepchild/adopted child of person 01,Single,Xitsonga,Setswana,Grade 11/Standard 9/Form 4,No,No,No,No,,0.0
1,42,Black African,Dwelling/house or brick/concrete block structu...,Formal,Limpopo,LP - Non Metro,98510279,South Africa,Other relative (e.g. in-laws or aunt/uncle) of...,Living,Sepedi,Sepedi,Grade 12/Standard 10/Form 5/Matric (No Exemption),Yes,No,No,,Permanent,0.0
2,86,Black African,Dwelling/house or brick/concrete block structu...,Formal,Eastern Cape,EC - Non Metro,29210321,South Africa,Father/mother/stepfather/stepmother of person 01,Widowed,IsiXhosa,IsiXhosa,Unspecified,No,No,No,No,,0.0
3,20,Black African,Town house (semi-detached house in complex),Formal,KwaZulu-Natal,KZN - eThekwini,59913860,South Africa,Son/daughter/stepchild/adopted child of person 01,Single,IsiZulu,IsiZulu,Grade 12/Standard 10/Form 5/Matric (No Exemption),No,No,No,No,,0.0
4,26,Black African,Town house (semi-detached house in complex),Formal,Gauteng,GP - City of Johannesburg,79813828,South Africa,Son/daughter/stepchild/adopted child of person 01,Single,Setswana,English,Grade 12/Standard 10/Form 5/Matric (No Exemption),No,No,No,No,,0.0


In [26]:
# Names of the numeric (int64 / float64) columns.
num_data = all_data.select_dtypes(include=['int64', 'float64']).columns
num_data

Index(['age', 'psu', 'target'], dtype='object')

In [27]:
# Names of every column that is not int64 / float64, i.e. the categorical ones.
cat_data = all_data.select_dtypes(exclude=['int64', 'float64']).columns
cat_data

Index(['race', 'dwelling', 'dwelling_type', 'province_code', 'metro_code',
       'nationality', 'RTH', 'marital_st', 'Lang_inside', 'Lang_outside',
       'Education', 'lw_work', 'lw_business', 'help_on_household',
       'job_or_business', 'nature_of_work'],
      dtype='object')

In [28]:
# Impute missing feature values:
#   * categorical columns -> the column's most frequent value (first mode)
#   * numeric columns     -> the sentinel -9999
# 'target' is skipped on purpose: its missing entries are the unlabelled test
# rows, and writing -9999 into the label column would plant a bogus class value.
# The original also wrapped fillna(-9999) in a second, redundant fillna call.
for col in all_data.columns.difference(['target']):
  if col in cat_data:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
  elif col in num_data:
    all_data[col] = all_data[col].fillna(-9999)

# Confirm that there aren't any missing values left in the feature columns
all_data[all_data.columns.difference(['target'])].isna().sum()

Education            0
Lang_inside          0
Lang_outside         0
RTH                  0
age                  0
dwelling             0
dwelling_type        0
help_on_household    0
job_or_business      0
lw_business          0
lw_work              0
marital_st           0
metro_code           0
nationality          0
nature_of_work       0
province_code        0
psu                  0
race                 0
dtype: int64

In [29]:
# One-hot encode each categorical column; the remaining columns pass through unchanged.
all_data = pd.get_dummies(all_data, columns=list(cat_data))
all_data.head()

Unnamed: 0,age,psu,target,race_Black African,race_Coloured,race_Indian/Asian,race_White,dwelling_Caravan/tent,dwelling_Cluster house in complex,dwelling_Dwelling/house or brick/concrete block structure on a separate stand or yard or farm,dwelling_Dwelling/house/flat/room in backyard,dwelling_Flat or apartment in a block of flats,dwelling_Informal dwelling/shack in backyard,"dwelling_Informal dwelling/shack not in backyard, e.g in an informal/squatter settlement or on a farm.",dwelling_Other (specify),dwelling_Room/flat let on a property or a larger dwelling/servants quarters/granny flat,dwelling_Semi-Detached house,dwelling_Town house (semi-detached house in complex),dwelling_Traditional dwelling/hut/structure made of traditional materials,dwelling_Unspecified,dwelling_type_Formal,dwelling_type_Informal,dwelling_type_Unspecified,province_code_Eastern Cape,province_code_Free State,province_code_Gauteng,province_code_KwaZulu-Natal,province_code_Limpopo,province_code_Mpumalanga,province_code_North West,province_code_Northern Cape,province_code_Western Cape,metro_code_EC - Buffalo City,metro_code_EC - Nelson Mandela Bay,metro_code_EC - Non Metro,metro_code_FS - Mangaung,metro_code_FS - Non Metro,metro_code_GP - City of Johannesburg,metro_code_GP - City of Tshwane,metro_code_GP - Ekurhuleni,metro_code_GP - Non Metro,metro_code_KZN - Non Metro,metro_code_KZN - eThekwini,metro_code_LP - Non Metro,metro_code_MP - Non Metro,metro_code_NC - Non Metro,metro_code_NW - Non Metro,metro_code_WC - City of Cape Town,metro_code_WC - Non Metro,nationality_South Africa,RTH_Brother/sister/stepbrother/stepsister of person 01,RTH_Father/mother/stepfather/stepmother of person 01,RTH_Grandchild/great grandchild of person 01,RTH_Grandparent/great grandparent of person 01,RTH_Head/acting head,RTH_Husband/wife/partner of person 01,RTH_Non-related persons,RTH_Other relative (e.g. 
in-laws or aunt/uncle) of person 01,RTH_Son/daughter/stepchild/adopted child of person 01,RTH_Unspecified,marital_st_Divorced,marital_st_Living,marital_st_Married,marital_st_Single,marital_st_Widowed,Lang_inside_Afrikaans,Lang_inside_English,Lang_inside_IsiNdebele,Lang_inside_IsiXhosa,Lang_inside_IsiZulu,"Lang_inside_Khoi, Nama and San languages",Lang_inside_Other (Specify ),Lang_inside_Sepedi,Lang_inside_Sesotho,Lang_inside_Setswana,Lang_inside_SiSwati,Lang_inside_Sign language,Lang_inside_Tshivenda,Lang_inside_Unspecified,Lang_inside_Xitsonga,Lang_outside_Afrikaans,Lang_outside_English,Lang_outside_IsiNdebele,Lang_outside_IsiXhosa,Lang_outside_IsiZulu,"Lang_outside_Khoi, Nama and San languages",Lang_outside_Other (Specify ),Lang_outside_Sepedi,Lang_outside_Sesotho,Lang_outside_Setswana,Lang_outside_SiSwati,Lang_outside_Tshivenda,Lang_outside_Unspecified,Lang_outside_Xitsonga,Education_Bachelor�s Degree,Education_Bachelor�s Degree and post-graduate diploma,Education_Certificate with Grade 12/Std 10,Education_Certificate with less than Grade 12/Std 10,Education_Diploma with Grade 12/Std 10,Education_Diploma with less than Grade 12/Std 10,Education_Do not know,Education_Grade 1/Sub A/Class 1,Education_Grade 10/Standard 8/Form 3,Education_Grade 11/Standard 9/Form 4,Education_Grade 12/Standard 10/Form 5/Matric (Exemption *),Education_Grade 12/Standard 10/Form 5/Matric (No Exemption),Education_Grade 2/Sub B/Class 2,Education_Grade 3/Standard 1/AET 1 (Kha Ri Gude; Sanli),Education_Grade 4/Standard 2,Education_Grade 5/Standard 3/AET 2,Education_Grade 6/Standard 4,Education_Grade 7/Standard 5/AET 3,Education_Grade 8/Standard 6/Form 1,Education_Grade 9/Standard 7/Form 2/AET 4,Education_Grade R/0,Education_Higher Diploma (Technikon/University of Technology),Education_Higher degree (Master's; Doctorate),Education_Honours Degree,Education_N4/NTC 4,Education_N5/NTC 5,Education_N6/NTC 6,Education_NTC 1/N1/NC (V) /Level 2,Education_NTC 2/N2/NC (V)/Level 3,Education_NTC 3/N3/NC 
(V)/Level 4,Education_Other,Education_Post-Higher Diploma (Technikon/University of Technology; Master's; Doctoral),Education_Unspecified,lw_work_No,lw_work_Yes,lw_business_No,lw_business_Yes,help_on_household_No,help_on_household_Yes,job_or_business_No,job_or_business_Yes,nature_of_work_A fixed period contract,nature_of_work_Casual,nature_of_work_Permanent,nature_of_work_Seasonal,nature_of_work_Temporary
0,20,66710095,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0
1,42,98510279,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0
2,86,29210321,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,1,0,0,0,1,0,0
3,20,59913860,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0
4,26,79813828,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0


In [30]:
# Undo the earlier concatenation: the first ntrain rows are the training set,
# everything after them is the test set.
train_df = all_data.iloc[:ntrain]
test_df = all_data.iloc[ntrain:]

# Show both shapes as a sanity check
train_df.shape, test_df.shape

((7736, 140), (3316, 140))

In [31]:
# Feature matrix: every training column except the label.
X = train_df.drop(columns='target')
# Label vector, cast to integer.
y = train_df['target'].astype(int)

In [32]:
# Hold out 30% of the labelled rows for local validation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [33]:
#from sklearn.metrics import roc_auc_score

In [65]:
# CatBoost model (original note: "10th submission on the leaderboard")
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

model = CatBoostClassifier(
    n_estimators=80,
    depth=4,
    learning_rate=0.1,
    l2_leaf_reg=4,
    random_state=1234,
)
model.fit(X_train, y_train)

# Positive-class probabilities on the hold-out split
y_pred = model.predict_proba(X_test)[:, 1]

# Report the hold-out ROC AUC
print(f'CAT AUC score on the X_test is: {roc_auc_score(y_test, y_pred)}\n')

0:	learn: 0.5946395	total: 6.78ms	remaining: 535ms
1:	learn: 0.5168850	total: 11.4ms	remaining: 443ms
2:	learn: 0.4596473	total: 14.4ms	remaining: 369ms
3:	learn: 0.4146618	total: 17ms	remaining: 322ms
4:	learn: 0.3807705	total: 19.9ms	remaining: 298ms
5:	learn: 0.3552927	total: 22.7ms	remaining: 279ms
6:	learn: 0.3344873	total: 25.2ms	remaining: 263ms
7:	learn: 0.3194597	total: 27.8ms	remaining: 250ms
8:	learn: 0.3076832	total: 30.4ms	remaining: 239ms
9:	learn: 0.2972736	total: 32.9ms	remaining: 230ms
10:	learn: 0.2897062	total: 35.5ms	remaining: 222ms
11:	learn: 0.2840774	total: 38ms	remaining: 216ms
12:	learn: 0.2794691	total: 40.6ms	remaining: 209ms
13:	learn: 0.2753141	total: 43.3ms	remaining: 204ms
14:	learn: 0.2718458	total: 46.4ms	remaining: 201ms
15:	learn: 0.2692220	total: 49.1ms	remaining: 196ms
16:	learn: 0.2667656	total: 51.6ms	remaining: 191ms
17:	learn: 0.2650922	total: 54.1ms	remaining: 186ms
18:	learn: 0.2636955	total: 56.7ms	remaining: 182ms
19:	learn: 0.2627131	total

In [44]:
# Test-set feature matrix: the encoded test rows without the label column.
test_ = test_df.drop(columns='target')

In [66]:
# Probability of the positive class (column 1 of predict_proba) for each test row.
pred = model.predict_proba(test_)[:,1]

In [None]:
# Preview the test feature matrix.
test_.head()

In [None]:
# Preview the encoded test rows (still including the target column).
test_df.head()

In [67]:
# Write the CatBoost probabilities into the submission frame and save it as cat.csv.
# Original note: "The best submission so far on the leaderboard".
# NOTE(review): the same note appears on the d4.csv (XGBoost) submission cell —
# confirm which file it actually refers to.
sub['target'] = pred
sub.to_csv('cat.csv', index=False)

In [None]:
# Preview the submission frame after the target column has been filled in.
sub.head()

In [34]:
from xgboost import XGBClassifier

In [37]:
from sklearn.metrics import accuracy_score,  mean_squared_error, classification_report, f1_score, roc_auc_score


In [86]:
# XGBoost (DART booster), shallow configuration: 50 trees of depth 4
xgb = XGBClassifier(
    booster='dart',
    objective='binary:logistic',
    n_estimators=50,
    learning_rate=0.08,
    max_depth=4,
    min_child_weight=1,
    gamma=0,
    reg_lambda=3,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
xgb.fit(X_train, y_train)

# Positive-class probabilities on the hold-out split
y_pred = xgb.predict_proba(X_test)[:, 1]

# Report the hold-out ROC AUC
print(f'XGBoost AUC score on the X_test is: {roc_auc_score(y_test, y_pred)}\n')

XGBoost AUC score on the X_test is: 0.6351867073743541



In [161]:
# XGBoost (DART booster), larger configuration: 480 trees of depth 8 with a
# lower learning rate and stronger regularisation. Rebinds `xgb`.
xgb = XGBClassifier(
    booster='dart',
    objective='binary:logistic',
    n_estimators=480,
    learning_rate=0.028,
    max_depth=8,
    min_child_weight=6,
    gamma=0,
    reg_lambda=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
xgb.fit(X_train, y_train)

# Positive-class probabilities on the hold-out split
y_pred = xgb.predict_proba(X_test)[:, 1]

# Report the hold-out ROC AUC
print(f'XGBoost AUC score on the X_test is: {roc_auc_score(y_test, y_pred)}\n')

XGBoost AUC score on the X_test is: 0.6338766733208079



In [162]:
# Raw (un-encoded) test frame without the ID and gender columns.
# NOTE(review): test1 is not referenced again in the visible notebook; the
# predictions below use test_ instead.
test1 = test.drop(columns=['ID', 'gender'])

In [163]:
# Positive-class probability for each test row from whichever model `xgb` is currently bound to.
xgb_pred = xgb.predict_proba(test_)[:,1]

In [164]:
# Write the XGBoost probabilities into the submission frame and save it as d4.csv.
# Original note: "The best submission so far on the leaderboard".
# NOTE(review): the same note appears on the cat.csv (CatBoost) submission cell —
# confirm which file it actually refers to.
sub['target'] = xgb_pred
sub.to_csv('d4.csv', index=False)

In [None]:

# Re-score the test rows with the current `xgb` model and save the result as xgb8.csv.
xgb_pred = xgb.predict_proba(test_)[:,1]
sub['target'] = xgb_pred
sub.to_csv('xgb8.csv', index=False)

In [None]:
# Show the hyperparameters of the current `xgb` model.
xgb.get_params()

In [None]:
# Weighted blend of two previously saved submission files:
# 60% of xgb2.csv plus 40% of xgb4.csv, written to xgb_blend1.csv.
# Original note: Try9 Leaderboard score: 0.0772597130442225
# NOTE(review): neither input file is written by a cell visible in this notebook.
b = pd.read_csv('xgb4.csv')
a = pd.read_csv('xgb2.csv')
#c = pd.read_csv('cat_14th.csv.csv')
blend_a, blend_b = a['target'], b['target']
sub['target'] = 0.6 * blend_a + 0.4 * blend_b
sub.to_csv('xgb_blend1.csv', index=False)

In [85]:
# Weighted blend of two previously saved submission files:
# 60% of xgb2.csv plus 40% of d2.csv, written to xgb_blend.csv.
# Original note: Try9 Leaderboard score: 0.0772597130442225
# NOTE(review): neither input file is written by a cell visible in this notebook.
b = pd.read_csv('d2.csv')
a = pd.read_csv('xgb2.csv')
#c = pd.read_csv('cat_14th.csv.csv')
blend_a, blend_b = a['target'], b['target']
sub['target'] = 0.6 * blend_a + 0.4 * blend_b
sub.to_csv('xgb_blend.csv', index=False)

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of two XGBoost (DART) classifiers.
# NOTE(review): xgb1 and xgb2 are configured identically, seed included, so the
# averaged probabilities presumably match a single model. Vary at least one
# member (hyperparameters, seed or model family) if a real ensemble is intended.
xgb1 = XGBClassifier(learning_rate=0.1,
                     booster='dart',
                     n_estimators=100,
                     max_depth=4,
                     min_child_weight=1,
                     reg_lambda=3,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

xgb2 = XGBClassifier(learning_rate=0.1,
                     booster='dart',
                     n_estimators=100,
                     max_depth=4,
                     min_child_weight=1,
                     reg_lambda=3,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

voting_clf = VotingClassifier(estimators=[('xgb1', xgb1),
                                          ('xgb2', xgb2)],
                              voting='soft', weights=None)
voting_clf = voting_clf.fit(X_train, y_train)

# Score with positive-class probabilities, not hard predict() labels: ROC AUC
# needs a continuous score, and the other AUC figures in this notebook are
# computed from predict_proba(...)[:, 1], so this keeps them comparable.
y_pred_voting_clf = voting_clf.predict_proba(X_test)[:, 1]
print("roc: ", roc_auc_score(y_test, y_pred_voting_clf))