In [16]:
# importing necessary modules
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as mse, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Mount Google Drive at /content/drive so the CSV files under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Load the labelled training data from the mounted Drive and preview the first rows.
train = pd.read_csv('/content/drive/MyDrive/vahan_bima/train_BRCpofr.csv')
train.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,2,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400
2,3,Male,Urban,Bachelor,5L-10L,1,8,2599,More than 1,A,Platinum,64212
3,4,Female,Rural,High School,5L-10L,0,7,0,More than 1,A,Platinum,97920
4,5,Male,Urban,High School,More than 10L,1,6,3508,More than 1,A,Gold,59736


In [19]:
def process_df(df):
  """Build the model feature frame from a raw policy-holder DataFrame.

  The ordinal text columns ``qualification``, ``income`` and
  ``num_policies`` are replaced by integer codes, and only the feature
  columns are kept (``id`` and ``cltv`` are dropped when present).

  The caller's frame is never modified. Encoding in place would make the
  function non-idempotent: a second call on the same frame would look up
  the already-encoded integers in the string-keyed mappings and turn the
  three columns into NaN.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data containing every column listed in ``feature_columns``.

  Returns
  -------
  pandas.DataFrame
      A new frame with the feature columns in the order listed below.
      A value that is not a key of its mapping becomes NaN
      (``Series.map`` behaviour).
  """
  mapped_qualification = {
    'High School': 1,
    'Bachelor': 2,
    'Others': 0,
  }
  mapped_num_policies = {
    '1': 1,
    'More than 1': 2,
  }
  mapped_income = {
    '<=2L': 0,
    '2L-5L': 1,
    '5L-10L': 2,
    'More than 10L': 3,
  }
  feature_columns = ['gender', 'area', 'qualification', 'income', 'marital_status',
                     'vintage', 'claim_amount', 'num_policies', 'policy', 'type_of_policy']

  # Select first and copy, so the encoding below never touches the caller's frame.
  features = df[feature_columns].copy()

  features['income'] = features['income'].map(mapped_income)
  features['num_policies'] = features['num_policies'].map(mapped_num_policies)
  features['qualification'] = features['qualification'].map(mapped_qualification)

  return features

In [20]:
# List the column labels of the loaded training frame.
train.columns

Index(['id', 'gender', 'area', 'qualification', 'income', 'marital_status',
       'vintage', 'claim_amount', 'num_policies', 'policy', 'type_of_policy',
       'cltv'],
      dtype='object')

In [21]:
# Feature matrix: integer-encode the ordinal columns via process_df, then
# one-hot encode the remaining text columns.
X = pd.get_dummies(process_df(train))

# Regression target.
Y = train['cltv']


In [22]:
# Display the encoded feature matrix.
X

Unnamed: 0,qualification,income,marital_status,vintage,claim_amount,num_policies,gender_Female,gender_Male,area_Rural,area_Urban,policy_A,policy_B,policy_C,type_of_policy_Gold,type_of_policy_Platinum,type_of_policy_Silver
0,2,2,1,5,5790,2,0,1,0,1,1,0,0,0,1,0
1,1,2,0,8,5080,2,0,1,1,0,1,0,0,0,1,0
2,2,2,1,8,2599,2,0,1,0,1,1,0,0,0,1,0
3,1,2,0,7,0,2,1,0,1,0,1,0,0,0,1,0
4,1,3,1,6,3508,2,0,1,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89387,1,3,0,6,0,2,0,1,1,0,0,1,0,0,1,0
89388,2,2,1,0,3897,2,0,1,0,1,1,0,0,0,1,0
89389,2,2,0,4,3363,2,1,0,0,1,0,1,0,0,1,0
89390,1,1,0,8,0,2,1,0,1,0,0,1,0,0,1,0


In [23]:
# Baseline: gbtree regressor with otherwise default settings, scored on a
# 20% hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=22
)

model = xgboost.XGBRegressor(booster='gbtree')
model.fit(X_train, y_train)

simple_pred = model.predict(X_test)
print('Mean Squared Error:', mse(y_test, simple_pred))
print('R squared:', r2_score(y_test, simple_pred))

Mean Squared Error: 6859822439.514009
R squared: 0.15793845355310598


In [24]:
# Importance scores of the fitted baseline model, one value per column of X.
model.feature_importances_

array([5.0436622e-03, 9.0925107e-03, 1.6858403e-02, 2.6763463e-03,
       2.5970906e-02, 7.7723056e-01, 6.9769216e-03, 0.0000000e+00,
       9.1292039e-02, 0.0000000e+00, 3.6147428e-03, 2.7698429e-02,
       2.2462741e-02, 6.8957508e-03, 6.5699319e-04, 3.5299829e-03],
      dtype=float32)

In [26]:
# Show the XGBoost parameters in effect for the baseline model.
# NOTE(review): the recorded output reports objective 'reg:linear', while the
# tuned cells below pass 'reg:squarederror' explicitly.
model.get_xgb_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'importance_type': 'gain',
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'nthread': 1,
 'objective': 'reg:linear',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'subsample': 1,
 'verbosity': 1}

In [27]:
from sklearn.model_selection import GridSearchCV

# 10-fold grid search over 2 x 2 x 3 = 12 candidates; colsample_bylevel and
# subsample are pinned to a single value each.
# NOTE(review): the search is fitted on all of X/Y, which includes the rows
# later scored as the hold-out set (X_test) -- confirm this is intended.
clf = GridSearchCV(
    estimator=xgboost.XGBRegressor(
        booster='gbtree',
        objective='reg:squarederror',
        n_jobs=16,
    ),
    param_grid={
        'learning_rate': [0.1105, 0.111],
        'n_estimators': [86, 90],
        'max_depth': [3, 4, 5],
        'colsample_bylevel': [0.5],
        'subsample': [0.75],
    },
    cv=10,
)
clf.fit(X, Y)
clf.cv_results_

{'mean_fit_time': array([3.38162937, 3.6120842 , 4.22686529, 4.37205324, 5.1618809 ,
        5.39273748, 3.39617033, 3.5350389 , 4.22348971, 4.41623185,
        5.15759826, 5.40889702]),
 'std_fit_time': array([0.04920899, 0.23720578, 0.02876426, 0.0339276 , 0.0612122 ,
        0.05095527, 0.0324104 , 0.02430348, 0.04755237, 0.03744146,
        0.03518466, 0.04632856]),
 'mean_score_time': array([0.02141674, 0.02174177, 0.02573311, 0.02677021, 0.03051593,
        0.03092589, 0.02077148, 0.02070546, 0.02555971, 0.02590175,
        0.02981527, 0.03214238]),
 'std_score_time': array([0.00090859, 0.00073992, 0.00170114, 0.00343242, 0.00164499,
        0.00064996, 0.00179274, 0.00068218, 0.00199323, 0.00062848,
        0.00056886, 0.00200383]),
 'param_colsample_bylevel': masked_array(data=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
                    0.5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False

In [28]:
# Parameter combination selected by the grid search.
clf.best_params_

{'colsample_bylevel': 0.5,
 'learning_rate': 0.111,
 'max_depth': 3,
 'n_estimators': 86,
 'subsample': 0.75}

In [None]:
# Baseline R squared to beat (default XGBRegressor on the hold-out split): 0.15793845355310598

In [29]:
# Re-create the same hold-out split (random_state=22) and score a model built
# with the parameters chosen by the grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=22
)

model = xgboost.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.111,
    n_estimators=86,
    max_depth=3,
    colsample_bylevel=0.5,
    subsample=0.75,
)
model.fit(X_train, y_train)

simple_pred = model.predict(X_test)
print('Mean Squared Error:', mse(y_test, simple_pred))
print('R squared:', r2_score(y_test, simple_pred))

Mean Squared Error: 6852353228.783059
R squared: 0.15885531914172712


In [30]:
# Final model: same tuned parameters, fitted on every labelled row (no hold-out).
model_xgb = xgboost.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.111,
    n_estimators=86,
    max_depth=3,
    colsample_bylevel=0.5,
    subsample=0.75,
)
model_xgb.fit(X, Y)


XGBRegressor(colsample_bylevel=0.5, learning_rate=0.111, n_estimators=86,
             objective='reg:squarederror', subsample=0.75)

In [31]:
# Score the unlabelled test file with the model fitted on all training rows.
df_test=pd.read_csv('/content/drive/MyDrive/vahan_bima/test_koRSKBP.csv')
X_test=process_df(df_test)
X_test=pd.get_dummies(X_test)
# get_dummies only creates columns for categories present in this file, so the
# result can differ from the training matrix. Align it to the training columns:
# dummies absent here are added as all-zero, columns unseen in training are
# dropped, and the column order matches what model_xgb was fitted on.
X_test=X_test.reindex(columns=X.columns, fill_value=0)
y_pred=model_xgb.predict(X_test)
# Attach the predictions to the raw test frame so they can be exported with 'id'.
df_test['cltv']=y_pred

In [32]:
# Keep only the identifier and the predicted value, then write them without the index.
df_sub = df_test.loc[:, ['id', 'cltv']]
df_sub.to_csv('xgb_final_submission.csv', index=False)

In [33]:
# Download the submission CSV written by the previous cell via the Colab files helper.
files.download('xgb_final_submission.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>