In [20]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as mse, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [21]:
# Load the training data from the CSV in the working directory and preview
# the first five rows; 'cltv' is the column used as the target further down.
df_train = pd.read_csv('train_BRCpofr.csv')
df_train.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,2,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400
2,3,Male,Urban,Bachelor,5L-10L,1,8,2599,More than 1,A,Platinum,64212
3,4,Female,Rural,High School,5L-10L,0,7,0,More than 1,A,Platinum,97920
4,5,Male,Urban,High School,More than 10L,1,6,3508,More than 1,A,Gold,59736


In [22]:
# Hold out 20% of the rows for validation; every column except the target
# 'cltv' is kept as a feature. The seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns='cltv'),
    df_train['cltv'],
    test_size=0.2,
    random_state=22,
)

In [23]:
# Baseline: predict the mean training target for every validation row.
pred_cltv = np.full(len(y_test), y_train.mean())

In [24]:
# Mean squared error of the constant baseline on the validation split.
# Arguments follow the (y_true, y_pred) order; the value is unchanged
# because MSE is symmetric in its two arguments.
mse(y_test, pred_cltv)

8146496381.163936

# Baseline MSE: 8146496381.16 (RMSE ≈ 90257.94, directly comparable to the per-feature RMSE values reported below)

In [25]:
# Defining a function to check a benchmark model's performance depending on a single column
def check_by_single_feature(col):
    """Benchmark a "group mean" predictor built from one feature column.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    For every distinct value of ``col`` in the training split it computes the
    mean, median and count of ``cltv`` and prints that table. Each validation
    row is then predicted as the training mean of its group, and the root mean
    squared error against ``y_test`` is printed.

    Parameters
    ----------
    col : str
        Name of a column present in both ``X_train`` and ``X_test``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    train = pd.concat([X_train, y_train], axis=1)
    col_type = pd.pivot_table(train, values='cltv', index=[col], aggfunc=['mean', 'median', 'count'])
    print(col_type)

    # Vectorized lookup of each validation row's group mean. A category that
    # never occurs in the training split has no group mean (the per-row .loc
    # lookup would raise KeyError), so fall back to the overall training mean.
    group_means = col_type[('mean', 'cltv')]
    col_type_mean = X_test[col].map(group_means).fillna(y_train.mean())

    print('\n',col,'type Root Mean Squared error: {}\n'.format(mse(y_test,col_type_mean)**0.5))
    
    

In [26]:
# Columns evaluated one at a time by the single-feature benchmark.
cat_cols = [
    'gender',
    'area',
    'qualification',
    'income',
    'marital_status',
    'vintage',
    'num_policies',
    'policy',
    'type_of_policy',
]

In [27]:
# Run the single-feature benchmark for every column in cat_cols; each call
# prints the per-group statistics table and the resulting validation RMSE.
for col in cat_cols:
    check_by_single_feature(col)

                mean median  count
                cltv   cltv   cltv
gender                            
Female  98451.456234  67122  31234
Male    97631.959483  65772  40279

 gender type Root Mean Squared error: 90263.48597867311

                mean median  count
                cltv   cltv   cltv
area                              
Rural   79735.607713  63888  21627
Urban  105903.630117  67548  49886

 area type Root Mean Squared error: 89425.89211428036

                       mean median  count
                       cltv   cltv   cltv
qualification                            
Bachelor       98758.507066  66858  31420
High School    99006.560837  66360  37091
Others         77383.690873  60636   3002

 qualification type Root Mean Squared error: 90168.83479786519

                        mean median  count
                        cltv   cltv   cltv
income                                    
2L-5L          109628.005626  68262  17064
5L-10L          94858.800428  65772  42035
<=2L

In [28]:
# List the available columns before choosing the model's feature set.
df_train.columns

Index(['id', 'gender', 'area', 'qualification', 'income', 'marital_status',
       'vintage', 'claim_amount', 'num_policies', 'policy', 'type_of_policy',
       'cltv'],
      dtype='object')

In [29]:
# Feature columns kept by process_df ('id' and the target 'cltv' are left out).
imp_cols = [
    'area',
    'gender',
    'qualification',
    'income',
    'marital_status',
    'vintage',
    'num_policies',
    'policy',
    'type_of_policy',
    'claim_amount',
]

In [30]:
def process_df(df, cols=None):
    """Integer-encode selected categorical columns and select model features.

    The 'area', 'qualification', 'num_policies' and 'income' columns are
    replaced by the integer codes defined below. Values that are not keys of
    the corresponding mapping become NaN (standard ``Series.map`` behaviour).

    The input frame is NOT modified: encoding is applied to a copy, so the
    function can be called repeatedly on the same raw frame (e.g. when a
    notebook cell is re-run) and always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the four encoded columns and every
        column named in ``cols``.
    cols : list of str, optional
        Columns to return, in order. Defaults to the module-level ``imp_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``cols``, with the four columns encoded.
    """
    mapped_area={
    'Rural':1,
    'Urban':2
    }
    mapped_qualification={
    'High School':2,
    'Bachelor':3,
    'Others':1
    }

    mapped_num_policies={
    '1':0,
    'More than 1':1,
    }
    mapped_income={
    '<=2L':0,
    '2L-5L':1,
    '5L-10L':2,
    'More than 10L':3
    }

    if cols is None:
        cols = imp_cols

    # Work on a copy so the caller's frame keeps its raw string categories;
    # mapping an already-encoded column a second time would turn it into NaN.
    processed = df.copy()
    processed['income'] = processed['income'].map(mapped_income)
    processed['num_policies'] = processed['num_policies'].map(mapped_num_policies)
    processed['qualification'] = processed['qualification'].map(mapped_qualification)
    processed['area'] = processed['area'].map(mapped_area)

    return processed[list(cols)]

In [31]:
# Encode the ordinal columns, then one-hot encode whatever string columns remain.
X = pd.get_dummies(process_df(df_train))
y = df_train['cltv']

# Re-split with the same seed and 20% hold-out, now on the encoded features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

# Using Linear regression 

In [32]:
# Ordinary least-squares regressor constructed with its default settings.
linear_model=LinearRegression()

In [33]:
# Fit on the training portion of the encoded feature matrix.
linear_model.fit(X_train,y_train)

In [34]:
# Predict on the held-out validation rows.
y_pred=linear_model.predict(X_test)

In [35]:
# Coefficient of determination (R^2) of the predictions on the validation split.
r2_score(y_test,y_pred)

0.14950573868473715

In [36]:
# Load the competition test file and apply the same preprocessing as training.
# NOTE: from here on X_test / y_pred refer to the submission data and no
# longer to the validation split created earlier.
df_test=pd.read_csv('test_koRSKBP.csv')
X_test=process_df(df_test)
X_test=pd.get_dummies(X_test)
# get_dummies is applied to this file independently, so its columns can differ
# in set or order from the training matrix (e.g. a category missing here).
# Align to the columns the model was fitted on; absent dummies are filled with 0.
X_test=X_test.reindex(columns=X_train.columns, fill_value=0)
y_pred=linear_model.predict(X_test)

In [37]:
# Attach the predictions to the test frame as a new 'cltv' column.
df_test['cltv']=y_pred

In [38]:
# The submission file holds only the row identifier and the predicted value.
df_sub = df_test.loc[:, ['id', 'cltv']]
df_sub.to_csv('linear_regression_final.csv', index=False)