In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from scipy import stats

## Loading the data

In [2]:
# CSV files for the train and test splits (relative paths, resolved against the working directory).
train_file_path, test_file_path = 'train_qWM28Yl.csv', 'test_zo1G9sv.csv'

In [3]:
# Read both splits into dataframes.
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# Only the last expression of a cell is rendered, so a bare `train_df.shape`
# in the middle of the cell is silently discarded; print the shapes instead.
print('train shape:', train_df.shape)
print('test shape:', test_df.shape)

test_df.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,...,No,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
4,ID58597,1.233404,0.02,0.634615,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0


## Data analysis and visualisation

In [None]:
# Bar chart of the `is_claim` value counts, each bar annotated with its count.
claim_counts = train_df['is_claim'].value_counts()

plt.figure(figsize=(10, 10))
ax = claim_counts.plot(kind='bar')
ax.grid()
for position, count in enumerate(claim_counts):
    ax.text(
        position,
        count,
        str(count),
        color='black',
        fontweight='bold',
        fontsize=20,
        horizontalalignment='center',
    )
plt.show()

In [None]:
# Print a summary of the training frame (column dtypes and non-null counts).
train_df.info()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also holds text columns; `DataFrame.corr()` raises on those in
# pandas >= 2.0, so the numeric columns are selected explicitly (this also
# works on older pandas versions).
labels = train_df.select_dtypes(include='number').columns
labels_cor = train_df[labels].corr()

plt.figure(figsize=(15, 12))
sns.heatmap(labels_cor, annot=True)
plt.show()  # render the figure without echoing the Axes repr

## Data preprocessing

In [4]:
class DataPreprocessig:
    """Preprocessing pipeline for the train/test dataframes of this notebook.

    ``get_processed_data`` runs the full pipeline: text columns are encoded,
    numeric columns are standardised, the frame is checked for nulls and, for
    training data only, z-score outliers are removed.

    The instance works on a private copy of the dataframe it is given, so the
    caller's frame is never modified.

    NOTE(review): encoders and scalers are fitted on whichever dataframe is
    passed to the constructor, so the train and test frames are transformed
    with independently fitted statistics/categories. Confirm this is
    acceptable, or refactor so the test frame reuses the transformers fitted
    on the training frame.
    """

    # Identifier and label columns: carried along, but never used as PCA features.
    _NON_FEATURE_COLS = ('policy_id', 'is_claim')
    # Values accepted by ``apply_pca(method=...)``.
    _PCA_METHODS = ('full', 'categorical', 'numerical')

    def __init__(self, df, type = 'train'):
        """ constructor for the class

        Args:
            df (pd.DataFrame): dataframe to be processed. Must contain a
                ``policy_id`` column; an ``is_claim`` column is optional.
            type (str, optional): type of dataframe train/test. Defaults to 'train'.

        Attributes set:
            all_cols: column names of ``df`` as received (before any drop).
            df: private working copy, without ``policy_id``.
            policy_id_list: ``policy_id`` values of the input, in row order.
            is_claim_list: ``is_claim`` values of the input, or ``None`` when
                the column is absent (e.g. for the test split).
            updated_cols: current column names of the working copy.
            categorical_cols: text (object/string dtype) columns to encode.
        """
        self.all_cols = df.columns.tolist()
        # Work on a copy so the caller's dataframe keeps `policy_id` and the
        # constructor can be called again on the same frame.
        self.df = df.copy()
        self.type = type
        self.policy_id_list = self.df.policy_id.values.tolist()
        if 'is_claim' in self.df.columns:
            self.is_claim_list = self.df['is_claim'].values.tolist()
        else:
            self.is_claim_list = None
        self.drop_columns()
        self.updated_cols = self.df.columns.tolist()
        # Text columns are the ones that need encoding. Both the classic
        # `object` dtype and pandas' dedicated string dtypes are recognised.
        is_text = [
            pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
            for dtype in self.df.dtypes
        ]
        self.categorical_cols = self.df.columns[np.array(is_text, dtype=bool)]

    def drop_columns(self):
        """ drop columns which are not required for training the model

        Safe to call repeatedly: a column that is already gone is ignored.
        """
        self.df = self.df.drop(columns=['policy_id'], errors='ignore')
        self.updated_cols = self.df.columns.tolist()

    def enocde_categorical_values(self):
        """ encodes categorical values, if there are only two unique values (yes/no)
            then it is encoded as 1/0, else it is encoded using one hot encoding.

        For two-valued columns the value that sorts first (by its string form)
        becomes 0 and the other becomes 1, so the mapping does not depend on
        the row order of the dataframe. One-hot columns are named
        ``<col>_<i>`` where ``i`` is the position of the category in the
        encoder's output.
        """
        for col in self.categorical_cols:
            unique_vals = self.df[col].unique()
            if len(unique_vals) == 2:
                # Pick the 0-value deterministically; using "first value seen"
                # could give train and test opposite encodings.
                zero_val = sorted(unique_vals, key=str)[0]
                self.df[col] = (self.df[col] != zero_val).astype('int64')
            else:
                # one hot encoding
                encoded = preprocessing.OneHotEncoder().fit_transform(
                    self.df[col].values.reshape(-1, 1)
                ).toarray()
                encoded = pd.DataFrame(
                    encoded,
                    columns=[col + '_' + str(i) for i in range(encoded.shape[1])],
                    # Reuse the frame's index so concat aligns row-for-row even
                    # when the index is not 0..n-1.
                    index=self.df.index,
                )
                self.df = pd.concat([self.df.drop(columns=col), encoded], axis=1)
        self.updated_cols = self.df.columns.tolist()

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    encode_categorical_values = enocde_categorical_values

    def scale_data(self):
        """ scale the data using standard scaler. scaling is performed only for numerical columns
            not for categorical columns.

        A column is scaled when it has a numeric dtype and more than two
        unique values; two-valued (0/1) columns are left untouched.
        """
        scale = StandardScaler()
        for col in self.df.columns:
            if not pd.api.types.is_numeric_dtype(self.df[col]):
                continue
            # if there are more than two unique values, then scale the data
            if len(self.df[col].unique()) > 2:
                self.df[col] = scale.fit_transform(self.df[col].values.reshape(-1, 1))
        self.updated_cols = self.df.columns.tolist()

    def checknull(self):
        """ check if there are any null values in the dataframe
        and raises an exception if there are any null values

        Raises:
            ValueError: if the working dataframe contains at least one null.
        """
        if self.df.isnull().values.any():
            raise ValueError('Null values present in the data')

    def remove_outliers(self):
        """ removes outliers from the dataframe using z score, if the z score is greater than 3
            then the row is removed from the dataframe and is considered as an outlier.
            This step is performed only for numerical columns not for categorical columns.

        ``policy_id`` is attached to the dataframe first so the IDs of the
        surviving rows stay available. Columns are processed one after the
        other and z-scores are computed on the rows still present, exactly as
        rows are filtered.
        """
        # Attach the ids once; on a repeated call the (already filtered) column
        # is kept as-is instead of assigning a list of the wrong length.
        if 'policy_id' not in self.df.columns:
            self.df['policy_id'] = self.policy_id_list
        for col in self.df.columns:
            column = self.df[col]
            # Skip text columns such as policy_id explicitly rather than
            # relying on zscore raising a TypeError for them.
            if not pd.api.types.is_numeric_dtype(column):
                continue
            if len(column.unique()) > 2:
                z = np.abs(stats.zscore(column))
                self.df = self.df[z < 3]
        self.updated_cols = self.df.columns.tolist()

    def _fit_pca(self, cols, prefix=None):
        """ fit a PCA keeping 95% of the variance on ``cols`` and return the
            components as a dataframe that shares the index of ``self.df``.

        Args:
            cols (list): columns of ``self.df`` to decompose.
            prefix (str, optional): when given, components are named
                ``<prefix><i>``; otherwise the default integer names are kept.

        Returns:
            pd.DataFrame: one column per retained component.
        """
        pca = PCA(n_components=0.95, svd_solver='full')
        components = pd.DataFrame(pca.fit_transform(self.df[cols]), index=self.df.index)
        if prefix is not None:
            components.columns = [prefix + str(i) for i in range(components.shape[1])]
        return components

    def apply_pca(self, method='full'):
        """ apply pca to the dataframe.

        ``policy_id`` and ``is_claim`` are never fed to the PCA; when present
        they are kept in the dataframe unchanged.

        Args:
            method (str, optional): method to be used for pca. Defaults to 'full'.
                                    if method is 'full' then all the feature columns are replaced
                                    by their principal components
                                    if method is 'categorical' then components of the columns with
                                    at most two unique values are appended as ``cat_<i>`` (the
                                    source columns are kept)
                                    if method is 'numerical' then the numeric columns with more
                                    than two unique values are replaced by ``pca_<i>`` components

        Raises:
            ValueError: if ``method`` is not one of the three supported values.
        """
        if method not in self._PCA_METHODS:
            raise ValueError(
                "method must be one of " + str(self._PCA_METHODS) + ", got " + repr(method)
            )

        reserved = [col for col in self._NON_FEATURE_COLS if col in self.df.columns]

        if method == 'full':
            feature_cols = [col for col in self.df.columns if col not in reserved]
            components = self._fit_pca(feature_cols)
            if reserved:
                # Re-attach id/label so the target is not lost (or leaked into
                # the components).
                self.df = pd.concat([components, self.df[reserved]], axis=1)
            else:
                self.df = components
        elif method == 'categorical':
            cat_cols = [
                col for col in self.df.columns
                if col not in reserved and len(self.df[col].unique()) <= 2
            ]
            components = self._fit_pca(cat_cols, prefix='cat_')
            # NOTE(review): unlike the 'numerical' branch the source columns
            # are kept here - confirm whether they should be dropped.
            self.df = pd.concat([self.df, components], axis=1)
        else:
            pca_cols = [
                col for col in self.df.columns
                if col not in reserved
                and pd.api.types.is_numeric_dtype(self.df[col])
                and len(self.df[col].unique()) > 2
            ]
            components = self._fit_pca(pca_cols, prefix='pca_')
            # replace the source columns by their components
            self.df = pd.concat([self.df.drop(columns=pca_cols), components], axis=1)

        self.updated_cols = self.df.columns.tolist()

    def get_processed_data(self):
        """ wrapper function to perform all the preprocessing steps

        Order: encode text columns, scale numeric columns, check for nulls,
        then remove outliers (train only). In both modes the returned frame
        contains a ``policy_id`` column again.

        Returns:
            pd.DataFrame: dataframe
        """
        self.enocde_categorical_values()
        self.scale_data()
        self.checknull()
        #self.apply_pca(method='categorical') # no improvement in accuracy
        if self.type == 'train':
            self.remove_outliers()
        else:
            self.df['policy_id'] = self.policy_id_list

        self.updated_cols = self.df.columns.tolist()
        return self.df
    

In [5]:
# Build the processed training frame.
# A copy is passed so `train_df` keeps all of its original columns and this
# cell can be re-run without reloading the CSV.
pp = DataPreprocessig(train_df.copy())

pp_train_df = pp.get_processed_data()

# Policy ids of the rows returned by the preprocessing. They get their own name
# so they can never be mistaken for the test ids (`policy_id_list`) that the
# submission cells use.
train_policy_id_list = pp_train_df.policy_id.values.tolist()
# drop policy id column (it is an identifier, not a feature)
pp_train_df = pp_train_df.drop(columns=['policy_id'])
pp_train_df.head()



Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,population_density,make,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,...,engine_type_4,engine_type_5,engine_type_6,engine_type_7,engine_type_8,engine_type_9,engine_type_10,steering_type_0,steering_type_1,steering_type_2
0,-0.230283,-0.342447,1.422557,-0.783513,-0.671712,-0.620458,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.148188,-0.871359,-0.768362,0.462975,-0.671712,-0.620458,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.555022,-0.871359,-0.690115,-0.835268,-0.671712,-0.620458,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.697883,0.715378,-0.298879,0.158275,-0.671712,-0.620458,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.03584,0.715378,1.34431,0.900969,0.207812,-0.620458,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Build the processed test frame.
# A copy is passed so `test_df` keeps all of its original columns and this
# cell can be re-run without reloading the CSV.
pp_test = DataPreprocessig(test_df.copy(), type='test')

pp_test_df = pp_test.get_processed_data()

# Test policy ids, in row order; used later to build the submission files.
policy_id_list = pp_test_df.policy_id.values.tolist()
# drop policy id column (it is an identifier, not a feature)
pp_test_df = pp_test_df.drop(columns=['policy_id'])
pp_test_df.head()

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,population_density,make,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,...,engine_type_4,engine_type_5,engine_type_6,engine_type_7,engine_type_8,engine_type_9,engine_type_10,steering_type_0,steering_type_1,steering_type_2
0,-0.649573,-1.22482,0.962158,-0.835059,-0.673119,-0.622425,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.732682,1.061771,-0.219379,-0.567777,-0.673119,-0.622425,0,1,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.682845,0.885879,-0.14061,-0.567777,0.20864,-0.622425,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.412892,-1.048929,-0.061841,0.901995,-0.673119,-0.622425,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.499038,-0.873037,1.356003,0.901995,-0.673119,-0.622425,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Train and test the model

In [7]:
# create submission file
def create_submission_file(model, test_df, policy_id_list, file_name='submission.csv'):
    """ creates submission file for the given model and test data as per the
        format required for the competition.

    The file has two columns, ``policy_id`` and ``is_claim``, and is written
    without the dataframe index.

    Args:
        model (_type_): fitted model exposing ``predict``; it is called on ``test_df``
        test_df (pd.DataFrame): test data. pandas dataframe
        policy_id_list (List): list of policy ids, one per row of ``test_df``
                               and in the same order
        file_name (Str): name of the file to be created.
                        Default value is 'submission.csv'

    Returns:
        pd.DataFrame: submission dataframe (the same content that was written)
    """
    y_pred = model.predict(test_df)
    submission_df = pd.DataFrame({'policy_id': policy_id_list, 'is_claim': y_pred})
    submission_df.to_csv(file_name, index=False)
    return submission_df


In [22]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Grid search used for hyperparameter tuning, kept for reference (disabled):
#
# from sklearn.model_selection import GridSearchCV
#
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
# }
#
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 2, n_jobs = -1, verbose = 2)
# grid_search.fit(pp_train_df.drop('is_claim', axis=1), pp_train_df['is_claim'])
#
# # find best parameters
# grid_search.best_params_

# Per the author's note, these hyperparameters come from the grid search above.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=123,
    class_weight='balanced',
)
# Fit on every processed column except the label.
rf.fit(pp_train_df.drop(columns=['is_claim']), pp_train_df['is_claim'])

# Predict on the processed test frame, write the submission file and show the
# distribution of the predicted classes.
submission_df = create_submission_file(rf, pp_test_df, policy_id_list, 'submission.csv')
submission_df.is_claim.value_counts()

0    24962
1    14101
Name: is_claim, dtype: int64

In [17]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=3, random_state=0, class_weight='balanced')
# Fit on every processed column except the label.
dt.fit(pp_train_df.drop(columns=['is_claim']), pp_train_df['is_claim'])

# Grid search for the tree depth, kept for reference (disabled):
#
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
# }
#
# grid_search = GridSearchCV(estimator = dt, param_grid = param_grid, cv = 2, n_jobs = -1, verbose = 2)
# grid_search.fit(pp_train_df.drop('is_claim', axis=1), pp_train_df['is_claim'])
#
# # find best parameters
# grid_search.best_params_

# Written to its own file: the random forest cell writes 'submission.csv', and
# reusing that name here would silently overwrite its predictions.
submission_df = create_submission_file(dt, pp_test_df, policy_id_list, 'submission_dt.csv')
submission_df.is_claim.value_counts()

1    22417
0    16646
Name: is_claim, dtype: int64