In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import category_encoders as ce   # version 1.2.8
#from functools import partial
from sklearn.datasets import fetch_openml

In [2]:
def load_mtpl2(n_samples=100000):
    """Fetch the freMTPL2 frequency and severity tables from OpenML and join them.

    Parameters
    ----------
    n_samples : int or None, default=100000
        Number of leading rows of the joined table to return. ``None``
        returns every row.

    Returns
    -------
    pandas.DataFrame
        The frequency table indexed by integer ``IDpol``, left-joined with the
        severity table summed per ``IDpol``. ``ClaimAmount`` is 0 for policies
        without a severity record. The result is an independent copy, so
        callers can add columns to it without touching the full joined table.
    """
    # freMTPL2freq dataset from https://www.openml.org/d/41214
    df_freq = fetch_openml(data_id=41214, as_frame=True)['data']
    df_freq['IDpol'] = df_freq['IDpol'].astype(int)
    df_freq = df_freq.set_index('IDpol')
    # freMTPL2sev dataset from https://www.openml.org/d/41215
    df_sev = fetch_openml(data_id=41215, as_frame=True)['data']
    # sum ClaimAmount over identical IDs
    df_sev = df_sev.groupby('IDpol').sum()
    df = df_freq.join(df_sev, how="left")
    # Assign the filled column back: an in-place fillna on df["ClaimAmount"]
    # acts on a temporary object and is not propagated under copy-on-write.
    df["ClaimAmount"] = df["ClaimAmount"].fillna(0)
    # unquote string fields
    for column_name in df.columns[df.dtypes.values == object]:
        df[column_name] = df[column_name].str.strip("'")
    # Return a copy rather than a slice so later column assignments are safe.
    return df.iloc[:n_samples].copy()
  
# Load the first 677,990 rows of the joined frequency/severity table.
data = load_mtpl2(n_samples=677990)

  warn("Version {} of dataset {} is inactive, meaning that issues have "
  warn("Version {} of dataset {} is inactive, meaning that issues have "


# This preprocessing step is only used for the Hurdle Model or cascades. 

In [3]:
# Claims and exposure: cap the raw columns in place.
for capped_in_place, ceiling in (("ClaimNb", 4), ("Exposure", 1)):
    data[capped_in_place] = data[capped_in_place].clip(upper=ceiling)

# Capped copies of the rating factors; the original columns are kept as-is.
for source_col, ceiling in (("BonusMalus", 150), ("VehAge", 30), ("DrivAge", 85)):
    data[f"{source_col}_capped"] = data[source_col].clip(upper=ceiling)

# Natural logarithm of the density column.
data["LogDensity"] = np.log(data["Density"])


# Vehicle brand
def VehBrand_class(data):
    """Add a grouped ``VehBrand_class`` column derived from ``VehBrand``.

    Mapping:
      * ``B1``, ``B2``, ``B4``, ``B6``, ``B10``   -> ``'C1'``
      * ``B3``, ``B5``, ``B11``, ``B13``, ``B14`` -> ``'C2'``
      * anything else (including missing values)  -> ``'C4'``

    This grouping produces no ``'C3'`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``VehBrand`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``VehBrand_class`` column added.
    """
    brand = data["VehBrand"]
    # Vectorised membership tests replace the per-row Python loop.
    conditions = [
        brand.isin(['B2', 'B4', 'B6', 'B10', 'B1']),
        brand.isin(['B3', 'B11', 'B5', 'B13', 'B14']),
    ]
    data["VehBrand_class"] = np.select(conditions, ['C1', 'C2'], default='C4')
    return data

# Add the grouped vehicle-brand column to the working frame.
data = VehBrand_class(data)

# Residential area
def Area_class(data):
    """Add a grouped ``Area_class`` column derived from ``Area``.

    Mapping:
      * ``A``, ``B``, ``C``                      -> ``'C1'``
      * ``D``                                    -> ``'C2'``
      * ``E``                                    -> ``'C3'``
      * anything else (including missing values) -> ``'C4'``

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``Area`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``Area_class`` column added.
    """
    area = data["Area"]
    # Vectorised membership tests replace the per-row Python loop.
    conditions = [
        area.isin(['A', 'B', 'C']),
        area.isin(['D']),
        area.isin(['E']),
    ]
    data["Area_class"] = np.select(conditions, ['C1', 'C2', 'C3'], default='C4')
    return data

# Add the grouped area column to the working frame.
data = Area_class(data)

# Residential region
def Region_class(data):
    """Add a grouped ``Region_class`` column derived from ``Region``.

    Mapping:
      * ``R21``, ``R94``, ``R11``, ``R22``, ``R74`` -> ``'C1'``
      * ``R91``, ``R82``, ``R42``                   -> ``'C2'``
      * ``R93``, ``R53``                            -> ``'C3'``
      * ``R26``, ``R25``, ``R52``, ``R31``, ``R54``, ``R73``,
        ``R23``, ``R72``, ``R83``, ``R41``, ``R43``, ``R24`` -> ``'C4'``

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Region`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``Region_class`` column added.

    Raises
    ------
    ValueError
        If ``Region`` holds a value (or a missing value) outside the mapping.
        There is no fallback class for regions, so the offending codes are
        reported and ``data`` is left unmodified.
    """
    groups = {
        'C1': ['R21', 'R94', 'R11', 'R22', 'R74'],
        'C2': ['R91', 'R82', 'R42'],
        'C3': ['R93', 'R53'],
        'C4': ['R26', 'R25', 'R52', 'R31', 'R54', 'R73',
               'R23', 'R72', 'R83', 'R41', 'R43', 'R24'],
    }
    region = data["Region"]

    # Validate first: without a fallback class, an unlisted code would
    # otherwise surface only as an opaque length-mismatch error.
    known_codes = [code for codes in groups.values() for code in codes]
    is_known = region.isin(known_codes)
    if not is_known.all():
        unmapped = sorted({str(value) for value in region[~is_known].unique()})
        raise ValueError(
            "Region_class: no class defined for Region value(s): "
            + ", ".join(unmapped)
        )

    conditions = [region.isin(codes) for codes in groups.values()]
    # The default is never selected because every row passed the check above.
    data["Region_class"] = np.select(conditions, list(groups), default='')
    return data

# Add the grouped region column to the working frame.
data = Region_class(data)

# VehGas is not regrouped; copy it under the *_class naming used by the
# other encoded columns.
data["VehGas_class"] = data["VehGas"]

In [4]:
# One-hot encode the four grouped categorical columns. Despite its name,
# `ce_ord` is a one-hot encoder; ClaimNb is passed as the target argument.
ce_ord = ce.one_hot.OneHotEncoder(
    cols=[
        "Region_class",
        "Area_class",
        "VehBrand_class",
        "VehGas_class",
    ]
)
data_2 = ce_ord.fit_transform(data, data["ClaimNb"])

  elif pd.api.types.is_categorical(cols):


In [5]:
# Preview the first rows of the encoded frame.
data_2.head()

Unnamed: 0_level_0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,...,Area_class_1,Area_class_2,Area_class_3,Area_class_4,Region_class_1,Region_class_2,Region_class_3,Region_class_4,VehGas_class_1,VehGas_class_2
IDpol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,...,1,0,0,0,1,0,0,0,1,0
3,1.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,...,1,0,0,0,1,0,0,0,1,0
5,1.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,...,0,1,0,0,0,1,0,0,0,1
10,1.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,...,0,1,0,0,0,0,1,0,0,1
11,1.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,...,0,1,0,0,0,0,1,0,0,1


In [6]:
# Hold out 15% of the encoded rows for validation (fixed seed), then write
# each part to CSV. index=False means the IDpol index is not written.
X_train, X_valid = train_test_split(data_2, test_size=0.15, random_state=58)
for split_frame, csv_path in ((X_train, "data_learn.csv"), (X_valid, "data_test.csv")):
    split_frame.to_csv(csv_path, index=False)

In [8]:
# Distribution of the capped claim count in the training split.
X_train["ClaimNb"].value_counts()

0.0    547373
1.0     27333
2.0      1504
3.0        67
4.0        14
Name: ClaimNb, dtype: int64

In [9]:
# Distribution of the capped claim count in the validation split.
X_valid["ClaimNb"].value_counts()

0.0    96557
1.0     4845
2.0      280
3.0       15
4.0        2
Name: ClaimNb, dtype: int64