In [None]:
#!pip install keras==2.12.0


In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [None]:
import openai

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import Adam

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, classification_report, mean_squared_error
from sklearn.calibration import CalibratedClassifierCV, calibration_curve


In [None]:
#Set parameters to see all data
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [None]:
# Read dataset

%%time
df_path = 'Enrian DS Assignment Customer Descriptions.xlsx'

try:
    df_desc = pd.read_excel(df_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



df_desc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           10 non-null     int64 
 1   Name         10 non-null     object
 2   Description  10 non-null     object
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes
CPU times: user 507 ms, sys: 48.2 ms, total: 555 ms
Wall time: 764 ms


In [None]:
df_desc.head()

Unnamed: 0,ID,Name,Description
0,191150,Julia,"Meet Julia, a 34-year-old woman living in the ..."
1,76655,Carlos,Carlos is a 52-year-old man who prides himself...
2,98173,Sophie,"At 28 years old, Sophie finally steps into car..."
3,906632,Victor,"Victor, aged 45, has always been a keen observ..."
4,803321,Emily,"Emily, a 29-year-old professional from region ..."


In [None]:
text_df = df_desc.copy()

In [None]:
# Function to query the GPT chat model
def extract_with_chat_model(description):
    prompt = f"""
    Extract the following features from this customer description:
    1. Gender
    2. Age
    3. Driving License
    4. Region Code
    5. Previously Insured
    6. Vehicle Age
    7. Vehicle Damage
    8. Annual Premium
    9. Policy Sales Channel
    10. Tenure
    11. Response

    Description: {description}
    """

    # Query the OpenAI API using the `v1/chat/completions` endpoint
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant that extracts structured data from text descriptions."},
            {"role": "user", "content": prompt}
        ]
    )

    # Parse the response
    response_text = response.choices[0].message['content'].strip()
    features = {}
    for line in response_text.splitlines():
        if ':' in line:
            key, value = map(str.strip, line.split(':', 1))
            features[key.lower().replace(' ', '_')] = value
    return features


In [None]:
# Extract features for each description using GPT-4
extracted_features = df_desc['Description'].apply(extract_with_chat_model)
extracted_df = pd.DataFrame(extracted_features.tolist())
extracted_df.insert(0, 'ID', df_desc['ID'])
extracted_df.insert(1, 'Name', df_desc['Name'])

In [None]:
extracted_df.head()

Unnamed: 0,ID,Name,1._gender,2._age,3._driving_license,4._region_code,5._previously_insured,6._vehicle_age,7._vehicle_damage,8._annual_premium,9._policy_sales_channel,10._tenure,11._response
0,191150,Julia,Female,34,No,410,No,4 years,No,"$1,250",88,60 days,Not specified.
1,76655,Carlos,Man,52,Has driving license for over 30 years,302,Yes,10 years,"Yes, has minor scrapes and accidents","$2,000",45,250 days,Not specified
2,98173,Sophie,Female,28,Yes,118,No,1 year,No,"$1,100",12,30 days,Not mentioned
3,906632,Victor,Male,45,Yes (He has a driver's license for over 25 years),207,No (He only just acquired vehicle insurance),3 years,No (His vehicle is in immaculate condition wit...,"$1,500",Channel 90,10 days (Insurance vintage is barely 10 days old),Not Provided
4,803321,Emily,Female,29,Yes,336,Not specified,5 years old,Yes (significant accident last year),"$1,350",33,Over 100 days,Not specified


In [None]:
# Read dataset

%%time
data_mart_path = 'Enrian_DS_Assignment_Data.csv'

try:
    data_mart_df = pd.read_csv(data_mart_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



data_mart_df.shape

CPU times: user 566 ms, sys: 243 ms, total: 809 ms
Wall time: 857 ms


(381109, 12)

In [None]:
# Method for reducing the memory usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
extracted_df.columns

Index(['ID', 'Name', '1._gender', '2._age', '3._driving_license', '4._region_code', '5._previously_insured', '6._vehicle_age', '7._vehicle_damage', '8._annual_premium', '9._policy_sales_channel', '10._tenure', '11._response'], dtype='object')

In [None]:
def remove_prefixes(columns):
    # Define a list to store the final column names
    final_columns = []

    # Iterate over the input columns
    for column in columns:
        # Split the column name by '.' and get the last part
        parts = column.split('.')
        final_column = parts[-1]

        # Remove leading underscores from the final column name
        final_column = final_column.lstrip('_')

        # Append the final column name to the list
        final_columns.append(final_column)

    return final_columns


In [None]:
# Drop unused parts
final_columns = remove_prefixes(extracted_df.columns)
print(final_columns)


['ID', 'Name', 'gender', 'age', 'driving_license', 'region_code', 'previously_insured', 'vehicle_age', 'vehicle_damage', 'annual_premium', 'policy_sales_channel', 'tenure', 'response']


In [None]:
final_columns

['ID',
 'Name',
 'gender',
 'age',
 'driving_license',
 'region_code',
 'previously_insured',
 'vehicle_age',
 'vehicle_damage',
 'annual_premium',
 'policy_sales_channel',
 'tenure',
 'response']

In [None]:
# Rename the columns
extracted_df.rename(columns=dict(zip(extracted_df.columns, final_columns)), inplace=True)
extracted_df.head()

Unnamed: 0,ID,Name,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,tenure,response
0,191150,Julia,Female,34,No,410,No,4 years,No,"$1,250",88,60 days,Not specified.
1,76655,Carlos,Man,52,Has driving license for over 30 years,302,Yes,10 years,"Yes, has minor scrapes and accidents","$2,000",45,250 days,Not specified
2,98173,Sophie,Female,28,Yes,118,No,1 year,No,"$1,100",12,30 days,Not mentioned
3,906632,Victor,Male,45,Yes (He has a driver's license for over 25 years),207,No (He only just acquired vehicle insurance),3 years,No (His vehicle is in immaculate condition wit...,"$1,500",Channel 90,10 days (Insurance vintage is barely 10 days old),Not Provided
4,803321,Emily,Female,29,Yes,336,Not specified,5 years old,Yes (significant accident last year),"$1,350",33,Over 100 days,Not specified


In [None]:
# Replace uppercase characters of dataframe column names with lowercase
data_mart_df.columns = data_mart_df.columns.str.lower()
extracted_df.columns = extracted_df.columns.str.lower()

In [None]:
extracted_df['gender'].value_counts()


gender
Female    5
Male      4
Man       1
Name: count, dtype: int64

In [None]:
# Create a new column 'is_female' with boolean values
data_mart_df['is_female'] = data_mart_df['gender'] == 'Female'
extracted_df['is_female'] = extracted_df['gender'] == 'Female'
extracted_df['is_female'] = extracted_df['gender'] == 'Woman'


# Convert boolean values to 1 for True and 0 for False
data_mart_df['is_female'] = data_mart_df['is_female'].astype(int)
extracted_df['is_female'] = extracted_df['is_female'].astype(int)

# Drop the 'gender' column from the DataFrame
data_mart_df.drop(columns=['gender'], inplace=True)
extracted_df.drop(columns=['gender'], inplace=True)

In [None]:
def update_vehicle_damage(value):
    # Check if 'Yes' is present in the value
    if 'Yes' in value:
        return 'Yes'
    # If not 'Yes', set as 'No'
    else:
        return 'No'

In [None]:
# Apply the custom function to update the 'vehicle_damage' column
extracted_df['vehicle_damage'] = extracted_df['vehicle_damage'].apply(update_vehicle_damage)

# Print the updated value counts
print(extracted_df['vehicle_damage'].value_counts())



vehicle_damage
No     7
Yes    3
Name: count, dtype: int64


In [None]:
# Create a new column 'is_damage' with boolean values
data_mart_df['is_damage'] = data_mart_df['vehicle_damage'] == 'Yes'
extracted_df['is_damage'] = extracted_df['vehicle_damage'] == 'Yes'

# Convert boolean values to 1 for True and 0 for False
data_mart_df['is_damage'] = data_mart_df['is_damage'].astype(int)
extracted_df['is_damage'] = extracted_df['is_damage'].astype(int)

# Drop the 'vehicle_damage' column from the DataFrame
data_mart_df.drop(columns=['vehicle_damage'], inplace=True)
extracted_df.drop(columns=['vehicle_damage'], inplace=True)

In [None]:
extracted_df.vehicle_age.value_counts()

vehicle_age
3 years        2
4 years        1
10 years       1
1 year         1
5 years old    1
2 years        1
15 years       1
New            1
8 years        1
Name: count, dtype: int64

In [None]:
# Define a mapping dictionary
mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2, '3 years': 2, '4 years': 2,
           '10 years': 2, '5 years': 2, '15 years': 2, '8 years': 2, '2 years': 1, 'New': 0,
           '1 years': 1}

# Map the values in the 'vehicle_age' column to integers
data_mart_df['vehicle_age'] = data_mart_df['vehicle_age'].map(mapping)
extracted_df['vehicle_age'] = extracted_df['vehicle_age'].map(mapping)


# Display the updated DataFrame
data_mart_df.head()

Unnamed: 0,id,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,response,is_female,is_damage
0,1,44,1,28,0,2,487,26,217,1,0,1
1,2,76,1,3,0,1,404,26,183,0,0,0
2,3,47,1,28,0,2,461,26,27,1,0,1
3,4,21,1,11,1,0,345,152,203,0,0,0
4,5,29,1,41,1,0,331,152,39,0,1,0


In [None]:
def update_driver_license(value):
    # Check if 'No' is present in the value
    if 'No' in value:
        return '0'
    # Check if 'Not' is present in the value
    elif 'Not' in value:
        return '0'
    # If neither 'No' or 'Not, set as '1'
    else:
        return '1'

In [None]:
# Apply the custom function to update the 'vehicle_damage' column
extracted_df['driving_license'] = extracted_df['driving_license'].apply(update_driver_license)

# Print the updated value counts
print(extracted_df['driving_license'].value_counts())

driving_license
1    9
0    1
Name: count, dtype: int64


In [None]:
"""def replace_cat_info_to_count(df, col):
    # Create a dictionary with the counts of each region code
    count_dict = df[col].value_counts().to_dict()

    # Replace region code with repeating number of that region code
    df[col] = df[col].map(count_dict)

    return df, count_dict"""

def replace_cat_info_to_count(df1, df2, df3, col):
    # Concatenate the two dataframes along rows
    combined_df = pd.concat([df1, df2, df3], axis=0)

    # Create a dictionary with the counts of each category in the combined dataframe
    count_dict = combined_df[col].value_counts().to_dict()

    # Replace category with the count of occurrences in the combined dataframe
    df1[col] = df1[col].map(count_dict)
    df2[col] = df2[col].map(count_dict)
    df3[col] = df3[col].map(count_dict)

    df1[col] = scaler.fit_transform(df1[col].values.reshape(-1, 1))
    df2[col] = scaler.fit_transform(df2[col].values.reshape(-1, 1))
    df2[col] = scaler.fit_transform(df2[col].values.reshape(-1, 1))



    return df1, df2, df3


In [None]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# Method for normalization to numeric columns
def normalize_data(dataframe, columns):

    normalized_dataframe = dataframe.copy()

    # Initialize MinMaxScaler
    #scaler = MinMaxScaler()

    # Check if the specified columns exist in the DataFrame
    existing_columns = [col for col in columns if col in normalized_dataframe.columns]

    if existing_columns:
        # Fit and transform the specified columns using MinMaxScaler
        normalized_dataframe[existing_columns] = scaler.fit_transform(normalized_dataframe[existing_columns])

    return normalized_dataframe

In [None]:
# Calculate the mean of the 'vehicle_age' column
vehicle_age_mean = extracted_df['vehicle_age'].mean()

# Fill missing values with the mean
extracted_df['vehicle_age'].fillna(vehicle_age_mean, inplace=True)

In [None]:
# Remove the dollar sign and comma from the values and convert them to integers
extracted_df['annual_premium'] = extracted_df['annual_premium'].str.replace('$', '').str.replace(',', '').astype(int)

In [None]:
# Define a function to convert tenure values to numeric
def convert_tenure(value):
    if 'Over' in value:
        return 365
    elif 'days' in value:
        return int(value.split()[0])
    else:
        return np.nan  # Handle unexpected cases as needed

# Apply the function to the tenure column
extracted_df['tenure'] = extracted_df['tenure'].apply(convert_tenure)


In [None]:
get_scaler_cols = ['vehicle_age', 'annual_premium', 'tenure']

data_mart_df = normalize_data(data_mart_df, get_scaler_cols)
extracted_df = normalize_data(extracted_df, get_scaler_cols)

data_mart_df[get_scaler_cols].head()

Unnamed: 0,vehicle_age,annual_premium,tenure
0,1.0,0.070259,0.716263
1,0.5,0.057443,0.598616
2,1.0,0.066245,0.058824
3,0.0,0.048332,0.66782
4,0.0,0.04617,0.100346


In [None]:
# Extract integer values from the 'age' column
extracted_df['age'] = extracted_df['age'].str.extract(r'(\d+)').astype(float)

In [None]:
def update_previously_insured(value):
    # Check if 'Yes' is present in the value
    if 'Yes' in value:
        return 1
    # If not 'Yes', set as 'No'
    else:
        return 0

# Apply the custom function to update the 'previously_insured' column
extracted_df['previously_insured'] = extracted_df['previously_insured'].apply(update_previously_insured)

In [None]:

# Remove non-numeric characters from the 'policy_sales_channel' column
extracted_df['policy_sales_channel'] = extracted_df['policy_sales_channel'].str.replace(r'\D', '', regex=True)

In [None]:
extracted_df.previously_insured.unique()

array([0, 1])

In [None]:
extracted_df.head(10)

Unnamed: 0,id,name,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,response,is_female,is_damage
0,191150,Julia,34.0,0,410,0,1.0,0.24,88,0.140845,Not specified.,0,0
1,76655,Carlos,52.0,1,302,1,1.0,0.84,45,0.676056,Not specified,0,1
2,98173,Sophie,28.0,1,118,0,0.8125,0.12,12,0.056338,Not mentioned,0,0
3,906632,Victor,45.0,1,207,0,1.0,0.44,90,0.0,Not Provided,0,0
4,803321,Emily,29.0,1,336,0,0.8125,0.32,33,1.0,Not specified,0,1
5,639179,Tom,39.0,1,256,1,0.5,0.4,22,0.084507,Confident in his coverage choices,0,0
6,784734,Linda,47.0,1,519,1,1.0,0.68,76,1.0,Not mentioned.,0,0
7,598542,Jeremy,30.0,1,403,0,1.0,1.0,53,0.169014,Not mentioned,0,0
8,282861,Fiona,26.0,1,322,1,0.0,0.0,19,0.014085,Positive (implied),0,0
9,69740,Mark,55.0,1,711,0,1.0,0.52,102,0.225352,Not mentioned.,0,1


In [None]:
X = data_mart_df.drop(['response'], axis=1) #features (independent variables)
y = data_mart_df['response'] #target (dependent variable)
z = extracted_df.drop(['response', 'name'], axis=1) #features (independent variables)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape, z.shape

((304887, 11), (76222, 11), (304887,), (76222,), (10, 11))

In [None]:
result_df = pd.DataFrame()
result_df['id'] = X_test['id'].copy()
result_df['actual_response'] = y_test.values
result_df.head()

Unnamed: 0,id,actual_response
200222,200223,0
49766,49767,0
172201,172202,0
160713,160714,0
53272,53273,0


In [None]:
result_df.head()

Unnamed: 0,id,actual_response
200222,200223,0
49766,49767,0
172201,172202,0
160713,160714,0
53272,53273,0


In [None]:
X_train['age'].min(), X_train['age'].mean()

(20, 38.835581707321076)

In [None]:
# Create bins for 'age'
bins = [X_train['age'].min(),
        X_train['age'].mean() - X_train['age'].std(),
        X_train['age'].mean(),
        X_train['age'].mean() + X_train['age'].std(),
        X_train['age'].max()]

# Split dataset acording to the bins
bins_df = pd.cut(X_train['age'], bins=bins)

# Create groups with bins
data_set_size = X_train.groupby(bins_df).size().reset_index(name='count')

print("Bins and sizes:")
print(data_set_size)

Bins and sizes:
                age   count
0    (20.0, 23.326]   49345
1  (23.326, 38.836]  107937
2  (38.836, 54.345]   89202
3    (54.345, 85.0]   53434


In [None]:
# Create bins for 'age'
age_bin_ranges = [X_train['age'].min(),
                  X_train['age'].mean() - X_train['age'].std(),
                  X_train['age'].mean(),
                  X_train['age'].mean() + X_train['age'].std(),
                  X_train['age'].max()]
X_train['age'] = pd.cut(X_train['age'],
                        bins=age_bin_ranges,
                        include_lowest=True,
                        duplicates='drop').cat.codes
X_train['age'] = scaler.fit_transform(X_train['age'].values.reshape(-1, 1))
#######
X_test['age'] = pd.cut(X_test['age'],
                        bins=age_bin_ranges,
                        include_lowest=True,
                        duplicates='drop').cat.codes
X_test['age'] = scaler.fit_transform(X_test['age'].values.reshape(-1, 1))
#######
z['age'] = pd.cut(z['age'],
                        bins=age_bin_ranges,
                        include_lowest=True,
                        duplicates='drop').cat.codes
z['age'] = scaler.fit_transform(z['age'].values.reshape(-1, 1))

z['age'].head(10)

0    0.0
1    0.5
2    0.0
3    0.5
4    0.0
5    0.5
6    0.5
7    0.0
8    0.0
9    1.0
Name: age, dtype: float64

In [None]:
# Create bins for 'policy_sales_channel'
bins = [X_train['policy_sales_channel'].min(),
        X_train['policy_sales_channel'].median() - X_train['policy_sales_channel'].std(),
        X_train['policy_sales_channel'].median(),
        #X_train['policy_sales_channel'].median() + X_train['policy_sales_channel'].std(),
        X_train['policy_sales_channel'].max()]

# Split dataset acording to the bins
bins_df = pd.cut(X_train['policy_sales_channel'], bins=bins)

# Create groups with bins
data_set_size = X_train.groupby(bins_df).size().reset_index(name='count')

print("Bins and sizes:")
print(data_set_size)

Bins and sizes:
  policy_sales_channel   count
0         (1.0, 76.77]   81539
1       (76.77, 131.0]   70154
2       (131.0, 163.0]  152352


In [None]:
z['policy_sales_channel'].astype(int).unique()

array([ 88,  45,  12,  90,  33,  22,  76,  53,  19, 102])

In [None]:
# Create bins for 'policy_sales_channel'
policy_sales_channel_bin_ranges = [X_train['policy_sales_channel'].min(),
                                   X_train['policy_sales_channel'].median() - X_train['policy_sales_channel'].std(),
                                   X_train['policy_sales_channel'].median(),
                                   X_train['policy_sales_channel'].max()]
X_train['policy_sales_channel'] = pd.cut(X_train['policy_sales_channel'],
                                         bins=policy_sales_channel_bin_ranges,
                                         include_lowest=True,
                                         duplicates='drop').cat.codes
X_train['policy_sales_channel'] = scaler.fit_transform(X_train['policy_sales_channel'].values.reshape(-1, 1))
#######
X_test['policy_sales_channel'] = pd.cut(X_test['policy_sales_channel'],
                                        bins=policy_sales_channel_bin_ranges,
                                        include_lowest=True,
                                        duplicates='drop').cat.codes
X_test['policy_sales_channel'] = scaler.fit_transform(X_test['policy_sales_channel'].values.reshape(-1, 1))
#######
z['policy_sales_channel'] = z['policy_sales_channel'].astype(int)
z['policy_sales_channel'] = pd.cut(z['policy_sales_channel'],
                                        bins=policy_sales_channel_bin_ranges,
                                        include_lowest=True,
                                        duplicates='drop').cat.codes
z['policy_sales_channel'] = scaler.fit_transform(z['policy_sales_channel'].values.reshape(-1, 1))



In [None]:
X_train.head()

Unnamed: 0,id,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,is_female,is_damage
332803,332804,0.666667,1,15,0,0.5,0.093422,0.0,0.750865,1,1
116248,116249,0.333333,1,11,0,0.5,0.037986,0.0,0.065744,0,1
255005,255006,0.0,1,30,1,0.0,0.07937,1.0,0.539792,0,0
317474,317475,0.0,1,41,1,0.0,0.049259,1.0,0.923875,1,0
344212,344213,1.0,1,48,0,1.0,0.0,1.0,0.50173,0,1


In [None]:
z['region_code'] = z['region_code'].astype(int)

In [None]:
#region_code
X_train, X_test, z = replace_cat_info_to_count(X_train, X_test, z, 'region_code')
z.head()

Unnamed: 0,id,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,is_female,is_damage
0,191150,0.0,0,1,0,1.0,0.24,1.0,0.140845,0,0
1,76655,0.5,1,1,1,1.0,0.84,0.0,0.676056,0,1
2,98173,0.0,1,1,0,0.8125,0.12,0.0,0.056338,0,0
3,906632,0.5,1,1,0,1.0,0.44,1.0,0.0,0,0
4,803321,0.0,1,1,0,0.8125,0.32,0.0,1.0,0,1


In [None]:
z.region_code.unique()

array([1])

In [None]:
#policy_sales_channel
X_train, X_test, z = replace_cat_info_to_count(X_train, X_test, z, 'policy_sales_channel')
X_train.head()

Unnamed: 0,id,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,is_female,is_damage
332803,332804,0.666667,1,0.12355,0,0.5,0.093422,0.147158,0.750865,1,1
116248,116249,0.333333,1,0.085181,0,0.5,0.037986,0.147158,0.065744,0,1
255005,255006,0.0,1,0.113036,1,0.0,0.07937,1.0,0.539792,0,0
317474,317475,0.0,1,0.170194,1,0.0,0.049259,1.0,0.923875,1,0
344212,344213,1.0,1,0.042341,0,1.0,0.0,1.0,0.50173,0,1


In [None]:
X_train.drop(columns=['id'], inplace=True)
X_test.drop(columns=['id'], inplace=True)


In [None]:
z.drop(columns=['id'], inplace=True)

In [None]:
z.head()

Unnamed: 0,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,is_female,is_damage
0,0.0,0,1,0,1.0,0.24,190657,0.140845,0,0
1,0.5,1,1,1,1.0,0.84,102810,0.676056,0,1
2,0.0,1,1,0,0.8125,0.12,102810,0.056338,0,0
3,0.5,1,1,0,1.0,0.44,190657,0.0,0,0
4,0.0,1,1,0,0.8125,0.32,102810,1.0,0,1


In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape, z.shape

((304887, 10), (76222, 10), (304887,), (76222,), (10, 10))

##XGB Regressor

In [None]:
# Define XGBRegressor

%%time
xgb = XGBRegressor(enable_categorical=True)

# Define parameter grid for grid search
param_grid = { 'gamma' : [0.3, 0.1], 'learning_rate' : [0.01, 0.015],
              'max_depth' : [2, 5],
              'n_estimators' : [150, 250],
              'nthread' : [-1], 'reg_alpha' : [1], 'reg_lambda' : [1], 'seed' : [10]
              }

# Perform grid search with time series cross-validation
#tscv = TimeSeriesSplit(n_splits=-1)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, verbose=True)
grid_search.fit(X_train, y_train, eval_set=[(X_train, y_train)], early_stopping_rounds=50)


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[0]	validation_0-rmse:0.32679
[1]	validation_0-rmse:0.32635




[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32547
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32463
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32304
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31742
[27]	validation_0-rmse:0.31714
[28]	validation_0-rmse:0.31686
[29]	validation_0-rmse:0.31659
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31606
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31555
[34]	validation_



[2]	validation_0-rmse:0.32590
[3]	validation_0-rmse:0.32546
[4]	validation_0-rmse:0.32504
[5]	validation_0-rmse:0.32462
[6]	validation_0-rmse:0.32421
[7]	validation_0-rmse:0.32381
[8]	validation_0-rmse:0.32341
[9]	validation_0-rmse:0.32302
[10]	validation_0-rmse:0.32264
[11]	validation_0-rmse:0.32226
[12]	validation_0-rmse:0.32189
[13]	validation_0-rmse:0.32153
[14]	validation_0-rmse:0.32117
[15]	validation_0-rmse:0.32082
[16]	validation_0-rmse:0.32048
[17]	validation_0-rmse:0.32014
[18]	validation_0-rmse:0.31981
[19]	validation_0-rmse:0.31948
[20]	validation_0-rmse:0.31916
[21]	validation_0-rmse:0.31885
[22]	validation_0-rmse:0.31854
[23]	validation_0-rmse:0.31824
[24]	validation_0-rmse:0.31794
[25]	validation_0-rmse:0.31765
[26]	validation_0-rmse:0.31736
[27]	validation_0-rmse:0.31708
[28]	validation_0-rmse:0.31680
[29]	validation_0-rmse:0.31653
[30]	validation_0-rmse:0.31626
[31]	validation_0-rmse:0.31600
[32]	validation_0-rmse:0.31574
[33]	validation_0-rmse:0.31549
[34]	validation_



[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32548
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32464
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32305
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31741
[27]	validation_0-rmse:0.31713
[28]	validation_0-rmse:0.31685
[29]	validation_0-rmse:0.31658
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31605
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31554
[34]	validation_



[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32547
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32463
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32304
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31742
[27]	validation_0-rmse:0.31714
[28]	validation_0-rmse:0.31686
[29]	validation_0-rmse:0.31659
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31606
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31555
[34]	validation_



[2]	validation_0-rmse:0.32590
[3]	validation_0-rmse:0.32546
[4]	validation_0-rmse:0.32504
[5]	validation_0-rmse:0.32462
[6]	validation_0-rmse:0.32421
[7]	validation_0-rmse:0.32381
[8]	validation_0-rmse:0.32341
[9]	validation_0-rmse:0.32302
[10]	validation_0-rmse:0.32264
[11]	validation_0-rmse:0.32226
[12]	validation_0-rmse:0.32189
[13]	validation_0-rmse:0.32153
[14]	validation_0-rmse:0.32117
[15]	validation_0-rmse:0.32082
[16]	validation_0-rmse:0.32048
[17]	validation_0-rmse:0.32014
[18]	validation_0-rmse:0.31981
[19]	validation_0-rmse:0.31948
[20]	validation_0-rmse:0.31916
[21]	validation_0-rmse:0.31885
[22]	validation_0-rmse:0.31854
[23]	validation_0-rmse:0.31824
[24]	validation_0-rmse:0.31794
[25]	validation_0-rmse:0.31765
[26]	validation_0-rmse:0.31736
[27]	validation_0-rmse:0.31708
[28]	validation_0-rmse:0.31680
[29]	validation_0-rmse:0.31653
[30]	validation_0-rmse:0.31626
[31]	validation_0-rmse:0.31600
[32]	validation_0-rmse:0.31574
[33]	validation_0-rmse:0.31549
[34]	validation_



[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32548
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32464
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32305
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31741
[27]	validation_0-rmse:0.31713
[28]	validation_0-rmse:0.31685
[29]	validation_0-rmse:0.31658
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31605
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31554
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32514
[4]	validation_0-rmse:0.32463
[5]	validation_0-rmse:0.32414
[6]	validation_0-rmse:0.32365
[7]	validation_0-rmse:0.32317
[8]	validation_0-rmse:0.32270
[9]	validation_0-rmse:0.32224
[10]	validation_0-rmse:0.32179
[11]	validation_0-rmse:0.32135
[12]	validation_0-rmse:0.32091
[13]	validation_0-rmse:0.32048
[14]	validation_0-rmse:0.32007
[15]	validation_0-rmse:0.31965
[16]	validation_0-rmse:0.31925
[17]	validation_0-rmse:0.31885
[18]	validation_0-rmse:0.31846
[19]	validation_0-rmse:0.31808
[20]	validation_0-rmse:0.31771
[21]	validation_0-rmse:0.31734
[22]	validation_0-rmse:0.31698
[23]	validation_0-rmse:0.31662
[24]	validation_0-rmse:0.31628
[25]	validation_0-rmse:0.31593
[26]	validation_0-rmse:0.31560
[27]	validation_0-rmse:0.31527
[28]	validation_0-rmse:0.31495
[29]	validation_0-rmse:0.31463
[30]	validation_0-rmse:0.31432
[31]	validation_0-rmse:0.31402
[32]	validation_0-rmse:0.31372
[33]	validation_0-rmse:0.31341
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32513
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32413
[6]	validation_0-rmse:0.32364
[7]	validation_0-rmse:0.32316
[8]	validation_0-rmse:0.32269
[9]	validation_0-rmse:0.32223
[10]	validation_0-rmse:0.32178
[11]	validation_0-rmse:0.32133
[12]	validation_0-rmse:0.32089
[13]	validation_0-rmse:0.32047
[14]	validation_0-rmse:0.32005
[15]	validation_0-rmse:0.31962
[16]	validation_0-rmse:0.31919
[17]	validation_0-rmse:0.31880
[18]	validation_0-rmse:0.31840
[19]	validation_0-rmse:0.31801
[20]	validation_0-rmse:0.31761
[21]	validation_0-rmse:0.31723
[22]	validation_0-rmse:0.31685
[23]	validation_0-rmse:0.31650
[24]	validation_0-rmse:0.31615
[25]	validation_0-rmse:0.31579
[26]	validation_0-rmse:0.31544
[27]	validation_0-rmse:0.31509
[28]	validation_0-rmse:0.31476
[29]	validation_0-rmse:0.31444
[30]	validation_0-rmse:0.31413
[31]	validation_0-rmse:0.31381
[32]	validation_0-rmse:0.31351
[33]	validation_0-rmse:0.31321
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32512
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32412
[6]	validation_0-rmse:0.32362
[7]	validation_0-rmse:0.32313
[8]	validation_0-rmse:0.32264
[9]	validation_0-rmse:0.32217
[10]	validation_0-rmse:0.32171
[11]	validation_0-rmse:0.32127
[12]	validation_0-rmse:0.32082
[13]	validation_0-rmse:0.32038
[14]	validation_0-rmse:0.31995
[15]	validation_0-rmse:0.31952
[16]	validation_0-rmse:0.31912
[17]	validation_0-rmse:0.31872
[18]	validation_0-rmse:0.31832
[19]	validation_0-rmse:0.31792
[20]	validation_0-rmse:0.31753
[21]	validation_0-rmse:0.31716
[22]	validation_0-rmse:0.31679
[23]	validation_0-rmse:0.31642
[24]	validation_0-rmse:0.31606
[25]	validation_0-rmse:0.31572
[26]	validation_0-rmse:0.31537
[27]	validation_0-rmse:0.31503
[28]	validation_0-rmse:0.31470
[29]	validation_0-rmse:0.31438
[30]	validation_0-rmse:0.31406
[31]	validation_0-rmse:0.31375
[32]	validation_0-rmse:0.31345
[33]	validation_0-rmse:0.31315
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32514
[4]	validation_0-rmse:0.32463
[5]	validation_0-rmse:0.32414
[6]	validation_0-rmse:0.32365
[7]	validation_0-rmse:0.32317
[8]	validation_0-rmse:0.32270
[9]	validation_0-rmse:0.32224
[10]	validation_0-rmse:0.32179
[11]	validation_0-rmse:0.32135
[12]	validation_0-rmse:0.32091
[13]	validation_0-rmse:0.32048
[14]	validation_0-rmse:0.32007
[15]	validation_0-rmse:0.31965
[16]	validation_0-rmse:0.31925
[17]	validation_0-rmse:0.31885
[18]	validation_0-rmse:0.31846
[19]	validation_0-rmse:0.31808
[20]	validation_0-rmse:0.31771
[21]	validation_0-rmse:0.31734
[22]	validation_0-rmse:0.31698
[23]	validation_0-rmse:0.31662
[24]	validation_0-rmse:0.31628
[25]	validation_0-rmse:0.31593
[26]	validation_0-rmse:0.31560
[27]	validation_0-rmse:0.31527
[28]	validation_0-rmse:0.31495
[29]	validation_0-rmse:0.31463
[30]	validation_0-rmse:0.31432
[31]	validation_0-rmse:0.31402
[32]	validation_0-rmse:0.31372
[33]	validation_0-rmse:0.31341
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32513
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32413
[6]	validation_0-rmse:0.32364
[7]	validation_0-rmse:0.32316
[8]	validation_0-rmse:0.32269
[9]	validation_0-rmse:0.32223
[10]	validation_0-rmse:0.32178
[11]	validation_0-rmse:0.32133
[12]	validation_0-rmse:0.32089
[13]	validation_0-rmse:0.32047
[14]	validation_0-rmse:0.32005
[15]	validation_0-rmse:0.31962
[16]	validation_0-rmse:0.31919
[17]	validation_0-rmse:0.31880
[18]	validation_0-rmse:0.31840
[19]	validation_0-rmse:0.31801
[20]	validation_0-rmse:0.31761
[21]	validation_0-rmse:0.31723
[22]	validation_0-rmse:0.31685
[23]	validation_0-rmse:0.31650
[24]	validation_0-rmse:0.31615
[25]	validation_0-rmse:0.31579
[26]	validation_0-rmse:0.31544
[27]	validation_0-rmse:0.31509
[28]	validation_0-rmse:0.31476
[29]	validation_0-rmse:0.31444
[30]	validation_0-rmse:0.31413
[31]	validation_0-rmse:0.31381
[32]	validation_0-rmse:0.31351
[33]	validation_0-rmse:0.31321
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32512
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32412
[6]	validation_0-rmse:0.32362
[7]	validation_0-rmse:0.32313
[8]	validation_0-rmse:0.32264
[9]	validation_0-rmse:0.32217
[10]	validation_0-rmse:0.32171
[11]	validation_0-rmse:0.32127
[12]	validation_0-rmse:0.32082
[13]	validation_0-rmse:0.32038
[14]	validation_0-rmse:0.31995
[15]	validation_0-rmse:0.31952
[16]	validation_0-rmse:0.31912
[17]	validation_0-rmse:0.31872
[18]	validation_0-rmse:0.31832
[19]	validation_0-rmse:0.31792
[20]	validation_0-rmse:0.31753
[21]	validation_0-rmse:0.31716
[22]	validation_0-rmse:0.31679
[23]	validation_0-rmse:0.31642
[24]	validation_0-rmse:0.31606
[25]	validation_0-rmse:0.31572
[26]	validation_0-rmse:0.31537
[27]	validation_0-rmse:0.31503
[28]	validation_0-rmse:0.31470
[29]	validation_0-rmse:0.31438
[30]	validation_0-rmse:0.31406
[31]	validation_0-rmse:0.31375
[32]	validation_0-rmse:0.31345
[33]	validation_0-rmse:0.31315
[34]	validation_



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32120
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31968
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31873
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31783
[17]	validation_0-rmse:0.31740
[18]	validation_0-rmse:0.31698
[19]	validation_0-rmse:0.31657
[20]	validation_0-rmse:0.31617
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31504
[24]	validation_0-rmse:0.31468
[25]	validation_0-rmse:0.31433
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31334
[29]	validation_0-rmse:0.31303
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31243
[32]	validation_0-rmse:0.31214
[33]	validation_0-rmse:0.31186
[34]	validation_



[2]	validation_0-rmse:0.32525
[3]	validation_0-rmse:0.32461
[4]	validation_0-rmse:0.32400
[5]	validation_0-rmse:0.32340
[6]	validation_0-rmse:0.32282
[7]	validation_0-rmse:0.32225
[8]	validation_0-rmse:0.32170
[9]	validation_0-rmse:0.32116
[10]	validation_0-rmse:0.32064
[11]	validation_0-rmse:0.32013
[12]	validation_0-rmse:0.31963
[13]	validation_0-rmse:0.31915
[14]	validation_0-rmse:0.31868
[15]	validation_0-rmse:0.31822
[16]	validation_0-rmse:0.31777
[17]	validation_0-rmse:0.31734
[18]	validation_0-rmse:0.31692
[19]	validation_0-rmse:0.31651
[20]	validation_0-rmse:0.31611
[21]	validation_0-rmse:0.31572
[22]	validation_0-rmse:0.31534
[23]	validation_0-rmse:0.31497
[24]	validation_0-rmse:0.31461
[25]	validation_0-rmse:0.31426
[26]	validation_0-rmse:0.31392
[27]	validation_0-rmse:0.31359
[28]	validation_0-rmse:0.31327
[29]	validation_0-rmse:0.31296
[30]	validation_0-rmse:0.31265
[31]	validation_0-rmse:0.31235
[32]	validation_0-rmse:0.31206
[33]	validation_0-rmse:0.31178
[34]	validation_



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32119
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31967
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31872
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31782
[17]	validation_0-rmse:0.31739
[18]	validation_0-rmse:0.31697
[19]	validation_0-rmse:0.31656
[20]	validation_0-rmse:0.31616
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31503
[24]	validation_0-rmse:0.31467
[25]	validation_0-rmse:0.31432
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31333
[29]	validation_0-rmse:0.31302
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31242
[32]	validation_0-rmse:0.31213
[33]	validation_0-rmse:0.31185
[34]	validation_



[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32120
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31968
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31873
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31783
[17]	validation_0-rmse:0.31740
[18]	validation_0-rmse:0.31698
[19]	validation_0-rmse:0.31657
[20]	validation_0-rmse:0.31617
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31504
[24]	validation_0-rmse:0.31468
[25]	validation_0-rmse:0.31433
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31334
[29]	validation_0-rmse:0.31303
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31243
[32]	validation_0-rmse:0.31214
[33]	validation_0-rmse:0.31186
[34]	validation_0-rmse:0.31158
[35]	validation



[2]	validation_0-rmse:0.32525
[3]	validation_0-rmse:0.32461
[4]	validation_0-rmse:0.32400
[5]	validation_0-rmse:0.32340
[6]	validation_0-rmse:0.32282
[7]	validation_0-rmse:0.32225
[8]	validation_0-rmse:0.32170
[9]	validation_0-rmse:0.32116
[10]	validation_0-rmse:0.32064
[11]	validation_0-rmse:0.32013
[12]	validation_0-rmse:0.31963
[13]	validation_0-rmse:0.31915
[14]	validation_0-rmse:0.31868
[15]	validation_0-rmse:0.31822
[16]	validation_0-rmse:0.31777
[17]	validation_0-rmse:0.31734
[18]	validation_0-rmse:0.31692
[19]	validation_0-rmse:0.31651
[20]	validation_0-rmse:0.31611
[21]	validation_0-rmse:0.31572
[22]	validation_0-rmse:0.31534
[23]	validation_0-rmse:0.31497
[24]	validation_0-rmse:0.31461
[25]	validation_0-rmse:0.31426
[26]	validation_0-rmse:0.31392
[27]	validation_0-rmse:0.31359
[28]	validation_0-rmse:0.31327
[29]	validation_0-rmse:0.31296
[30]	validation_0-rmse:0.31265
[31]	validation_0-rmse:0.31235
[32]	validation_0-rmse:0.31206
[33]	validation_0-rmse:0.31178
[34]	validation_



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32119
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31967
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31872
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31782
[17]	validation_0-rmse:0.31739
[18]	validation_0-rmse:0.31697
[19]	validation_0-rmse:0.31656
[20]	validation_0-rmse:0.31616
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31503
[24]	validation_0-rmse:0.31467
[25]	validation_0-rmse:0.31432
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31333
[29]	validation_0-rmse:0.31302
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31242
[32]	validation_0-rmse:0.31213
[33]	validation_0-rmse:0.31185
[34]	validation_



[1]	validation_0-rmse:0.32565
[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32413
[4]	validation_0-rmse:0.32340
[5]	validation_0-rmse:0.32269
[6]	validation_0-rmse:0.32200
[7]	validation_0-rmse:0.32133
[8]	validation_0-rmse:0.32068
[9]	validation_0-rmse:0.32005
[10]	validation_0-rmse:0.31943
[11]	validation_0-rmse:0.31884
[12]	validation_0-rmse:0.31825
[13]	validation_0-rmse:0.31769
[14]	validation_0-rmse:0.31714
[15]	validation_0-rmse:0.31660
[16]	validation_0-rmse:0.31608
[17]	validation_0-rmse:0.31558
[18]	validation_0-rmse:0.31509
[19]	validation_0-rmse:0.31461
[20]	validation_0-rmse:0.31414
[21]	validation_0-rmse:0.31369
[22]	validation_0-rmse:0.31323
[23]	validation_0-rmse:0.31280
[24]	validation_0-rmse:0.31239
[25]	validation_0-rmse:0.31196
[26]	validation_0-rmse:0.31157
[27]	validation_0-rmse:0.31118
[28]	validation_0-rmse:0.31079
[29]	validation_0-rmse:0.31041
[30]	validation_0-rmse:0.31006
[31]	validation_0-rmse:0.30972
[32]	validation_0-rmse:0.30936
[33]	validation_0



[1]	validation_0-rmse:0.32564
[2]	validation_0-rmse:0.32487
[3]	validation_0-rmse:0.32412
[4]	validation_0-rmse:0.32339
[5]	validation_0-rmse:0.32268
[6]	validation_0-rmse:0.32199
[7]	validation_0-rmse:0.32132
[8]	validation_0-rmse:0.32066
[9]	validation_0-rmse:0.32003
[10]	validation_0-rmse:0.31941
[11]	validation_0-rmse:0.31879
[12]	validation_0-rmse:0.31818
[13]	validation_0-rmse:0.31759
[14]	validation_0-rmse:0.31703
[15]	validation_0-rmse:0.31650
[16]	validation_0-rmse:0.31595
[17]	validation_0-rmse:0.31542
[18]	validation_0-rmse:0.31491
[19]	validation_0-rmse:0.31443
[20]	validation_0-rmse:0.31396
[21]	validation_0-rmse:0.31349
[22]	validation_0-rmse:0.31305
[23]	validation_0-rmse:0.31262
[24]	validation_0-rmse:0.31219
[25]	validation_0-rmse:0.31178
[26]	validation_0-rmse:0.31137
[27]	validation_0-rmse:0.31097
[28]	validation_0-rmse:0.31060
[29]	validation_0-rmse:0.31024
[30]	validation_0-rmse:0.30987
[31]	validation_0-rmse:0.30953
[32]	validation_0-rmse:0.30920
[33]	validation_0



[1]	validation_0-rmse:0.32565
[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32411
[4]	validation_0-rmse:0.32335
[5]	validation_0-rmse:0.32262
[6]	validation_0-rmse:0.32191
[7]	validation_0-rmse:0.32124
[8]	validation_0-rmse:0.32059
[9]	validation_0-rmse:0.31994
[10]	validation_0-rmse:0.31930
[11]	validation_0-rmse:0.31868
[12]	validation_0-rmse:0.31810
[13]	validation_0-rmse:0.31753
[14]	validation_0-rmse:0.31696
[15]	validation_0-rmse:0.31641
[16]	validation_0-rmse:0.31587
[17]	validation_0-rmse:0.31536
[18]	validation_0-rmse:0.31485
[19]	validation_0-rmse:0.31437
[20]	validation_0-rmse:0.31389
[21]	validation_0-rmse:0.31344
[22]	validation_0-rmse:0.31300
[23]	validation_0-rmse:0.31255
[24]	validation_0-rmse:0.31214
[25]	validation_0-rmse:0.31172
[26]	validation_0-rmse:0.31133
[27]	validation_0-rmse:0.31093
[28]	validation_0-rmse:0.31056
[29]	validation_0-rmse:0.31018
[30]	validation_0-rmse:0.30983
[31]	validation_0-rmse:0.30948
[32]	validation_0-rmse:0.30915
[33]	validation_0



[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32413
[4]	validation_0-rmse:0.32340
[5]	validation_0-rmse:0.32269
[6]	validation_0-rmse:0.32200
[7]	validation_0-rmse:0.32133
[8]	validation_0-rmse:0.32068
[9]	validation_0-rmse:0.32005
[10]	validation_0-rmse:0.31943
[11]	validation_0-rmse:0.31884
[12]	validation_0-rmse:0.31825
[13]	validation_0-rmse:0.31769
[14]	validation_0-rmse:0.31714
[15]	validation_0-rmse:0.31660
[16]	validation_0-rmse:0.31608
[17]	validation_0-rmse:0.31558
[18]	validation_0-rmse:0.31509
[19]	validation_0-rmse:0.31461
[20]	validation_0-rmse:0.31414
[21]	validation_0-rmse:0.31369
[22]	validation_0-rmse:0.31323
[23]	validation_0-rmse:0.31280
[24]	validation_0-rmse:0.31239
[25]	validation_0-rmse:0.31196
[26]	validation_0-rmse:0.31157
[27]	validation_0-rmse:0.31118
[28]	validation_0-rmse:0.31079
[29]	validation_0-rmse:0.31041
[30]	validation_0-rmse:0.31006
[31]	validation_0-rmse:0.30972
[32]	validation_0-rmse:0.30936
[33]	validation_0-rmse:0.30904
[34]	validation_



[2]	validation_0-rmse:0.32487
[3]	validation_0-rmse:0.32412
[4]	validation_0-rmse:0.32339
[5]	validation_0-rmse:0.32268
[6]	validation_0-rmse:0.32199
[7]	validation_0-rmse:0.32132
[8]	validation_0-rmse:0.32066
[9]	validation_0-rmse:0.32003
[10]	validation_0-rmse:0.31941
[11]	validation_0-rmse:0.31879
[12]	validation_0-rmse:0.31818
[13]	validation_0-rmse:0.31759
[14]	validation_0-rmse:0.31703
[15]	validation_0-rmse:0.31650
[16]	validation_0-rmse:0.31595
[17]	validation_0-rmse:0.31542
[18]	validation_0-rmse:0.31491
[19]	validation_0-rmse:0.31443
[20]	validation_0-rmse:0.31396
[21]	validation_0-rmse:0.31349
[22]	validation_0-rmse:0.31305
[23]	validation_0-rmse:0.31262
[24]	validation_0-rmse:0.31219
[25]	validation_0-rmse:0.31178
[26]	validation_0-rmse:0.31137
[27]	validation_0-rmse:0.31097
[28]	validation_0-rmse:0.31060
[29]	validation_0-rmse:0.31024
[30]	validation_0-rmse:0.30987
[31]	validation_0-rmse:0.30953
[32]	validation_0-rmse:0.30920
[33]	validation_0-rmse:0.30886
[34]	validation_



[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32411
[4]	validation_0-rmse:0.32335
[5]	validation_0-rmse:0.32262
[6]	validation_0-rmse:0.32191
[7]	validation_0-rmse:0.32124
[8]	validation_0-rmse:0.32059
[9]	validation_0-rmse:0.31994
[10]	validation_0-rmse:0.31930
[11]	validation_0-rmse:0.31868
[12]	validation_0-rmse:0.31810
[13]	validation_0-rmse:0.31753
[14]	validation_0-rmse:0.31696
[15]	validation_0-rmse:0.31641
[16]	validation_0-rmse:0.31587
[17]	validation_0-rmse:0.31536
[18]	validation_0-rmse:0.31485
[19]	validation_0-rmse:0.31437
[20]	validation_0-rmse:0.31389
[21]	validation_0-rmse:0.31344
[22]	validation_0-rmse:0.31300
[23]	validation_0-rmse:0.31255
[24]	validation_0-rmse:0.31214
[25]	validation_0-rmse:0.31172
[26]	validation_0-rmse:0.31133
[27]	validation_0-rmse:0.31093
[28]	validation_0-rmse:0.31056
[29]	validation_0-rmse:0.31018
[30]	validation_0-rmse:0.30983
[31]	validation_0-rmse:0.30948
[32]	validation_0-rmse:0.30915
[33]	validation_0-rmse:0.30882
[34]	validation_



[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32547
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32463
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32304
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31742
[27]	validation_0-rmse:0.31714
[28]	validation_0-rmse:0.31686
[29]	validation_0-rmse:0.31659
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31606
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31555
[34]	validation_



[2]	validation_0-rmse:0.32590
[3]	validation_0-rmse:0.32546
[4]	validation_0-rmse:0.32504
[5]	validation_0-rmse:0.32462
[6]	validation_0-rmse:0.32421
[7]	validation_0-rmse:0.32381
[8]	validation_0-rmse:0.32341
[9]	validation_0-rmse:0.32302
[10]	validation_0-rmse:0.32264
[11]	validation_0-rmse:0.32226
[12]	validation_0-rmse:0.32189
[13]	validation_0-rmse:0.32153
[14]	validation_0-rmse:0.32117
[15]	validation_0-rmse:0.32082
[16]	validation_0-rmse:0.32048
[17]	validation_0-rmse:0.32014
[18]	validation_0-rmse:0.31981
[19]	validation_0-rmse:0.31948
[20]	validation_0-rmse:0.31916
[21]	validation_0-rmse:0.31885
[22]	validation_0-rmse:0.31854
[23]	validation_0-rmse:0.31824
[24]	validation_0-rmse:0.31794
[25]	validation_0-rmse:0.31765
[26]	validation_0-rmse:0.31736
[27]	validation_0-rmse:0.31708
[28]	validation_0-rmse:0.31680
[29]	validation_0-rmse:0.31653
[30]	validation_0-rmse:0.31626
[31]	validation_0-rmse:0.31600
[32]	validation_0-rmse:0.31574
[33]	validation_0-rmse:0.31549
[34]	validation_



[3]	validation_0-rmse:0.32548
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32464
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32305
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31741
[27]	validation_0-rmse:0.31713
[28]	validation_0-rmse:0.31685
[29]	validation_0-rmse:0.31658
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31605
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31554
[34]	validation_0-rmse:0.31530
[35]	validation



[1]	validation_0-rmse:0.32635
[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32547
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32463
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32304
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31742
[27]	validation_0-rmse:0.31714
[28]	validation_0-rmse:0.31686
[29]	validation_0-rmse:0.31659
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31606
[32]	validation_0-rmse:0.31580
[33]	validation_0



[2]	validation_0-rmse:0.32590
[3]	validation_0-rmse:0.32546
[4]	validation_0-rmse:0.32504
[5]	validation_0-rmse:0.32462
[6]	validation_0-rmse:0.32421
[7]	validation_0-rmse:0.32381
[8]	validation_0-rmse:0.32341
[9]	validation_0-rmse:0.32302
[10]	validation_0-rmse:0.32264
[11]	validation_0-rmse:0.32226
[12]	validation_0-rmse:0.32189
[13]	validation_0-rmse:0.32153
[14]	validation_0-rmse:0.32117
[15]	validation_0-rmse:0.32082
[16]	validation_0-rmse:0.32048
[17]	validation_0-rmse:0.32014
[18]	validation_0-rmse:0.31981
[19]	validation_0-rmse:0.31948
[20]	validation_0-rmse:0.31916
[21]	validation_0-rmse:0.31885
[22]	validation_0-rmse:0.31854
[23]	validation_0-rmse:0.31824
[24]	validation_0-rmse:0.31794
[25]	validation_0-rmse:0.31765
[26]	validation_0-rmse:0.31736
[27]	validation_0-rmse:0.31708
[28]	validation_0-rmse:0.31680
[29]	validation_0-rmse:0.31653
[30]	validation_0-rmse:0.31626
[31]	validation_0-rmse:0.31600
[32]	validation_0-rmse:0.31574
[33]	validation_0-rmse:0.31549
[34]	validation_



[2]	validation_0-rmse:0.32591
[3]	validation_0-rmse:0.32548
[4]	validation_0-rmse:0.32505
[5]	validation_0-rmse:0.32464
[6]	validation_0-rmse:0.32423
[7]	validation_0-rmse:0.32383
[8]	validation_0-rmse:0.32343
[9]	validation_0-rmse:0.32305
[10]	validation_0-rmse:0.32266
[11]	validation_0-rmse:0.32229
[12]	validation_0-rmse:0.32192
[13]	validation_0-rmse:0.32156
[14]	validation_0-rmse:0.32121
[15]	validation_0-rmse:0.32086
[16]	validation_0-rmse:0.32052
[17]	validation_0-rmse:0.32018
[18]	validation_0-rmse:0.31985
[19]	validation_0-rmse:0.31953
[20]	validation_0-rmse:0.31921
[21]	validation_0-rmse:0.31890
[22]	validation_0-rmse:0.31859
[23]	validation_0-rmse:0.31829
[24]	validation_0-rmse:0.31799
[25]	validation_0-rmse:0.31770
[26]	validation_0-rmse:0.31741
[27]	validation_0-rmse:0.31713
[28]	validation_0-rmse:0.31685
[29]	validation_0-rmse:0.31658
[30]	validation_0-rmse:0.31632
[31]	validation_0-rmse:0.31605
[32]	validation_0-rmse:0.31580
[33]	validation_0-rmse:0.31554
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32514
[4]	validation_0-rmse:0.32463
[5]	validation_0-rmse:0.32414
[6]	validation_0-rmse:0.32365
[7]	validation_0-rmse:0.32317
[8]	validation_0-rmse:0.32270
[9]	validation_0-rmse:0.32224
[10]	validation_0-rmse:0.32179
[11]	validation_0-rmse:0.32135
[12]	validation_0-rmse:0.32091
[13]	validation_0-rmse:0.32048
[14]	validation_0-rmse:0.32006
[15]	validation_0-rmse:0.31965
[16]	validation_0-rmse:0.31925
[17]	validation_0-rmse:0.31885
[18]	validation_0-rmse:0.31846
[19]	validation_0-rmse:0.31808
[20]	validation_0-rmse:0.31771
[21]	validation_0-rmse:0.31734
[22]	validation_0-rmse:0.31698
[23]	validation_0-rmse:0.31662
[24]	validation_0-rmse:0.31628
[25]	validation_0-rmse:0.31593
[26]	validation_0-rmse:0.31560
[27]	validation_0-rmse:0.31527
[28]	validation_0-rmse:0.31495
[29]	validation_0-rmse:0.31463
[30]	validation_0-rmse:0.31432
[31]	validation_0-rmse:0.31402
[32]	validation_0-rmse:0.31372
[33]	validation_0-rmse:0.31340
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32513
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32413
[6]	validation_0-rmse:0.32364
[7]	validation_0-rmse:0.32316
[8]	validation_0-rmse:0.32269
[9]	validation_0-rmse:0.32223
[10]	validation_0-rmse:0.32178
[11]	validation_0-rmse:0.32133
[12]	validation_0-rmse:0.32089
[13]	validation_0-rmse:0.32047
[14]	validation_0-rmse:0.32004
[15]	validation_0-rmse:0.31961
[16]	validation_0-rmse:0.31919
[17]	validation_0-rmse:0.31879
[18]	validation_0-rmse:0.31840
[19]	validation_0-rmse:0.31800
[20]	validation_0-rmse:0.31761
[21]	validation_0-rmse:0.31723
[22]	validation_0-rmse:0.31685
[23]	validation_0-rmse:0.31649
[24]	validation_0-rmse:0.31615
[25]	validation_0-rmse:0.31579
[26]	validation_0-rmse:0.31544
[27]	validation_0-rmse:0.31509
[28]	validation_0-rmse:0.31476
[29]	validation_0-rmse:0.31444
[30]	validation_0-rmse:0.31413
[31]	validation_0-rmse:0.31381
[32]	validation_0-rmse:0.31351
[33]	validation_0-rmse:0.31321
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32512
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32412
[6]	validation_0-rmse:0.32362
[7]	validation_0-rmse:0.32312
[8]	validation_0-rmse:0.32264
[9]	validation_0-rmse:0.32216
[10]	validation_0-rmse:0.32171
[11]	validation_0-rmse:0.32127
[12]	validation_0-rmse:0.32082
[13]	validation_0-rmse:0.32038
[14]	validation_0-rmse:0.31994
[15]	validation_0-rmse:0.31952
[16]	validation_0-rmse:0.31911
[17]	validation_0-rmse:0.31872
[18]	validation_0-rmse:0.31831
[19]	validation_0-rmse:0.31792
[20]	validation_0-rmse:0.31753
[21]	validation_0-rmse:0.31716
[22]	validation_0-rmse:0.31679
[23]	validation_0-rmse:0.31642
[24]	validation_0-rmse:0.31606
[25]	validation_0-rmse:0.31572
[26]	validation_0-rmse:0.31537
[27]	validation_0-rmse:0.31503
[28]	validation_0-rmse:0.31469
[29]	validation_0-rmse:0.31438
[30]	validation_0-rmse:0.31405
[31]	validation_0-rmse:0.31375
[32]	validation_0-rmse:0.31345
[33]	validation_0-rmse:0.31314
[34]	validation_



[1]	validation_0-rmse:0.32617
[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32514
[4]	validation_0-rmse:0.32463
[5]	validation_0-rmse:0.32414
[6]	validation_0-rmse:0.32365
[7]	validation_0-rmse:0.32317
[8]	validation_0-rmse:0.32270
[9]	validation_0-rmse:0.32224
[10]	validation_0-rmse:0.32179
[11]	validation_0-rmse:0.32135
[12]	validation_0-rmse:0.32091
[13]	validation_0-rmse:0.32048
[14]	validation_0-rmse:0.32006
[15]	validation_0-rmse:0.31965
[16]	validation_0-rmse:0.31925
[17]	validation_0-rmse:0.31885
[18]	validation_0-rmse:0.31846
[19]	validation_0-rmse:0.31808
[20]	validation_0-rmse:0.31771
[21]	validation_0-rmse:0.31734
[22]	validation_0-rmse:0.31698
[23]	validation_0-rmse:0.31662
[24]	validation_0-rmse:0.31628
[25]	validation_0-rmse:0.31593
[26]	validation_0-rmse:0.31560
[27]	validation_0-rmse:0.31527
[28]	validation_0-rmse:0.31495
[29]	validation_0-rmse:0.31463
[30]	validation_0-rmse:0.31432
[31]	validation_0-rmse:0.31402
[32]	validation_0-rmse:0.31372
[33]	validation_0



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32513
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32413
[6]	validation_0-rmse:0.32364
[7]	validation_0-rmse:0.32316
[8]	validation_0-rmse:0.32269
[9]	validation_0-rmse:0.32223
[10]	validation_0-rmse:0.32178
[11]	validation_0-rmse:0.32133
[12]	validation_0-rmse:0.32089
[13]	validation_0-rmse:0.32047
[14]	validation_0-rmse:0.32004
[15]	validation_0-rmse:0.31961
[16]	validation_0-rmse:0.31919
[17]	validation_0-rmse:0.31879
[18]	validation_0-rmse:0.31840
[19]	validation_0-rmse:0.31800
[20]	validation_0-rmse:0.31761
[21]	validation_0-rmse:0.31723
[22]	validation_0-rmse:0.31685
[23]	validation_0-rmse:0.31649
[24]	validation_0-rmse:0.31615
[25]	validation_0-rmse:0.31579
[26]	validation_0-rmse:0.31544
[27]	validation_0-rmse:0.31509
[28]	validation_0-rmse:0.31476
[29]	validation_0-rmse:0.31444
[30]	validation_0-rmse:0.31413
[31]	validation_0-rmse:0.31381
[32]	validation_0-rmse:0.31351
[33]	validation_0-rmse:0.31321
[34]	validation_



[2]	validation_0-rmse:0.32565
[3]	validation_0-rmse:0.32512
[4]	validation_0-rmse:0.32462
[5]	validation_0-rmse:0.32412
[6]	validation_0-rmse:0.32362
[7]	validation_0-rmse:0.32312
[8]	validation_0-rmse:0.32264
[9]	validation_0-rmse:0.32216
[10]	validation_0-rmse:0.32171
[11]	validation_0-rmse:0.32127
[12]	validation_0-rmse:0.32082
[13]	validation_0-rmse:0.32038
[14]	validation_0-rmse:0.31994
[15]	validation_0-rmse:0.31952
[16]	validation_0-rmse:0.31911
[17]	validation_0-rmse:0.31872
[18]	validation_0-rmse:0.31831
[19]	validation_0-rmse:0.31792
[20]	validation_0-rmse:0.31753
[21]	validation_0-rmse:0.31716
[22]	validation_0-rmse:0.31679
[23]	validation_0-rmse:0.31642
[24]	validation_0-rmse:0.31606
[25]	validation_0-rmse:0.31572
[26]	validation_0-rmse:0.31537
[27]	validation_0-rmse:0.31503
[28]	validation_0-rmse:0.31469
[29]	validation_0-rmse:0.31438
[30]	validation_0-rmse:0.31405
[31]	validation_0-rmse:0.31375
[32]	validation_0-rmse:0.31345
[33]	validation_0-rmse:0.31314
[34]	validation_



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32120
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31968
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31873
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31783
[17]	validation_0-rmse:0.31740
[18]	validation_0-rmse:0.31698
[19]	validation_0-rmse:0.31657
[20]	validation_0-rmse:0.31617
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31504
[24]	validation_0-rmse:0.31468
[25]	validation_0-rmse:0.31433
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31334
[29]	validation_0-rmse:0.31303
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31243
[32]	validation_0-rmse:0.31214
[33]	validation_0-rmse:0.31186
[34]	validation_



[2]	validation_0-rmse:0.32525
[3]	validation_0-rmse:0.32461
[4]	validation_0-rmse:0.32400
[5]	validation_0-rmse:0.32340
[6]	validation_0-rmse:0.32282
[7]	validation_0-rmse:0.32225
[8]	validation_0-rmse:0.32170
[9]	validation_0-rmse:0.32116
[10]	validation_0-rmse:0.32064
[11]	validation_0-rmse:0.32013
[12]	validation_0-rmse:0.31963
[13]	validation_0-rmse:0.31915
[14]	validation_0-rmse:0.31868
[15]	validation_0-rmse:0.31822
[16]	validation_0-rmse:0.31777
[17]	validation_0-rmse:0.31734
[18]	validation_0-rmse:0.31692
[19]	validation_0-rmse:0.31651
[20]	validation_0-rmse:0.31611
[21]	validation_0-rmse:0.31572
[22]	validation_0-rmse:0.31534
[23]	validation_0-rmse:0.31497
[24]	validation_0-rmse:0.31461
[25]	validation_0-rmse:0.31426
[26]	validation_0-rmse:0.31392
[27]	validation_0-rmse:0.31359
[28]	validation_0-rmse:0.31327
[29]	validation_0-rmse:0.31296
[30]	validation_0-rmse:0.31265
[31]	validation_0-rmse:0.31235
[32]	validation_0-rmse:0.31206
[33]	validation_0-rmse:0.31178
[34]	validation_



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32119
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31967
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31872
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31782
[17]	validation_0-rmse:0.31739
[18]	validation_0-rmse:0.31697
[19]	validation_0-rmse:0.31656
[20]	validation_0-rmse:0.31616
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31503
[24]	validation_0-rmse:0.31467
[25]	validation_0-rmse:0.31432
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31333
[29]	validation_0-rmse:0.31302
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31242
[32]	validation_0-rmse:0.31213
[33]	validation_0-rmse:0.31185
[34]	validation_



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32120
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31968
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31873
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31783
[17]	validation_0-rmse:0.31740
[18]	validation_0-rmse:0.31698
[19]	validation_0-rmse:0.31657
[20]	validation_0-rmse:0.31617
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31504
[24]	validation_0-rmse:0.31468
[25]	validation_0-rmse:0.31433
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31334
[29]	validation_0-rmse:0.31303
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31243
[32]	validation_0-rmse:0.31214
[33]	validation_0-rmse:0.31186
[34]	validation_



[3]	validation_0-rmse:0.32461
[4]	validation_0-rmse:0.32400
[5]	validation_0-rmse:0.32340
[6]	validation_0-rmse:0.32282
[7]	validation_0-rmse:0.32225
[8]	validation_0-rmse:0.32170
[9]	validation_0-rmse:0.32116
[10]	validation_0-rmse:0.32064
[11]	validation_0-rmse:0.32013
[12]	validation_0-rmse:0.31963
[13]	validation_0-rmse:0.31915
[14]	validation_0-rmse:0.31868
[15]	validation_0-rmse:0.31822
[16]	validation_0-rmse:0.31777
[17]	validation_0-rmse:0.31734
[18]	validation_0-rmse:0.31692
[19]	validation_0-rmse:0.31651
[20]	validation_0-rmse:0.31611
[21]	validation_0-rmse:0.31572
[22]	validation_0-rmse:0.31534
[23]	validation_0-rmse:0.31497
[24]	validation_0-rmse:0.31461
[25]	validation_0-rmse:0.31426
[26]	validation_0-rmse:0.31392
[27]	validation_0-rmse:0.31359
[28]	validation_0-rmse:0.31327
[29]	validation_0-rmse:0.31296
[30]	validation_0-rmse:0.31265
[31]	validation_0-rmse:0.31235
[32]	validation_0-rmse:0.31206
[33]	validation_0-rmse:0.31178
[34]	validation_0-rmse:0.31151
[35]	validation



[2]	validation_0-rmse:0.32526
[3]	validation_0-rmse:0.32463
[4]	validation_0-rmse:0.32402
[5]	validation_0-rmse:0.32342
[6]	validation_0-rmse:0.32284
[7]	validation_0-rmse:0.32228
[8]	validation_0-rmse:0.32173
[9]	validation_0-rmse:0.32119
[10]	validation_0-rmse:0.32067
[11]	validation_0-rmse:0.32017
[12]	validation_0-rmse:0.31967
[13]	validation_0-rmse:0.31919
[14]	validation_0-rmse:0.31872
[15]	validation_0-rmse:0.31827
[16]	validation_0-rmse:0.31782
[17]	validation_0-rmse:0.31739
[18]	validation_0-rmse:0.31697
[19]	validation_0-rmse:0.31656
[20]	validation_0-rmse:0.31616
[21]	validation_0-rmse:0.31578
[22]	validation_0-rmse:0.31540
[23]	validation_0-rmse:0.31503
[24]	validation_0-rmse:0.31467
[25]	validation_0-rmse:0.31432
[26]	validation_0-rmse:0.31399
[27]	validation_0-rmse:0.31366
[28]	validation_0-rmse:0.31333
[29]	validation_0-rmse:0.31302
[30]	validation_0-rmse:0.31272
[31]	validation_0-rmse:0.31242
[32]	validation_0-rmse:0.31213
[33]	validation_0-rmse:0.31185
[34]	validation_



[1]	validation_0-rmse:0.32565
[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32413
[4]	validation_0-rmse:0.32340
[5]	validation_0-rmse:0.32269
[6]	validation_0-rmse:0.32200
[7]	validation_0-rmse:0.32133
[8]	validation_0-rmse:0.32068
[9]	validation_0-rmse:0.32005
[10]	validation_0-rmse:0.31943
[11]	validation_0-rmse:0.31883
[12]	validation_0-rmse:0.31825
[13]	validation_0-rmse:0.31769
[14]	validation_0-rmse:0.31714
[15]	validation_0-rmse:0.31660
[16]	validation_0-rmse:0.31608
[17]	validation_0-rmse:0.31558
[18]	validation_0-rmse:0.31508
[19]	validation_0-rmse:0.31461
[20]	validation_0-rmse:0.31414
[21]	validation_0-rmse:0.31369
[22]	validation_0-rmse:0.31323
[23]	validation_0-rmse:0.31280
[24]	validation_0-rmse:0.31238
[25]	validation_0-rmse:0.31196
[26]	validation_0-rmse:0.31156
[27]	validation_0-rmse:0.31116
[28]	validation_0-rmse:0.31079
[29]	validation_0-rmse:0.31040
[30]	validation_0-rmse:0.31005
[31]	validation_0-rmse:0.30972
[32]	validation_0-rmse:0.30936
[33]	validation_0



[1]	validation_0-rmse:0.32564
[2]	validation_0-rmse:0.32487
[3]	validation_0-rmse:0.32412
[4]	validation_0-rmse:0.32339
[5]	validation_0-rmse:0.32268
[6]	validation_0-rmse:0.32199
[7]	validation_0-rmse:0.32132
[8]	validation_0-rmse:0.32066
[9]	validation_0-rmse:0.32003
[10]	validation_0-rmse:0.31941
[11]	validation_0-rmse:0.31878
[12]	validation_0-rmse:0.31818
[13]	validation_0-rmse:0.31758
[14]	validation_0-rmse:0.31703
[15]	validation_0-rmse:0.31650
[16]	validation_0-rmse:0.31595
[17]	validation_0-rmse:0.31542
[18]	validation_0-rmse:0.31491
[19]	validation_0-rmse:0.31443
[20]	validation_0-rmse:0.31396
[21]	validation_0-rmse:0.31349
[22]	validation_0-rmse:0.31305
[23]	validation_0-rmse:0.31262
[24]	validation_0-rmse:0.31218
[25]	validation_0-rmse:0.31178
[26]	validation_0-rmse:0.31137
[27]	validation_0-rmse:0.31097
[28]	validation_0-rmse:0.31060
[29]	validation_0-rmse:0.31024
[30]	validation_0-rmse:0.30987
[31]	validation_0-rmse:0.30953
[32]	validation_0-rmse:0.30920
[33]	validation_0



[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32411
[4]	validation_0-rmse:0.32335
[5]	validation_0-rmse:0.32262
[6]	validation_0-rmse:0.32191
[7]	validation_0-rmse:0.32124
[8]	validation_0-rmse:0.32059
[9]	validation_0-rmse:0.31993
[10]	validation_0-rmse:0.31930
[11]	validation_0-rmse:0.31868
[12]	validation_0-rmse:0.31810
[13]	validation_0-rmse:0.31753
[14]	validation_0-rmse:0.31696
[15]	validation_0-rmse:0.31640
[16]	validation_0-rmse:0.31586
[17]	validation_0-rmse:0.31536
[18]	validation_0-rmse:0.31485
[19]	validation_0-rmse:0.31437
[20]	validation_0-rmse:0.31389
[21]	validation_0-rmse:0.31344
[22]	validation_0-rmse:0.31300
[23]	validation_0-rmse:0.31255
[24]	validation_0-rmse:0.31214
[25]	validation_0-rmse:0.31172
[26]	validation_0-rmse:0.31132
[27]	validation_0-rmse:0.31093
[28]	validation_0-rmse:0.31056
[29]	validation_0-rmse:0.31018
[30]	validation_0-rmse:0.30983
[31]	validation_0-rmse:0.30947
[32]	validation_0-rmse:0.30914
[33]	validation_0-rmse:0.30882
[34]	validation_



[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32413
[4]	validation_0-rmse:0.32340
[5]	validation_0-rmse:0.32269
[6]	validation_0-rmse:0.32200
[7]	validation_0-rmse:0.32133
[8]	validation_0-rmse:0.32068
[9]	validation_0-rmse:0.32005
[10]	validation_0-rmse:0.31943
[11]	validation_0-rmse:0.31883
[12]	validation_0-rmse:0.31825
[13]	validation_0-rmse:0.31769
[14]	validation_0-rmse:0.31714
[15]	validation_0-rmse:0.31660
[16]	validation_0-rmse:0.31608
[17]	validation_0-rmse:0.31558
[18]	validation_0-rmse:0.31508
[19]	validation_0-rmse:0.31461
[20]	validation_0-rmse:0.31414
[21]	validation_0-rmse:0.31369
[22]	validation_0-rmse:0.31323
[23]	validation_0-rmse:0.31280
[24]	validation_0-rmse:0.31238
[25]	validation_0-rmse:0.31196
[26]	validation_0-rmse:0.31156
[27]	validation_0-rmse:0.31116
[28]	validation_0-rmse:0.31079
[29]	validation_0-rmse:0.31040
[30]	validation_0-rmse:0.31005
[31]	validation_0-rmse:0.30972
[32]	validation_0-rmse:0.30936
[33]	validation_0-rmse:0.30904
[34]	validation_



[2]	validation_0-rmse:0.32487
[3]	validation_0-rmse:0.32412
[4]	validation_0-rmse:0.32339
[5]	validation_0-rmse:0.32268
[6]	validation_0-rmse:0.32199
[7]	validation_0-rmse:0.32132
[8]	validation_0-rmse:0.32066
[9]	validation_0-rmse:0.32003
[10]	validation_0-rmse:0.31941
[11]	validation_0-rmse:0.31878
[12]	validation_0-rmse:0.31818
[13]	validation_0-rmse:0.31758
[14]	validation_0-rmse:0.31703
[15]	validation_0-rmse:0.31650
[16]	validation_0-rmse:0.31595
[17]	validation_0-rmse:0.31542
[18]	validation_0-rmse:0.31491
[19]	validation_0-rmse:0.31443
[20]	validation_0-rmse:0.31396
[21]	validation_0-rmse:0.31349
[22]	validation_0-rmse:0.31305
[23]	validation_0-rmse:0.31262
[24]	validation_0-rmse:0.31218
[25]	validation_0-rmse:0.31178
[26]	validation_0-rmse:0.31137
[27]	validation_0-rmse:0.31097
[28]	validation_0-rmse:0.31060
[29]	validation_0-rmse:0.31024
[30]	validation_0-rmse:0.30987
[31]	validation_0-rmse:0.30953
[32]	validation_0-rmse:0.30920
[33]	validation_0-rmse:0.30886
[34]	validation_



[2]	validation_0-rmse:0.32488
[3]	validation_0-rmse:0.32411
[4]	validation_0-rmse:0.32335
[5]	validation_0-rmse:0.32262
[6]	validation_0-rmse:0.32191
[7]	validation_0-rmse:0.32124
[8]	validation_0-rmse:0.32059
[9]	validation_0-rmse:0.31993
[10]	validation_0-rmse:0.31930
[11]	validation_0-rmse:0.31868
[12]	validation_0-rmse:0.31810
[13]	validation_0-rmse:0.31753
[14]	validation_0-rmse:0.31696
[15]	validation_0-rmse:0.31640
[16]	validation_0-rmse:0.31586
[17]	validation_0-rmse:0.31536
[18]	validation_0-rmse:0.31485
[19]	validation_0-rmse:0.31437
[20]	validation_0-rmse:0.31389
[21]	validation_0-rmse:0.31344
[22]	validation_0-rmse:0.31300
[23]	validation_0-rmse:0.31255
[24]	validation_0-rmse:0.31214
[25]	validation_0-rmse:0.31172
[26]	validation_0-rmse:0.31132
[27]	validation_0-rmse:0.31093
[28]	validation_0-rmse:0.31056
[29]	validation_0-rmse:0.31018
[30]	validation_0-rmse:0.30983
[31]	validation_0-rmse:0.30947
[32]	validation_0-rmse:0.30914
[33]	validation_0-rmse:0.30882
[34]	validation_



[6]	validation_0-rmse:0.32200
[7]	validation_0-rmse:0.32133
[8]	validation_0-rmse:0.32068
[9]	validation_0-rmse:0.32005
[10]	validation_0-rmse:0.31943
[11]	validation_0-rmse:0.31880
[12]	validation_0-rmse:0.31820
[13]	validation_0-rmse:0.31761
[14]	validation_0-rmse:0.31705
[15]	validation_0-rmse:0.31652
[16]	validation_0-rmse:0.31598
[17]	validation_0-rmse:0.31545
[18]	validation_0-rmse:0.31495
[19]	validation_0-rmse:0.31448
[20]	validation_0-rmse:0.31399
[21]	validation_0-rmse:0.31354
[22]	validation_0-rmse:0.31308
[23]	validation_0-rmse:0.31265
[24]	validation_0-rmse:0.31223
[25]	validation_0-rmse:0.31181
[26]	validation_0-rmse:0.31142
[27]	validation_0-rmse:0.31103
[28]	validation_0-rmse:0.31064
[29]	validation_0-rmse:0.31028
[30]	validation_0-rmse:0.30991
[31]	validation_0-rmse:0.30957
[32]	validation_0-rmse:0.30924
[33]	validation_0-rmse:0.30890
[34]	validation_0-rmse:0.30859
[35]	validation_0-rmse:0.30829
[36]	validation_0-rmse:0.30797
[37]	validation_0-rmse:0.30767
[38]	validat

In [None]:
grid_search.best_params_

{'gamma': 0.3,
 'learning_rate': 0.015,
 'max_depth': 5,
 'n_estimators': 250,
 'nthread': -1,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'seed': 10}

In [None]:
grid_search.best_score_

0.17885859414544655

In [None]:
# Define custom MAPE scorer
def mean_absolute_percentage_error(y_true, y_pred):
    y_true = y_true.astype(np.int32)
    y_pred = y_pred.astype(np.int32)

    # Calculate absolute percentage error
    absolute_percentage_error = np.abs((y_true - y_pred) / y_true)

    # Handle division by zero (replace with zero)
    absolute_percentage_error = absolute_percentage_error[~np.isnan(absolute_percentage_error)]  # Remove NaNs

    # Calculate mean absolute percentage error
    mape = np.mean(absolute_percentage_error) * 100
    return mape

In [None]:
def match_column_types(df1, df2):
    """
    Match the column types of the second dataframe to those of the first dataframe.

    Args:
    - df1 (DataFrame): The first dataframe.
    - df2 (DataFrame): The second dataframe.

    Returns:
    - DataFrame: The second dataframe with column types matched to those of the first dataframe.
    """
    # Iterate through columns in the first dataframe
    for col in df1.columns:
        # Check if the column exists in the second dataframe
        if col in df2.columns:
            # Match the data type of the column in the second dataframe to that of the first dataframe
            df2[col] = df2[col].astype(df1[col].dtype)

    return df2
z = match_column_types(X_test, z)

In [None]:
%%time
# Predict with X_test data
y_xgb_prob = grid_search.predict(X_test)

# Predict with text data
z_xgb_prob = grid_search.predict(z)

# Calculate RMSE and MAPE
rmse = np.sqrt(mean_squared_error(y_test.astype(np.int16), y_xgb_prob.astype(np.int16)))
mape = mean_absolute_percentage_error(y_test, y_xgb_prob)

print("RMSE:", rmse)
print("MAPE:", mape)


RMSE: 0.3534652486936423
MAPE: 100.0
CPU times: user 397 ms, sys: 1 ms, total: 398 ms
Wall time: 431 ms


In [None]:
y_xgb_prob

array([0.00319639, 0.19011934, 0.2972872 , ..., 0.00319639, 0.00319639,
       0.00319639], dtype=float32)

In [None]:
rounded_arr = np.where(y_xgb_prob > 0.5, np.ceil(y_xgb_prob), np.floor(y_xgb_prob))

print(classification_report(y_test, rounded_arr))


              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.88      1.00      0.93     66699
         1.0       0.75      0.00      0.00      9523

    accuracy                           0.87     76222
   macro avg       0.54      0.33      0.31     76222
weighted avg       0.86      0.87      0.82     76222



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
text_df['xgb_prediction'] = z_xgb_prob
text_df.head(10)

Unnamed: 0,ID,Name,Description,xgb_prediction
0,191150,Julia,"Meet Julia, a 34-year-old woman living in the ...",0.028035
1,76655,Carlos,Carlos is a 52-year-old man who prides himself...,-0.025404
2,98173,Sophie,"At 28 years old, Sophie finally steps into car...",0.049164
3,906632,Victor,"Victor, aged 45, has always been a keen observ...",0.031144
4,803321,Emily,"Emily, a 29-year-old professional from region ...",0.198327
5,639179,Tom,"Tom, a 39-year-old resident of region code 256...",0.003196
6,784734,Linda,"Linda, a 47-year-old with a passion for road t...",0.003196
7,598542,Jeremy,"At 30, Jeremy has made a significant shift fro...",0.027683
8,282861,Fiona,"Fiona, a 26-year-old teacher in region code 32...",0.003196
9,69740,Mark,"Mark, at 55, has seen it all when it comes to ...",0.171834


In [None]:
result_df['xgb_prediction'] = y_xgb_prob
result_df.head(10)

Unnamed: 0,id,actual_response,xgb_prediction
200222,200223,0,0.003196
49766,49767,0,0.190119
172201,172202,0,0.297287
160713,160714,0,0.057874
53272,53273,0,0.267288
372603,372604,0,0.003196
216160,216161,0,0.295791
59206,59207,0,0.003196
26462,26463,0,0.180954
95043,95044,1,0.290791


In [None]:
result_df.to_csv("results.csv", index=False)
text_df.to_csv("text.csv", index=False)

# RandomForestClassifier



In [None]:
n_estimators = [700]
max_depth = [10, 8]
min_samples_split = [10, 5]
min_samples_leaf = [5, 3]

In [None]:
hyper_random = {"n_estimators":n_estimators,
                "max_depth":max_depth,
                "min_samples_split":min_samples_split,
                "min_samples_leaf":min_samples_leaf}

In [None]:
%%time
clf_rf_tuned = GridSearchCV(RandomForestClassifier(), hyper_random,
                            cv = 5, verbose = 1,
                            n_jobs = -1)
clf_rf_tuned.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


  pid = os.fork()
  pid = os.fork()


CPU times: user 3min 52s, sys: 9.54 s, total: 4min 1s
Wall time: 1h 17min 49s


In [None]:
best_params_random = clf_rf_tuned.best_params_
print(best_params_random)

{'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 700}


In [None]:
CV_clf_rf = RandomForestClassifier(max_depth=best_params_random["max_depth"],
                                   min_samples_leaf=best_params_random["min_samples_leaf"],
                                   min_samples_split=best_params_random["min_samples_split"],
                                   n_estimators= best_params_random["n_estimators"])

In [None]:
%%time
# Train model
CV_clf_rf.fit(X_train, y_train)

# Predict with X_test data
y_test_predict_random = CV_clf_rf.predict_proba(X_test)[:, 1]
z_predict_random = CV_clf_rf.predict_proba(z)[:, 1]

yhat_random = CV_clf_rf.predict(X_test)
fraction_of_positives, mean_predicted_value = calibration_curve(y_test, y_test_predict_random, n_bins=10)

CPU times: user 3min 20s, sys: 245 ms, total: 3min 20s
Wall time: 3min 21s


In [None]:
print(classification_report(y_test, yhat_random))


              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.86      0.00      0.00      9523

    accuracy                           0.88     76222
   macro avg       0.87      0.50      0.47     76222
weighted avg       0.87      0.88      0.82     76222



In [None]:
# Calculate RMSE and MAPE
rmse = np.sqrt(mean_squared_error(y_test.astype(np.int16), y_xgb_prob.astype(np.int16)))
mape = mean_absolute_percentage_error(y_test, y_xgb_prob)

print("RMSE:", rmse)
print("MAPE:", mape)

RMSE: 0.3534652486936423
MAPE: 100.0


In [None]:
result_df['rf_prediction'] = y_test_predict_random
text_df['rf_prediction'] = z_predict_random

In [None]:
result_df.to_csv("results.csv", index=False)
text_df.to_csv("text.csv", index=False)


In [None]:
result_df[result_df.actual_response == 1].head(10)

Unnamed: 0,id,actual_response,xgb_prediction,rf_prediction
95043,95044,1,0.290791,0.297669
55640,55641,1,0.035913,0.031914
196603,196604,1,0.300147,0.308155
292232,292233,1,0.324631,0.328781
198094,198095,1,0.323598,0.324003
107941,107942,1,0.330999,0.335192
269651,269652,1,0.163519,0.197877
25377,25378,1,0.216523,0.134851
64439,64440,1,0.177557,0.172306
227812,227813,1,0.286755,0.296139


# Naive Bias

In [None]:
%%time
# Uncalibrated
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
y_test_predict_nb = clf_nb.predict_proba(X_test)[:, 1]
z_predict_nb = clf_nb.predict_proba(z)[:, 1]
yhat_nb = clf_nb.predict(X_test)
fraction_of_positives_nb, mean_predicted_value_nb = calibration_curve(y_test, y_test_predict_nb, n_bins=10)


CPU times: user 166 ms, sys: 0 ns, total: 166 ms
Wall time: 171 ms


In [None]:
print(classification_report(y_test, yhat_nb))


              precision    recall  f1-score   support

           0       0.99      0.64      0.78     66699
           1       0.28      0.95      0.43      9523

    accuracy                           0.68     76222
   macro avg       0.63      0.80      0.60     76222
weighted avg       0.90      0.68      0.74     76222



In [None]:
result_df['nb_prediction'] = y_test_predict_nb
text_df['nb_prediction'] = z_predict_nb
result_df.to_csv("results.csv", index=False)
text_df.to_csv("text.csv", index=False)


In [None]:
X_train.head()

Unnamed: 0,age,driving_license,region_code,previously_insured,vehicle_age,annual_premium,policy_sales_channel,tenure,is_female,is_damage
332803,0.666667,1,0.12355,0,0.5,0.093422,0.147158,0.750865,1,1
116248,0.333333,1,0.085181,0,0.5,0.037986,0.147158,0.065744,0,1
255005,0.0,1,0.113036,1,0.0,0.07937,1.0,0.539792,0,0
317474,0.0,1,0.170194,1,0.0,0.049259,1.0,0.923875,1,0
344212,1.0,1,0.042341,0,1.0,0.0,1.0,0.50173,0,1


In [None]:
%%time
# Calibrated
clf_sigmoid_nb = CalibratedClassifierCV(clf_nb, cv=10, method='isotonic')
clf_sigmoid_nb.fit(X_train, y_train)

y_test_predict_nb_calib = clf_sigmoid_nb.predict_proba(X_test)[:, 1]
z_predict_nb_calib = clf_sigmoid_nb.predict_proba(z)[:, 1]


yhat_calibrated_nb = clf_sigmoid_nb.predict(X_test)

fraction_of_positives_nb_calib, mean_predicted_value_nb_calib = calibration_curve(y_test, y_test_predict_nb_calib, n_bins=10)


CPU times: user 1.83 s, sys: 230 ms, total: 2.05 s
Wall time: 2.06 s


In [None]:
print(classification_report(y_test, yhat_calibrated_nb))


              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.27      0.00      0.00      9523

    accuracy                           0.87     76222
   macro avg       0.57      0.50      0.47     76222
weighted avg       0.80      0.87      0.82     76222



In [None]:
result_df['nb_isotonic_prediction'] = y_test_predict_nb_calib
result_df.to_csv("results.csv", index=False)
text_df['nb_isotonic_prediction'] = z_predict_nb
text_df.to_csv("text.csv", index=False)

In [None]:
%%time
# Calibrated, Platt
clf_sigmoid_nb_calib_sig = CalibratedClassifierCV(clf_nb, cv=10, method='sigmoid')
clf_sigmoid_nb_calib_sig.fit(X_train, y_train)

y_test_predict_nb_calib_platt = clf_sigmoid_nb_calib_sig.predict_proba(X_test)[:, 1]
z_predict_nb_calib_platt = clf_sigmoid_nb_calib_sig.predict_proba(z)[:, 1]

yhat_calibrated_platt = clf_sigmoid_nb_calib_sig.predict(X_test)

fraction_of_positives_nb_calib_platt, mean_predicted_value_nb_calib_platt = calibration_curve(y_test, y_test_predict_nb_calib_platt, n_bins=10)


CPU times: user 2.94 s, sys: 1.33 s, total: 4.27 s
Wall time: 3.78 s


In [None]:
print(classification_report(y_test, yhat_calibrated_platt))


              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.00      0.00      0.00      9523

    accuracy                           0.88     76222
   macro avg       0.44      0.50      0.47     76222
weighted avg       0.77      0.88      0.82     76222



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
result_df['nb_sigmoid_prediction'] = y_test_predict_nb_calib_platt
result_df.to_csv("results.csv", index=False)

text_df['nb_sigmoid_prediction'] = z_predict_nb_calib_platt
text_df.to_csv("text.csv", index=False)

In [None]:
text_df.head()

Unnamed: 0,ID,Name,Description,xgb_prediction,rf_prediction,nb_prediction,nb_isotonic_prediction,nb_sigmoid_prediction
0,191150,Julia,"Meet Julia, a 34-year-old woman living in the ...",0.028035,0.134127,0.0,0.0,0.01076
1,76655,Carlos,Carlos is a 52-year-old man who prides himself...,-0.025404,0.062272,0.0,0.0,0.01076
2,98173,Sophie,"At 28 years old, Sophie finally steps into car...",0.049164,0.108779,0.0,0.0,0.01076
3,906632,Victor,"Victor, aged 45, has always been a keen observ...",0.031144,0.119561,0.0,0.0,0.01076
4,803321,Emily,"Emily, a 29-year-old professional from region ...",0.198327,0.239086,0.0,0.0,0.01076


In [None]:
result_df['nb_prediction'] = result_df['nb_prediction'].apply(lambda x: '{:.5f}'.format(x))
result_df['nb_isotonic_prediction'] = result_df['nb_isotonic_prediction'].apply(lambda x: '{:.5f}'.format(x))
result_df['nb_sigmoid_prediction'] = result_df['nb_sigmoid_prediction'].apply(lambda x: '{:.5f}'.format(x))
result_df['rf_prediction'] = result_df['rf_prediction'].apply(lambda x: '{:.5f}'.format(x))
result_df['xgb_prediction'] = result_df['xgb_prediction'].apply(lambda x: '{:.5f}'.format(x))
######
text_df['nb_prediction'] = text_df['nb_prediction'].apply(lambda x: '{:.5f}'.format(x))
text_df['nb_isotonic_prediction'] = text_df['nb_isotonic_prediction'].apply(lambda x: '{:.5f}'.format(x))
text_df['nb_sigmoid_prediction'] = text_df['nb_sigmoid_prediction'].apply(lambda x: '{:.5f}'.format(x))
text_df['rf_prediction'] = text_df['rf_prediction'].apply(lambda x: '{:.5f}'.format(x))
text_df['xgb_prediction'] = text_df['xgb_prediction'].apply(lambda x: '{:.5f}'.format(x))

result_df.head(10)

Unnamed: 0,id,actual_response,xgb_prediction,rf_prediction,nb_prediction,nb_isotonic_prediction,nb_sigmoid_prediction
200222,200223,0,0.0032,0.00012,0.0,0.0011,0.01076
49766,49767,0,0.19012,0.19473,0.98278,0.2778,0.29495
172201,172202,0,0.29729,0.29677,0.97936,0.2778,0.29231
160713,160714,0,0.05787,0.04443,0.0,0.02891,0.01076
53272,53273,0,0.26729,0.28345,0.98492,0.2778,0.2966
372603,372604,0,0.0032,0.00025,0.0,0.0011,0.01076
216160,216161,0,0.29579,0.33251,0.99308,0.2824,0.30297
59206,59207,0,0.0032,0.00037,0.0,0.0011,0.01076
26462,26463,0,0.18095,0.17852,0.97951,0.2778,0.29243
95043,95044,1,0.29079,0.29767,0.99407,0.28363,0.30374


In [None]:
result_df.columns

Index(['id', 'actual_response', 'xgb_prediction', 'rf_prediction', 'nb_prediction', 'nb_isotonic_prediction', 'nb_sigmoid_prediction'], dtype='object')

In [None]:
result_df = reduce_mem_usage(result_df)
result_df.info()

Mem. usage decreased to  3.85 Mb (17.2% reduction)
<class 'pandas.core.frame.DataFrame'>
Index: 76222 entries, 200222 to 185839
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      76222 non-null  int32 
 1   actual_response         76222 non-null  int8  
 2   xgb_prediction          76222 non-null  object
 3   rf_prediction           76222 non-null  object
 4   nb_prediction           76222 non-null  object
 5   nb_isotonic_prediction  76222 non-null  object
 6   nb_sigmoid_prediction   76222 non-null  object
dtypes: int32(1), int8(1), object(5)
memory usage: 3.9+ MB


In [None]:
text_df = reduce_mem_usage(text_df)
text_df.info()

Mem. usage decreased to  0.00 Mb (5.2% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      10 non-null     int32 
 1   Name                    10 non-null     object
 2   Description             10 non-null     object
 3   xgb_prediction          10 non-null     object
 4   rf_prediction           10 non-null     object
 5   nb_prediction           10 non-null     object
 6   nb_isotonic_prediction  10 non-null     object
 7   nb_sigmoid_prediction   10 non-null     object
dtypes: int32(1), object(7)
memory usage: 728.0+ bytes


In [None]:
# Convert object-type columns in a DataFrame to float.

def convert_object_to_float(dataframe):

    for column in dataframe.columns:
        # Check if the column dtype is 'object'
        if dataframe[column].dtype == 'object':
            try:
                # Attempt to convert the column to float
                dataframe[column] = dataframe[column].astype(float)
            except ValueError:
                # Print a message if conversion fails
                print(f"Unable to convert column '{column}' to float.")
    return dataframe


In [None]:
result_df = convert_object_to_float(result_df)
text_df = convert_object_to_float(text_df)

text_df.info()

Unable to convert column 'Name' to float.
Unable to convert column 'Description' to float.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      10 non-null     int32  
 1   Name                    10 non-null     object 
 2   Description             10 non-null     object 
 3   xgb_prediction          10 non-null     float64
 4   rf_prediction           10 non-null     float64
 5   nb_prediction           10 non-null     float64
 6   nb_isotonic_prediction  10 non-null     float64
 7   nb_sigmoid_prediction   10 non-null     float64
dtypes: float64(5), int32(1), object(2)
memory usage: 728.0+ bytes


In [None]:
# Method for scoring the predictions
def calc_cancellation_score(final_df, target_col):

    # Get accuracy values
    xgb_accuracy = 0.87
    random_forest_accuracy = 0.88
    naive_bias_accuracy = 0.68
    isotonic_calibrated_naive_bias_accuracy = 0.87
    sigmoid_calibrated_naive_bias_accuracy = 0.88

    final_df[target_col] = (final_df['rf_prediction'] * float(random_forest_accuracy) +
                            final_df['xgb_prediction'] * float(xgb_accuracy) +
                            final_df['nb_prediction'] * float(naive_bias_accuracy) +
                            final_df['nb_isotonic_prediction'] * float(isotonic_calibrated_naive_bias_accuracy) +
                            final_df['nb_sigmoid_prediction'] * float(sigmoid_calibrated_naive_bias_accuracy))

    def NormalizeData(data):
        return (data - np.min(data)) / (np.max(data) - np.min(data))

    final_df[target_col] = NormalizeData(final_df[target_col])

    return final_df

In [None]:
result_df = calc_cancellation_score(result_df, 'response_score')
text_df = calc_cancellation_score(text_df, 'response_score')

result_df.head()

Unnamed: 0,id,actual_response,xgb_prediction,rf_prediction,nb_prediction,nb_isotonic_prediction,nb_sigmoid_prediction,response_score
200222,200223,0,0.0032,0.00012,0.0,0.0011,0.01076,0.005294
49766,49767,0,0.19012,0.19473,0.98278,0.2778,0.29495,0.724229
172201,172202,0,0.29729,0.29677,0.97936,0.2778,0.29231,0.810128
160713,160714,0,0.05787,0.04443,0.0,0.02891,0.01076,0.058625
53272,53273,0,0.26729,0.28345,0.98492,0.2778,0.2966,0.795554


In [None]:
# Binarize the 'response_score' column in the DataFrame based on the given threshold.

def binarize_response_score(df, col, threshold):

    # Apply binarization to 'response_score' column
    df[col] = df['response_score'].apply(lambda x: 1 if x > threshold else 0)

    return df


In [None]:
result_df = binarize_response_score(result_df, 'response_v05', 0.5)
result_df = binarize_response_score(result_df, 'response_v07', 0.7)
result_df = binarize_response_score(result_df, 'response_v09', 0.9)

####

text_df = binarize_response_score(text_df, 'response_v05', 0.5)
text_df = binarize_response_score(text_df, 'response_v07', 0.7)
text_df = binarize_response_score(text_df, 'response_v09', 0.9)

In [None]:
print('Report for 0.5')

print(classification_report(result_df.actual_response, result_df.response_v05))

print('Report for 0.7')

print(classification_report(result_df.actual_response, result_df.response_v07))

print('Report for 0.9')

print(classification_report(result_df.actual_response, result_df.response_v09))


Report for 0.5
              precision    recall  f1-score   support

           0       0.98      0.69      0.81     66699
           1       0.29      0.89      0.44      9523

    accuracy                           0.71     76222
   macro avg       0.63      0.79      0.62     76222
weighted avg       0.89      0.71      0.76     76222

Report for 0.7
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     66699
           1       0.30      0.86      0.44      9523

    accuracy                           0.73     76222
   macro avg       0.63      0.78      0.63     76222
weighted avg       0.89      0.73      0.77     76222

Report for 0.9
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.46      0.03      0.05      9523

    accuracy                           0.87     76222
   macro avg       0.67      0.51      0.49     76222
weighted avg       0.82      

In [None]:
print('Actual 0 case = ', result_df[result_df.actual_response == 0].shape[0])
print('True 0 case = ', result_df[(result_df.actual_response == 0 ) & (result_df.response_v05 == 0)].shape[0])
print('False 0 case = ', result_df[(result_df.actual_response == 0 ) & (result_df.response_v05 == 1)].shape[0])
print('Actual 1 case = ', result_df[result_df.actual_response == 1].shape[0])
print('True 1 case = ', result_df[(result_df.actual_response == 1) & (result_df.response_v05 == 1)].shape[0])
print('False 1 case = ', result_df[(result_df.actual_response == 1) & (result_df.response_v05 == 0)].shape[0])

Actual 0 case =  66699
True 0 case =  45946
False 0 case =  20753
Actual 1 case =  9523
True 1 case =  8496
False 1 case =  1027


In [None]:
print('Actual 0 case = ', result_df[result_df.actual_response == 0].shape[0])
print('True 0 case = ', result_df[(result_df.actual_response == 0 ) & (result_df.response_v07 == 0)].shape[0])
print('False 0 case = ', result_df[(result_df.actual_response == 0 ) & (result_df.response_v07 == 1)].shape[0])
print('Actual 1 case = ', result_df[result_df.actual_response == 1].shape[0])
print('True 1 case = ', result_df[(result_df.actual_response == 1) & (result_df.response_v07 == 1)].shape[0])
print('False 1 case = ', result_df[(result_df.actual_response == 1) & (result_df.response_v07 == 0)].shape[0])

Actual 0 case =  66699
True 0 case =  47142
False 0 case =  19557
Actual 1 case =  9523
True 1 case =  8218
False 1 case =  1305


In [None]:
print('Actual 0 case = ', result_df[result_df.actual_response == 0].shape[0])
print('True 0 case = ', result_df[(result_df.actual_response == 0 ) & (result_df.response_v09 == 0)].shape[0])
print('False 0 case = ', result_df[(result_df.actual_response == 0 ) & (result_df.response_v09 == 1)].shape[0])
print('Actual 1 case = ', result_df[result_df.actual_response == 1].shape[0])
print('True 1 case = ', result_df[(result_df.actual_response == 1) & (result_df.response_v09 == 1)].shape[0])
print('False 1 case = ', result_df[(result_df.actual_response == 1) & (result_df.response_v09 == 0)].shape[0])

Actual 0 case =  66699
True 0 case =  66395
False 0 case =  304
Actual 1 case =  9523
True 1 case =  255
False 1 case =  9268


In [None]:
result_df.to_csv("results.csv", index=False)
text_df.to_csv("text.csv", index=False)

In [None]:
text_df.columns

Index(['ID', 'Name', 'Description', 'xgb_prediction', 'rf_prediction', 'nb_prediction', 'nb_isotonic_prediction', 'nb_sigmoid_prediction', 'response_score', 'response_v05', 'response_v07', 'response_v09'], dtype='object')