# Grid Search on AMD Ryzen 9 5950X 16-Core

**Imports**

In [3]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
plt.style.use('seaborn')

import xgboost as xgb

## Auto Claims Data





![](https://images.propertycasualty360.com/contrib/content/uploads/sites/414/2018/05/Telemactics_Feature_Car.jpg)

**[Dataset](https://www.kaggle.com/xiaomengsun/car-insurance-claim-data)**

|VARIABLE NAME|DEFINITION                              |THEORETICAL EFFECT                                                                               |
|-------------|----------------------------------------|-------------------------------------------------------------------------------------------------|
|INDEX        |Identification Variable (do not use)    |None                                                                                             |
|TARGET FLAG  |Was Car in a crash? 1=YES 0=NO          |None                                                                                             |
|TARGET AMT   |If car was in a crash, what was the cost|None                                                                                             |
|AGE          |Age of Driver                           |Very young people tend to be risky. Maybe very old people also.                                  |
|BLUEBOOK     |Value of Vehicle                        |Unknown effect on probability of collision, but probably effect the payout if there is a crash   |
|CAR AGE      |Vehicle Age                             |Unknown effect on probability of collision, but probably effect the payout if there is a crash   |
|CAR TYPE     |Type of Car                             |Unknown effect on probability of collision, but probably effect the payout if there is a crash   |
|CAR USE      |Vehicle Use                             |Commercial vehicles are driven more, so might increase probability of collision                  |
|CLM FREQ     |# Claims (Past 5 Years)                 |The more claims you filed in the past, the more you are likely to file in the future             |
|EDUCATION    |Max Education Level                     |Unknown effect, but in theory more educated people tend to drive more safely                     |
|HOMEKIDS     |# Children at Home                      |Unknown effect                                                                                   |
|HOME VAL     |Home Value                              |In theory, home owners tend to drive more responsibly                                            |
|INCOME       |Income                                  |In theory, rich people tend to get into fewer crashes                                            |
|JOB          |Job Category                            |In theory, white collar jobs tend to be safer                                                    |
|KIDSDRIV     |# Driving Children                      |When teenagers drive your car, you are more likely to get into crashes                           |
|MSTATUS      |Marital Status                          |In theory, married people drive more safely                                                      |
|MVR PTS      |Motor Vehicle Record Points             |If you get lots of traffic tickets, you tend to get into more crashes                            |
|OLDCLAIM     |Total Claims (Past 5 Years)             |If your total payout over the past five years was high, this suggests future payouts will be high|
|PARENT1      |Single Parent                           |Unknown effect                                                                                   |
|RED CAR      |A Red Car                               |Urban legend says that red cars (especially red sports cars) are more risky. Is that true?       |
|REVOKED      |License Revoked (Past 7 Years)          |If your license was revoked in the past 7 years, you probably are a more risky driver.           |
|SEX          |Gender                                  |Urban legend says that women have less crashes then men. Is that true?                           |
|TIF          |Time in Force                           |People who have been customers for a long time are usually more safe.                            |
|TRAVTIME     |Distance to Work                        |Long drives to work usually suggest greater risk                                                 |
|URBANICITY   |Home/Work Area                          |Unknown                                                                                          |
|YOJ          |Years on Job                            |People who stay at a job for a long time are usually more safe                                   |


In [4]:
# Load Data
df = pd.read_csv('data/car_insurance_claim.csv')

# make columns lowercase
df.columns = df.columns.str.lower()

# drop useless columns
df = df.drop(['kidsdriv','parent1','revoked','mvr_pts','travtime','id','birth'],axis=1)

# clean money amounts
df[['home_val','bluebook','oldclaim','clm_amt','income']] = df[['home_val','bluebook','oldclaim','clm_amt','income']].apply(lambda x: x.str.replace('$','',regex=False).str.replace(',','',regex=False)).astype(float)

# clean values from columns
to_clean = ['education','occupation','mstatus','gender','car_type']
for col in to_clean:
    df[col] = df[col].str.replace('z_','',regex=False).str.replace('<','',regex=False)

df['urbanicity'] = df['urbanicity'].str.split('/ ',expand=True)[1]

to_clean = ['mstatus','red_car']
for col in to_clean:
    df[col] = df[col].str.lower().replace({ 'yes': True, 'no': False}).astype(int)
    
df = df.drop(['car_age','occupation','home_val','income','yoj'],axis=1).dropna()
  
df[:3]

Unnamed: 0,age,homekids,mstatus,gender,education,car_use,bluebook,tif,car_type,red_car,oldclaim,clm_freq,clm_amt,claim_flag,urbanicity
0,60.0,0,0,M,PhD,Private,14230.0,11,Minivan,1,4461.0,2,0.0,0,Urban
1,43.0,0,0,M,High School,Commercial,14940.0,1,Minivan,1,0.0,0,0.0,0,Urban
2,48.0,0,0,M,Bachelors,Private,21970.0,1,Van,1,0.0,0,0.0,0,Urban


In [5]:
processed_df = df.copy()
processed_df[['gender','education','car_use','car_type','urbanicity']] = processed_df[['gender','education','car_use','car_type','urbanicity']].apply(LabelEncoder().fit_transform)

In [6]:
train,test = train_test_split(processed_df, test_size=0.33, random_state=42, stratify=processed_df['claim_flag'])

In [7]:
features = ['age','homekids','mstatus','gender','education','car_use','bluebook','tif','car_type','red_car','clm_amt','urbanicity']
target = 'clm_amt'

## Default Tweedie Model

In [8]:
model = xgb.XGBRegressor(objective='reg:tweedie')
model.fit(train[features],train[target])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=32, num_parallel_tree=1,
             objective='reg:tweedie', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=None, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [9]:
test_preds = model.predict(test[features])



In [10]:
print('RMSE:', metrics.mean_squared_error(test[target], test_preds, squared=False))

RMSE: 24.143000250200103


## Grid Search

In [11]:
param_grid = {
    'learning_rate'    : [0.05, 0.10 ],
    'max_depth'        : list(range(1,13,2)),
    'min_child_weight' : [ 3, 5, 7 ],
    'gamma'            : [ 0.1, 0.3],
    'colsample_bytree' : [ 0.3, 0.5 , 0.7 ],
    'n_estimators' : list(range(50,120,15)),
    'objective': ['reg:tweedie'],
    'tweedie_variance_power': [1.5],
    'eval_metric': ['tweedie-nloglik@1.5']
}
param_grid

{'learning_rate': [0.05, 0.1],
 'max_depth': [1, 3, 5, 7, 9, 11],
 'min_child_weight': [3, 5, 7],
 'gamma': [0.1, 0.3],
 'colsample_bytree': [0.3, 0.5, 0.7],
 'n_estimators': [50, 65, 80, 95, 110],
 'objective': ['reg:tweedie'],
 'tweedie_variance_power': [1.5],
 'eval_metric': ['tweedie-nloglik@1.5']}

In [12]:
grid_model = xgb.XGBRegressor()
grid = GridSearchCV(
    estimator=grid_model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=3,
    verbose=10
)

In [13]:
grid.fit(train[features],train[target])

Fitting 3 folds for each of 1080 candidates, totalling 3240 fits


GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.5, 0.7],
                         'eval_metric': ['tweedie-nloglik@1.5'],
        

In [16]:
grid.best_params_

{'colsample_bytree': 0.7,
 'eval_metric': 'tweedie-nloglik@1.5',
 'gamma': 0.3,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 7,
 'n_estimators': 110,
 'objective': 'reg:tweedie',
 'tweedie_variance_power': 1.5}

In [18]:
best_estimator_preds = grid.best_estimator_.predict(test[features])



In [19]:
print('RMSE:', metrics.mean_squared_error(test[target], best_estimator_preds, squared=False))

RMSE: 232.32622429061522
