In [5]:
# !pip install pandas
# !pip install scikit-learn
# !pip install imbalanced-learn
# !pip install openpyxl --upgrade
# !pip install matplotlib
# !pip install missingno
# !pip install -q xlrd
# !pip install xgboost

In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score
import xgboost as xgb

In [85]:
# Load the dataset
data = pd.read_excel('Dataset_Public.xlsx')

In [86]:
data.shape

(191690, 3)

In [87]:
data.info

<bound method DataFrame.info of                                         Claim Description Coverage Code  \
0       THE IV WAS MAKING A LEFT TURN ON A GREEN ARROW...            AN   
1       CLAIMANT ALLEGES SHE SUFFERED INJURIES IN AN E...            GB   
2       IV PASSENGER SUSTAINED INJURIES, OV AND IV COL...            AB   
3       CLAIMANT ALLEGES SHE WAS BURNED UNKNOWN DEGREE...            PA   
4       THE IV WAS MERGING INTO A CONSTRUCTION ZONE WH...            AD   
...                                                   ...           ...   
191685            SOME WOOD FELL OVER AND HIT GUESTS LEGS            GB   
191686  FEMALE FELL DOWN ON PLATFORM.                 ...            AB   
191687  IWAS DRIVING DOWNHILL IN CUSTOMER YARD ON WET ...            AD   
191688  DONNA GEORGE WAS WALKING UP TO REGISTER 2 AND ...            PM   
191689  UNIT 3 WAS TRAVELING EASTBOUND ON SH 97 BEHIND...            AB   

                   Accident Source  
0       Struck pedestrian, bic

In [88]:
# Data Cleaning for "Claim Description" column
def clean_text(text):
    # Remove special characters, numbers, and extra whitespaces
    text = ' '.join(word for word in text.split() if word.isalpha())
    # Convert text to lowercase
    text = text.lower()
    return text

In [89]:
# show nan values 
data.isna().sum()

Claim Description    227
Coverage Code          0
Accident Source        0
dtype: int64

In [90]:
# Handling Missing Values (if any)
data.dropna(subset=['Claim Description'], inplace=True)

In [91]:
# show nan values  after drop nan values
data.isna().sum()

Claim Description    0
Coverage Code        0
Accident Source      0
dtype: int64

In [92]:
# clean text in Claim Description
data['Claim Description'] = data['Claim Description'].apply(clean_text)

In [94]:
# Encode Categorical Target Variables
label_encoder_coverage = LabelEncoder()
label_encoder_accident = LabelEncoder()

data['Coverage Code'] = label_encoder_coverage.fit_transform(data['Coverage Code'])
data['Accident Source'] = label_encoder_accident.fit_transform(data['Accident Source'])

In [95]:
# Split the dataset into features (X) and target variables (y)
X = data['Claim Description']
y_coverage = data['Coverage Code']
y_accident_source = data['Accident Source']

# Feature Engineering: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=50,min_df=1,stop_words='english')
X_train_tfidf_ = tfidf_vectorizer.fit_transform(X)

In [96]:
# Split the data into training and testing sets for "Coverage Code"
X_train_coverage, X_test_coverage, y_train_coverage, y_test_coverage = train_test_split(X_train_tfidf_, y_coverage, test_size=0.2, random_state=42)

# Split the data into training and testing sets for "Accident Source"
X_train_accident_source, X_test_accident_source, y_train_accident_source, y_test_accident_source = train_test_split(X_train_tfidf_, y_accident_source, test_size=0.2, random_state=42)



In [55]:
# Assuming X_train and y_train are your training data
# ros = RandomOverSampler(random_state=42)
# X_train_coverage, y_train_coverage = ros.fit_resample(X_train_coverage, y_train_coverage)
# X_train_accident_source, y_train_accident_source = ros.fit_resample(X_train_accident_source, y_train_accident_source)

# Check the class distribution after oversampling
# print("Class distribution after oversampling: ", Counter(y_train_as))

In [101]:
# Create and train XGBoost models for "Coverage Code" and "Accident Source"
# from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters to search
# param_grid_coverage = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2]
# }

# Create an XGBoost classifier
# xgb_model_coverage = xgb.XGBClassifier()

# Perform Grid Search to find the best hyperparameters
# grid_search_coverage = GridSearchCV(xgb_model_coverage, param_grid=param_grid_coverage, scoring='accuracy', cv=3, n_jobs=-1)
# grid_search_coverage.fit(X_train_tfidf_coverage, y_train_coverage)

# Get the best hyperparameters
# best_params_coverage = grid_search_coverage.best_params_

# Train the model with the best hyperparameters
best_xgb_model_coverage = xgb.XGBClassifier(n_estimators=100,max_depth=5,learning_rate=0.1)
best_xgb_model_coverage.fit(X_train_coverage, y_train_coverage)

# Make predictions and calculate precision and recall
y_pred_coverage = best_xgb_model_coverage.predict(X_test_coverage)
precision_coverage = precision_score(y_test_coverage, y_pred_coverage, average='weighted')
recall_coverage = recall_score(y_test_coverage, y_pred_coverage, average='weighted')



  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
# Create an XGBoost classifier
# param_grid_accident_source = xgb.XGBClassifier()

# grid_search_accident_source = GridSearchCV(xgb_model_accident_source, param_grid=param_grid_accident_source, scoring='accuracy', cv=3, n_jobs=-1)
# grid_search_accident_source.fit(X_train_tfidf_accident_source, y_train_accident_source)

# best_params_accident_source = grid_search_accident_source.best_params_

best_xgb_model_accident_source = xgb.XGBClassifier(n_estimators=100,max_depth=5,learning_rate=0.1)
best_xgb_model_accident_source.fit(X_train_accident_source, y_train_accident_source)

y_pred_accident_source = best_xgb_model_accident_source.predict(X_test_accident_source)
precision_accident_source = precision_score(y_test_accident_source, y_pred_accident_source, average='weighted')
recall_accident_source = recall_score(y_test_accident_source, y_pred_accident_source, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [105]:
# Print the results
print("Coverage Code Precision:", round(precision_coverage * 100, 2) )
print("Coverage Code Recall:", round(recall_coverage * 100, 2) )

print("Accident Source Precision:", round(precision_accident_source * 100, 2) )
print("Accident Source Recall:", round(recall_accident_source * 100, 2) )

Coverage Code Precision: 52.63
Coverage Code Recall: 54.13
Accident Source Precision: 31.23
Accident Source Recall: 31.79


In [60]:
# import joblib
import pickle

# Save the multi-output classifier model to a file
pickle.dump(best_xgb_model_coverage, open("xgbModel_cc.pkl", "wb"))
# joblib.dump(best_xgb_model_coverage, 'xgbModel_cc.pkl', "wb")
pickle.dump(best_xgb_model_accident_source, open("xgbModel_as.pkl", "wb"))