# Model deployment 

In [7]:
# !pip install catboost

In [8]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from catboost import CatBoostClassifier, Pool



import warnings
warnings.filterwarnings('ignore')

from utils import *

In [9]:
# Load the training table from a path relative to the notebook's working directory.
# NOTE(review): the file name suggests the engineered features are already present — confirm.
df = pd.read_csv("data/train_new_feats.csv")


In [10]:
# One-vs-rest indicator columns: target_1 ... target_8.
binary_target = [f"target_{i}" for i in range(1, 9)]

# The remaining target-related columns (raw label, its encoded form and the
# two other outcome columns).
original_target = [
    "Claim Injury Type",
    "WCB Decision",
    "Agreement Reached",
    "Claim Injury Type_encoded",
]

# Every column that must be kept out of the model inputs.
target = binary_target + original_target

# Label column used when fitting the classifier below.
ordinal_target = ["Claim Injury Type_encoded"]

# Candidate inputs: everything that is neither a target nor a datetime column.
features = [
    col
    for col in df.columns
    if col not in target and df[col].dtype != "datetime64[ns]"
]

# Split the candidates by dtype; object columns are treated as categorical.
num_feats = [col for col in features if df[col].dtype != "object"]
cat_feats = [col for col in features if df[col].dtype == "object"]

# Positions of the categorical columns inside `features`.
cat_feats_index = [pos for pos, col in enumerate(features) if col in cat_feats]

In [11]:



# Final model inputs, in the exact column order the classifier is fitted on.
selected_features = [
    'Age at Injury',
    'Attorney/Representative',
    'IME-4 Count',
    'Accident Date_year',
    'Accident Date_assembly_gap_days',
    'C3-C2_gap_days',
    'C2_missing',
    'C3_missing',
    'C3_Accident_gap_weeks',
    'Hearing_C3 gap_months',
    'Hearing_C2 gap_months',
    'Days to Assembly',
    'Days to First Hearing',
    'Average Weekly Wage_log',
    'Carrier Name_encoded',
    'County of Injury_encoded',
    'Industry Code Description_encoded',
    'WCIO Cause of Injury Description_encoded',
    'WCIO Nature of Injury Description_encoded',
    'WCIO Part Of Body Description_encoded',
    'Zip Code_encoded',
    'County of Worker_encoded',
    'Carrier Name_freq',
    'County of Injury_freq',
    'District Name_freq',
    'Industry Code Description_freq',
    'WCIO Cause of Injury Description_freq',
    'WCIO Nature of Injury Description_freq',
    'WCIO Part Of Body Description_freq',
    'Zip Code_freq',
    'County of Worker_freq'
 ]


# Raw columns the selected features are derived from: drop the "_encoded" and
# "_freq" markers, then de-duplicate.
# dict.fromkeys keeps first-appearance order, so the list is identical on every
# kernel run; list(set(...)) would reorder it whenever string hashing changes.
naive_features = list(
    dict.fromkeys(
        feat.replace("_encoded", "").replace("_freq", "")
        for feat in selected_features
    )
)

# Narrow the categorical list to the raw columns behind the selected features,
# ordered as they appear in naive_features.
cat_feats = [raw_col for raw_col in naive_features if raw_col in cat_feats]


In [None]:

# Raw model inputs and the label column(s) taken from the training table.
X, y = df[naive_features], df[ordinal_target]

# Two independent copies of the inputs; the encoding cell uses one as the
# "train" side and the other as the "validation" side.
X_encoded, X_encoded_ = X.copy(), X.copy()



In [13]:

# Both sides start from copies of the same training inputs, so the encoders
# and the imputer are fitted on the full training set.
print("Ordinal encoding...")
X_train_encoded, X_val_encoded = X_encoded.copy(), X_encoded_.copy()
for cat_col in cat_feats:
    # NOTE(review): the meaning of the trailing 0 is defined in utils — confirm.
    X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(
        X_train_encoded, X_val_encoded, cat_col, ordinal_target, y, 0
    )

print("Frequency encoding...")
for cat_col in cat_feats:
    X_train_encoded, X_val_encoded, freq_map = frequency_encoding(
        X_train_encoded, X_val_encoded, cat_col
    )

# Keep only the model inputs, in the order given by selected_features.
X_train_encoded, X_val_encoded = (
    X_train_encoded[selected_features],
    X_val_encoded[selected_features],
)

print("Numerical imputing...")
X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)

Ordinal encoding...
Frequency encoding...
Numerical imputing...


In [14]:
# Hyper-parameters of the final model, gathered in one place.
catboost_params = dict(
    iterations=1000,
    depth=6,
    boosting_type="Ordered",
    loss_function="MultiClassOneVsAll",
    auto_class_weights="SqrtBalanced",
    random_state=42,
    verbose=100,  # log the training loss every 100 iterations
)
clf = CatBoostClassifier(**catboost_params)

# Fit on the fully preprocessed training matrix.
clf.fit(X_train_imputed, y)

0:	learn: 0.6714078	total: 663ms	remaining: 11m 2s
100:	learn: 0.2068660	total: 42s	remaining: 6m 13s
200:	learn: 0.1829458	total: 1m 26s	remaining: 5m 43s
300:	learn: 0.1762721	total: 2m 9s	remaining: 5m 1s
400:	learn: 0.1726725	total: 2m 52s	remaining: 4m 17s
500:	learn: 0.1702044	total: 3m 35s	remaining: 3m 34s
600:	learn: 0.1682870	total: 4m 18s	remaining: 2m 51s
700:	learn: 0.1667047	total: 5m 3s	remaining: 2m 9s
800:	learn: 0.1655144	total: 5m 47s	remaining: 1m 26s
900:	learn: 0.1644372	total: 6m 30s	remaining: 42.9s
999:	learn: 0.1635393	total: 7m 14s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2b7a345aa50>

In [35]:
# Load the raw test set (path relative to the notebook's working directory)
# and preview its first ten rows.
test = pd.read_csv('data/test_data.csv')
test.head(10)

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
0,2022-12-24,19,N,2023-01-02,N,,2003.0,2023-01-02,,INDEMNITY INSURANCE CO OF,...,IV,,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,54.0,LOWER LEG,10466,1
1,2022-11-20,19,N,2023-01-02,N,,2003.0,2023-01-02,,A I U INSURANCE COMPANY,...,IV,,75.0,FALLING OR FLYING OBJECT,10.0,CONTUSION,10.0,MULTIPLE HEAD INJURY,11691,1
2,2022-12-26,59,N,2023-01-02,N,0.0,1963.0,2022-12-31,,AMGUARD INSURANCE COMPANY,...,III,,68.0,STATIONARY OBJECT,49.0,SPRAIN OR TEAR,62.0,BUTTOCKS,10604,0
3,2022-12-28,55,N,2023-01-02,N,0.0,0.0,2023-01-02,,INDEMNITY INS. OF N AMERICA,...,IV,,25.0,FROM DIFFERENT LEVEL (ELEVATION),10.0,CONTUSION,53.0,KNEE,11411,6
4,2022-12-20,25,N,2023-01-02,N,0.0,1997.0,2022-12-31,,NEW HAMPSHIRE INSURANCE CO,...,IV,,79.0,OBJECT BEING LIFTED OR HANDLED,40.0,LACERATION,37.0,THUMB,11212,5
5,2022-12-28,36,N,2023-01-02,N,0.0,1986.0,2023-01-02,,NYC TRANSIT AUTHORITY,...,III,,90.0,OTHER THAN PHYSICAL CAUSE OF INJURY,77.0,MENTAL STRESS,66.0,NO PHYSICAL INJURY,10941,4
6,2022-12-22,19,N,2023-01-02,N,688.2,2003.0,2022-12-30,,"WAL-MART ASSOCIATES, INC.",...,I,,56.0,LIFTING,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14131,6
7,2022-12-13,43,N,2023-01-02,N,0.0,0.0,2023-01-02,,ERIE INSURANCE CO OF NY,...,I,,27.0,FROM LIQUID OR GREASE SPILLS,49.0,SPRAIN OR TEAR,53.0,KNEE,13357,4
8,2022-12-28,40,N,2023-01-02,N,0.0,1982.0,2022-12-31,,STARR INDEMNITY & LIABILITY CO,...,IV,,87.0,FOREIGN MATTER (BODY) IN EYE(S),25.0,FOREIGN BODY,14.0,EYE(S),11735,3
9,2022-11-01,48,N,2023-01-02,Y,1180.74,1974.0,2023-01-02,2023-01-09,STATE INSURANCE FUND,...,I,,25.0,FROM DIFFERENT LEVEL (ELEVATION),49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14720,0


In [36]:
# (rows, columns) of the raw test set.
test.shape

(387975, 30)

In [37]:
def test_preprocessing(train, test, cat_feats, selected_features, y, target_col=None):
    """Turn the raw test set into the feature matrix the fitted model expects.

    Steps, in order: fill and normalise raw columns, derive the date-gap
    features, attach the worker's county from ``data/geo-data.csv``, cap
    outliers, encode the categorical columns against ``train``, keep
    ``selected_features`` and impute the remaining missing values.

    Parameters
    ----------
    train : pandas.DataFrame
        Training table. Supplies fill values and known categories, and is the
        fit side of the encoders and the imputer. A copy is used; the caller's
        frame is not modified.
    test : pandas.DataFrame
        Raw test table. A copy is processed; the caller's frame is not
        modified, so the function can be called repeatedly on the same input.
    cat_feats : list of str
        Categorical columns to ordinal- and frequency-encode.
    selected_features : list of str
        Columns kept, in this order, before imputation.
    y : pandas.DataFrame
        Training labels passed to ``target_guided_ordinal_encoding``.
    target_col : list of str, optional
        Target column name(s) passed to ``target_guided_ordinal_encoding``.
        Defaults to the notebook-level ``ordinal_target``.

    Returns
    -------
    The test-side result of ``num_imputing``.
    """
    if target_col is None:
        # Backward-compatible default: the notebook-level label column list.
        target_col = ordinal_target

    # Work on a private copy. The steps below overwrite columns, and mapping
    # "Y"/"N" a second time on already-mapped booleans would yield only NaN.
    test = test.copy()

    def _month_gap(later, earlier):
        # Whole calendar months between two datetime Series (NaT propagates).
        return (later.dt.year - earlier.dt.year) * 12 + (later.dt.month - earlier.dt.month)

    # ------------- Missing values / data types

    test["IME-4 Count"] = test["IME-4 Count"].fillna(0)

    # NOTE(review): .min() on this text column picks the alphabetically
    # smallest training category — confirm this matches the training-time fill.
    test["Industry Code Description"] = test["Industry Code Description"].fillna(
        train["Industry Code Description"].min()
    )

    for wcio_col in (
        "WCIO Cause of Injury Description",
        "WCIO Nature of Injury Description",
        "WCIO Part Of Body Description",
    ):
        test[wcio_col] = test[wcio_col].fillna("Missing")

    # After astype(str) a missing value is the text "nan"; it fails the
    # isnumeric check like any other non-numeric code and becomes "missing".
    zip_code = test["Zip Code"].astype(str)
    test["Zip Code"] = zip_code.where(zip_code.str.isnumeric(), "missing")

    # "U" is folded into "Y" before the flag is turned into a boolean.
    test["Alternative Dispute Resolution"] = np.where(test["Alternative Dispute Resolution"] == "U", "Y", test["Alternative Dispute Resolution"])
    test["Alternative Dispute Resolution"] = test["Alternative Dispute Resolution"].map({"Y": True, "N": False})

    test["Attorney/Representative"] = test["Attorney/Representative"].map({"Y": True, "N": False})

    # --------------- Feature engineering

    date_cols = [col for col in test.columns if "Date" in col]
    for col in date_cols:
        test[col] = pd.to_datetime(test[col], format="%Y-%m-%d")

    test["Accident Date_year"] = test["Accident Date"].dt.year

    test["Accident Date_assembly_gap_days"] = (test["Assembly Date"] - test["Accident Date"]).dt.days

    test["C3_missing"] = test["C-3 Date"].isna()

    test["C2_missing"] = test["C-2 Date"].isna()

    test["C3-C2_gap_days"] = (test["C-3 Date"] - test["C-2 Date"]).dt.days

    test["C3_Accident_gap_weeks"] = (test["C-3 Date"] - test["Accident Date"]).dt.days / 7

    test["Hearing_C3 gap_months"] = _month_gap(test["First Hearing Date"], test["C-3 Date"])

    test["Hearing_C2 gap_months"] = _month_gap(test["First Hearing Date"], test["C-2 Date"])

    # NOTE(review): the two reference dates differ (2020-01-01 vs 2020-01-30);
    # presumably they mirror the training feature definitions — confirm.
    test["Days to Assembly"] = (test["Assembly Date"] - pd.Timestamp("2020-01-01")).dt.days

    test["Days to First Hearing"] = (test["First Hearing Date"] - pd.Timestamp("2020-01-30")).dt.days

    test["Average Weekly Wage_log"] = np.log1p(test["Average Weekly Wage"])

    # One row per zip code on the right side, so the left join keeps the test
    # row count and order.
    geo_df = pd.read_csv("data/geo-data.csv")
    geo_df_unique = geo_df.drop_duplicates(subset='zipcode')
    test = test.merge(geo_df_unique[["zipcode", "county"]], how="left", left_on="Zip Code", right_on="zipcode")
    test = test.rename(columns={"county": "County of Worker"})
    test["County of Worker"] = test["County of Worker"].str.upper()

    # --------------- Outliers

    # A birth year of 0 is rebuilt from the accident year and the age.
    test["Birth Year"] = np.where(test["Birth Year"] == 0, test["Accident Date"].dt.year - test["Age at Injury"], test["Birth Year"])
    test["Birth Year"] = pd.to_datetime(test["Birth Year"], format="%Y")

    # Cap the gap features; missing values stay missing.
    test["Accident Date_assembly_gap_days"] = test["Accident Date_assembly_gap_days"].clip(upper=30)
    test["C3-C2_gap_days"] = test["C3-C2_gap_days"].clip(lower=-60, upper=60)
    test["C3_Accident_gap_weeks"] = test["C3_Accident_gap_weeks"].clip(lower=-4, upper=24)
    test["Hearing_C3 gap_months"] = test["Hearing_C3 gap_months"].clip(lower=-20, upper=50)
    test["Hearing_C2 gap_months"] = test["Hearing_C2 gap_months"].clip(lower=-20, upper=50)

    # Age and birth year repair each other: an age of 0 is rebuilt from the
    # birth year, a missing birth year from the age, then age is retried.
    test["Age at Injury"] = np.where(test["Age at Injury"] == 0, test["Accident Date"].dt.year - test["Birth Year"].dt.year, test["Age at Injury"])
    test["Birth Year"] = np.where(test["Birth Year"].isna() & ~(test["Accident Date"].isna()), pd.to_datetime(test["Accident Date"].dt.year - test["Age at Injury"], format="%Y"), test["Birth Year"])
    test["Age at Injury"] = np.where(test["Age at Injury"] == 0, test["Accident Date"].dt.year - test["Birth Year"].dt.year, test["Age at Injury"])

    # Ages below 14 are set to missing (imputed later); ages above 90 are capped.
    test["Age at Injury"] = np.where(test["Age at Injury"] < 14, np.nan, test["Age at Injury"])
    test["Age at Injury"] = test["Age at Injury"].clip(upper=90)

    test["IME-4 Count"] = test["IME-4 Count"].clip(upper=12)

    # Categories never seen in training are mapped to a fixed fallback label.
    train_categories = train["Carrier Name"].value_counts().index
    test["Carrier Name"] = np.where(~test["Carrier Name"].isin(train_categories), "Other", test["Carrier Name"])

    train_categories = train["Carrier Type"].value_counts().index
    test["Carrier Type"] = np.where(~test["Carrier Type"].isin(train_categories), "5D. SPECIAL FUND - UNKNOWN", test["Carrier Type"])

    # --------------- Categorical encoding
    print("Ordinal encoding...")
    X_train_encoded = train.copy()
    X_val_encoded = test.copy()
    for cat in cat_feats:
        # NOTE(review): the meaning of the trailing 0 is defined in utils — confirm.
        X_train_encoded, X_val_encoded, _ = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, target_col, y, 0)

    # --------------- Frequency encoding
    print("Frequency encoding...")
    for cat in cat_feats:
        X_train_encoded, X_val_encoded, _ = frequency_encoding(X_train_encoded, X_val_encoded, cat)

    # --------------- Selecting features
    print("Selecting features...")
    X_train_encoded = X_train_encoded[selected_features]
    X_val_encoded = X_val_encoded[selected_features]

    # --------------- Missing values imputation
    print("Imputing missing values...")
    X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)

    return X_val_imputed


In [38]:
# Run the test-set pipeline; the training table and labels are the fit side.
test_processed = test_preprocessing(
    train=df,
    test=test,
    cat_feats=cat_feats,
    selected_features=selected_features,
    y=y,
)

# Wrap the result in a frame with the model's column order.
test_processed_df = pd.DataFrame(data=test_processed, columns=selected_features)



Ordinal encoding...
Frequency encoding...
Selecting features...
Imputing missing values...


In [39]:
# Inspect the processed test matrix returned by test_preprocessing.
test_processed

Unnamed: 0,Age at Injury,Attorney/Representative,IME-4 Count,Accident Date_year,Accident Date_assembly_gap_days,C3-C2_gap_days,C2_missing,C3_missing,C3_Accident_gap_weeks,Hearing_C3 gap_months,...,County of Worker_encoded,Carrier Name_freq,County of Injury_freq,District Name_freq,Industry Code Description_freq,WCIO Cause of Injury Description_freq,WCIO Nature of Injury Description_freq,WCIO Part Of Body Description_freq,Zip Code_freq,County of Worker_freq
0,19.0,False,0.0,2022.0,9.0,1.0,False,True,1.000000,1.0,...,350.0,0.015897,0.069015,0.471740,0.094110,0.055279,0.192878,0.020246,0.003984,0.060785
1,19.0,False,0.0,2022.0,30.0,1.0,False,True,1.000000,1.0,...,367.0,0.019228,0.101618,0.471740,0.107380,0.031837,0.192878,0.013831,0.002451,0.088185
2,59.0,False,0.0,2022.0,7.0,1.0,False,True,1.000000,1.0,...,299.0,0.000355,0.044863,0.471740,0.036633,0.018955,0.097188,0.002022,0.000498,0.037470
3,55.0,False,0.0,2022.0,5.0,1.0,False,True,1.000000,1.0,...,367.0,0.025026,0.101618,0.471740,0.094110,0.020855,0.192878,0.083452,0.001352,0.088185
4,25.0,False,0.0,2022.0,13.0,1.0,False,True,1.000000,1.0,...,365.0,0.022186,0.092690,0.471740,0.000645,0.044060,0.082151,0.021819,0.004362,0.077359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387970,52.0,False,0.0,2012.0,30.0,1.0,False,True,1.000000,1.0,...,312.0,0.193623,0.005641,0.077773,0.046090,0.027248,0.027276,0.029759,0.000437,0.004961
387971,59.0,True,0.0,2024.0,14.0,1.0,True,False,0.857143,1.0,...,364.0,0.014326,0.105269,0.105454,0.046090,0.027248,0.027276,0.029759,0.001925,0.095293
387972,45.0,True,0.0,2024.0,30.0,1.0,True,True,1.000000,1.0,...,367.0,0.193623,0.101618,0.471740,0.046090,0.027248,0.027276,0.029759,0.004447,0.088185
387973,42.0,True,0.0,2024.0,30.0,1.0,True,False,12.428571,1.0,...,367.0,0.012755,0.101618,0.471740,0.046090,0.027248,0.027276,0.029759,0.002108,0.088185


In [40]:
# Check that the columns match selected_features, in order.
test_processed_df.columns


Index(['Age at Injury', 'Attorney/Representative', 'IME-4 Count',
       'Accident Date_year', 'Accident Date_assembly_gap_days',
       'C3-C2_gap_days', 'C2_missing', 'C3_missing', 'C3_Accident_gap_weeks',
       'Hearing_C3 gap_months', 'Hearing_C2 gap_months', 'Days to Assembly',
       'Days to First Hearing', 'Average Weekly Wage_log',
       'Carrier Name_encoded', 'County of Injury_encoded',
       'Industry Code Description_encoded',
       'WCIO Cause of Injury Description_encoded',
       'WCIO Nature of Injury Description_encoded',
       'WCIO Part Of Body Description_encoded', 'Zip Code_encoded',
       'County of Worker_encoded', 'Carrier Name_freq',
       'County of Injury_freq', 'District Name_freq',
       'Industry Code Description_freq',
       'WCIO Cause of Injury Description_freq',
       'WCIO Nature of Injury Description_freq',
       'WCIO Part Of Body Description_freq', 'Zip Code_freq',
       'County of Worker_freq'],
      dtype='object')

In [41]:
# Missing-value count per column after imputation.
test_processed_df.isna().sum()

Age at Injury                                0
Attorney/Representative                      0
IME-4 Count                                  0
Accident Date_year                           0
Accident Date_assembly_gap_days              0
C3-C2_gap_days                               0
C2_missing                                   0
C3_missing                                   0
C3_Accident_gap_weeks                        0
Hearing_C3 gap_months                        0
Hearing_C2 gap_months                        0
Days to Assembly                             0
Days to First Hearing                        0
Average Weekly Wage_log                      0
Carrier Name_encoded                         0
County of Injury_encoded                     0
Industry Code Description_encoded            0
WCIO Cause of Injury Description_encoded     0
WCIO Nature of Injury Description_encoded    0
WCIO Part Of Body Description_encoded        0
Zip Code_encoded                             0
County of Wor

In [42]:
# Predict a class for every row of the processed test matrix.
test_preds = clf.predict(test_processed_df)


In [43]:
# Label strings in ordinal order; the encoder maps position i to code i.
injury_type_order = [
    "1. CANCELLED",
    "2. NON-COMP",
    "3. MED ONLY",
    "4. TEMPORARY",
    "5. PPD SCH LOSS",
    "6. PPD NSL",
    "7. PTD",
    "8. DEATH",
]
ordinalencoder = OrdinalEncoder(categories=[injury_type_order])

# Fit on the raw training labels so inverse_transform can be used on predictions.
ordinalencoder.fit(df[["Claim Injury Type"]])


In [44]:
# Map the predicted codes back to the original label strings.
# NOTE(review): assumes the codes in "Claim Injury Type_encoded" follow the same
# category order as this encoder — confirm against the step that created that column.
model_adjusted_preds = ordinalencoder.inverse_transform(test_preds.reshape(-1, 1))


In [45]:
# Submission frame: one label per row, indexed by the raw test set's "Claim Identifier".
# Relies on the predictions being in the same row order as `test`.
sub_adjuted = pd.DataFrame(model_adjusted_preds, columns=["Claim Injury Type"], index=test["Claim Identifier"])

In [46]:
# Write the submission to the notebook's working directory; the index is written as the first column.
sub_adjuted.to_csv("submission.csv")