## Importing datasets and libraries 

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.metrics import f1_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [None]:
# Read the competition files plus the additional horse.csv dataset.
train_data = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s3e22/sample_submission.csv')
org_data = pd.read_csv('/kaggle/input/horse-survival-dataset/horse.csv')
print(train_data.shape)
print(test_data.shape)

# Stack the additional rows under the competition training rows and
# discard rows that are exact duplicates.
train_data = pd.concat([train_data, org_data]).drop_duplicates()
print(train_data.shape)

In [None]:
# Count of non-missing 'outcome' values per 'pain' category, largest first.
(
    train_data[['pain', 'outcome']]
    .groupby('pain', as_index=False)
    .count()
    .sort_values(by='outcome', ascending=False)
)

In [None]:
# Count of non-missing 'outcome' values per 'abdomen' category, largest first.
(
    train_data[['abdomen', 'outcome']]
    .groupby('abdomen', as_index=False)
    .count()
    .sort_values(by='outcome', ascending=False)
)

In [None]:
# Count of non-missing 'outcome' values per 'peristalsis' category, largest first.
(
    train_data[['peristalsis', 'outcome']]
    .groupby('peristalsis', as_index=False)
    .count()
    .sort_values(by='outcome', ascending=False)
)

# Preprocessing 


In [None]:
def ordinalEncoding(df,feature):
    """Replace each listed column with frequency-ranked ordinal codes.

    For every column name in ``feature`` the distinct values are ordered from
    least to most frequent; that order is handed to ``OrdinalEncoder`` so the
    rarest value is encoded as 0 and the most common value gets the highest
    code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    feature : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in feature:
        # value_counts() lists the most frequent value first; flip it so the
        # rarest value comes first and therefore receives code 0.
        rare_to_common = df[column].value_counts().index.tolist()[::-1]
        encoder = OrdinalEncoder(categories=[rare_to_common])
        df[[column]] = encoder.fit_transform(df[[column]])
    return df

def process_lesion1(x):
    """Extract the leading component of a numeric lesion code.

    The value is truncated to an int and inspected as a decimal string:

    * five characters long -> the constant ``11``;
    * any other non-zero code -> its first two characters as a float;
    * zero -> ``None``.

    NOTE(review): for four-digit codes this returns the first *two* digits,
    while ``process_lesion2`` reads the second digit on its own - confirm the
    overlap is intended.
    """
    code = int(x)
    digits = str(code)
    if len(digits) == 5:
        return 11
    if code == 0:
        return None
    return float(digits[:2])

def process_lesion2(x):
    """Extract the second component of a numeric lesion code.

    The value is truncated to an int and inspected as a decimal string:

    * five characters long -> the third character as a float (the first two
      characters are treated as one component);
    * any other non-zero code with at least two characters -> the second
      character as a float;
    * zero, or a code too short to have a second character -> ``None``.

    Returning ``None`` for one-character codes (instead of raising
    ``IndexError``) matches how the sibling parsers report "no value".
    """
    code = int(x)
    digits = str(code)
    if len(digits) == 5:
        return float(digits[2])
    if code != 0 and len(digits) >= 2:
        return float(digits[1])
    return None

def process_lesion3(x):
    """Extract the third component of a numeric lesion code.

    The value is truncated to an int and inspected as a decimal string.
    Five-character codes yield their fourth character, four-character codes
    their third character (both as floats). Any other length, including
    zero, yields ``None``.
    """
    digits = str(int(x))
    # Character position to read, keyed by the length of the code.
    position_by_length = {5: 3, 4: 2}
    position = position_by_length.get(len(digits))
    if position is None:
        return None
    return float(digits[position])

def process_lesion4(x):
    """Extract the last component of a numeric lesion code.

    The value is truncated to an int and inspected as a decimal string.
    Five-character codes yield their fifth character, four-character codes
    their fourth character (both as floats). Any other length, including
    zero, yields ``None``.
    """
    digits = str(int(x))
    length = len(digits)
    if length == 5:
        index = 4
    elif length == 4:
        index = 3
    else:
        return None
    return float(digits[index])

def generateXY(all_data, random_state=24):
    """Build the resampled training matrix, its labels and the test matrix.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Train and test rows stacked together. Must contain an ``id`` column
        (dropped), a ``type`` column (0 = train row, 1 = test row), an
        ``outcome`` column, ``lesion_1`` and the categorical columns recoded
        below. The frame is copied, so the caller's object is not modified.
    random_state : int or None, default 24
        Seed handed to SMOTE so that repeated calls return the same
        resampled data. Pass ``None`` for unseeded resampling.

    Returns
    -------
    x : training features after SMOTE oversampling
    y : outcome codes matching ``x``
    testx : test features (not resampled)

    Notes
    -----
    * Imputers and encoders are fitted on train and test rows together.
    * SMOTE is applied to the whole training set here, before any
      cross-validation split made by the caller.
    """
    x = all_data.copy()
    x = x.drop(columns="id")

    # Column groups are decided from the dtypes on entry, i.e. BEFORE the
    # manual recoding below changes some object columns to numbers.
    object_mask = (x.dtypes == 'object')
    categorical_features = list(object_mask[object_mask].index)
    num_features = list(object_mask[~object_mask].index)

    # Fold rare categories into a neighbouring one (per the original notes,
    # 'slight' pain and 'distend_small' peristalsis each occur only once).
    x["pain"] = x["pain"].replace('slight', 'moderate')
    x["peristalsis"] = x["peristalsis"].replace('distend_small', 'normal')
    x["rectal_exam_feces"] = x["rectal_exam_feces"].replace('serosanguious', 'absent')
    x["nasogastric_reflux"] = x["nasogastric_reflux"].replace('slight', 'none')

    # column -> (value used for missing entries, category -> numeric code).
    # Per the original notes the fill value is meant to be the column's most
    # common category. Categories absent from a mapping become NaN.
    manual_codes = {
        "temp_of_extremities": ("normal", {'cold': 0, 'cool': 1, 'normal': 2, 'warm': 3}),
        "peripheral_pulse": ("normal", {'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3}),
        "capillary_refill_time": ("3", {'less_3_sec': 0, '3': 1, 'more_3_sec': 2}),
        "pain": ("depressed", {'alert': 0, 'depressed': 1, 'moderate': 2, 'mild_pain': 3, 'severe_pain': 4, 'extreme_pain': 5}),
        "peristalsis": ("hypomotile", {'hypermotile': 0, 'normal': 1, 'hypomotile': 2, 'absent': 3}),
        "abdominal_distention": ("none", {'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3}),
        "nasogastric_tube": ("none", {'none': 0, 'slight': 1, 'significant': 2}),
        "nasogastric_reflux": ("none", {'less_1_liter': 0, 'none': 1, 'more_1_liter': 2}),
        "rectal_exam_feces": ("absent", {'absent': 0, 'decreased': 1, 'normal': 2, 'increased': 3}),
        "abdomen": ("distend_small", {'normal': 0, 'other': 1, 'firm': 2,'distend_small': 3, 'distend_large': 4}),
        "abdomo_appearance": ("serosanguious", {'clear': 0, 'cloudy': 1, 'serosanguious': 2}),
    }
    for column, (fill_value, codes) in manual_codes.items():
        x[column] = x[column].fillna(fill_value).map(codes)

    # Fill remaining gaps in the originally-object columns with each column's
    # most frequent value, then encode them by ascending frequency.
    # NOTE(review): any column recoded above that was object dtype on entry
    # (as their string categories suggest) is still listed in
    # categorical_features, so its hand-written codes are replaced here by
    # frequency ranks. 'outcome' presumably goes through the same path, which
    # would make its 0/1/2 codes frequency ranks too - confirm before
    # changing, since label decoding elsewhere may depend on it.
    categorical_imputer = SimpleImputer(strategy = 'most_frequent')
    x[categorical_features] = categorical_imputer.fit_transform(x[categorical_features])
    ordinalEncoding(x,categorical_features)

    # Fill gaps in the numeric columns from the 10 nearest neighbours.
    num_imputer = KNNImputer(n_neighbors = 10)
    x[num_features] = num_imputer.fit_transform(x[num_features])

    # Split the packed lesion_1 code into four separate features. The parsers
    # return None when a component is absent; those gaps are filled with the
    # column median. Assignment is used instead of fillna(inplace=True)
    # because in-place fillna on a selected column is deprecated and has no
    # effect under pandas Copy-on-Write.
    lesion_parsers = {
        'lesion_type': process_lesion1,
        'lesion_type2': process_lesion2,
        'lesion_type3': process_lesion3,
        'lesion_type4': process_lesion4,
    }
    for column, parser in lesion_parsers.items():
        parsed = x['lesion_1'].map(parser)
        x[column] = parsed.fillna(parsed.median())

    # Separate the rows again using the 'type' marker (0 = train, 1 = test).
    train = x[x.type==0]
    test = x[x.type==1]

    y = train.outcome

    x = train.drop(columns=['outcome','type'])
    testx = test.drop(columns=['outcome','type'])

    # Oversample minority classes; seeded so repeated calls agree.
    smote = SMOTE(random_state=random_state)
    x,y = smote.fit_resample(x,y)

    return x,y,testx


In [None]:
def _majority_vote(votes):
    """Return the most common value of each row of ``votes`` as a list of ints.

    When several values tie, the smallest one is chosen (modes are returned
    in sorted order and the first is taken).
    """
    return votes.mode(axis=1).iloc[:, 0].astype(int).tolist()


def calculateScore(x,y,xtest,fold):
    """Cross-validate LightGBM and majority-vote the fold models' predictions.

    A stratified, shuffled K-fold split (seed 24) is made over ``x``/``y``.
    For each fold a fresh ``LGBMClassifier(n_estimators=20)`` is fitted on
    the fold's training rows; micro-averaged F1 is printed for the fold's
    training rows, its validation rows and the full ``x``. Every fold model
    also predicts the full ``x`` and ``xtest``; the final predictions are
    the per-row majority vote across folds.

    Parameters
    ----------
    x : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    y : pandas.Series
        Training labels aligned with ``x``.
    xtest : pandas.DataFrame
        Features to predict.
    fold : int
        Number of CV folds.

    Returns
    -------
    train_pred : list of int
        Majority-vote prediction for each row of ``x``.
    final_pred : list of int
        Majority-vote prediction for each row of ``xtest``.
    clf : LGBMClassifier
        The model fitted on the LAST fold only.

    Notes
    -----
    The value printed as "oof" is not a true out-of-fold score: every fold
    model votes on every row of ``x``, including rows it was trained on, so
    it is optimistic compared with the mean validation F1.
    """
    kf = StratifiedKFold(n_splits=fold, random_state=24, shuffle=True)
    score = 0
    # fold number -> predictions; assembled into one frame after the loop
    # instead of growing a DataFrame with concat on every iteration.
    test_votes = {}
    train_votes = {}
    for i, (train_index, val_index) in enumerate(kf.split(x,y)):
        X_train = x.iloc[train_index]
        X_val = x.iloc[val_index]
        y_train = y.iloc[train_index].reset_index(drop=True)
        y_val = y.iloc[val_index].reset_index(drop=True)

        clf = LGBMClassifier(n_estimators=20)
        clf.fit(X_train,y_train)
        predX = clf.predict(X_train)
        predV = clf.predict(X_val)
        predA = clf.predict(x)
        predT = clf.predict(xtest)
        s1 = f1_score(y_train,predX,average="micro")
        s2 = f1_score(y_val,predV,average="micro")
        s3 = f1_score(y,predA,average="micro")

        test_votes[i] = predT
        train_votes[i] = predA
        score += s2
        print(f"{i} has F1 {s1} {s2} {s3}")

    final_pred = _majority_vote(pd.DataFrame(test_votes))
    train_pred = _majority_vote(pd.DataFrame(train_votes))

    s4 = f1_score(y,train_pred,average="micro")
    print(f"Mean F1 = {score/fold}, oof = {s4}")

    return train_pred, final_pred, clf


In [None]:
# Tag each row with its origin (0 = training row, 1 = test row) so the
# combined frame can be split back apart after joint preprocessing.
train_data['type'] = 0
test_data['type'] = 1
all_data = pd.concat([train_data, test_data], axis=0)

all_data.head(10)

In [None]:
# Number of stratified CV folds; one model per fold votes on the test rows.
N_FOLDS = 25

x, y, xtest = generateXY(all_data)
train_pred, test_pred, clf = calculateScore(x, y, xtest, N_FOLDS)


# Feature Importance (Heatmap and Feature Significance Graph)



Certain features here show multicollinearity, so some of them should be removed.

In [None]:
# Pairwise correlations between all training feature columns, drawn on a
# colour scale fixed to [-1, 1] and centred at 0.
features = x.columns
fig, ax = plt.subplots(figsize=[12, 10])
ax.set_title('Features Correlation-Plot')
sns.heatmap(x[features].corr(), vmin=-1, vmax=1, center=0, ax=ax)
plt.show()

In [None]:
# Fit a single model on the full training matrix and rank the features by
# the importances LightGBM reports. x and y from the preprocessing call
# above are reused: calling generateXY again would repeat the imputation and
# SMOTE resampling and replace x, y and xtest for every later cell.
clf = LGBMClassifier(n_estimators=20)
clf.fit(x, y)
train_pred = clf.predict(x)
feature_imp = pd.Series(clf.feature_importances_, index=x.columns).sort_values(ascending=False)

f, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_title('Feature importance')
plt.show()

In [None]:
# Same correlation matrix as a large heatmap with each coefficient annotated.
correlation_matrix = x.corr()
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)


# Final Submission

In [None]:
# test_pred holds integer class codes; translate them back to label strings.
# NOTE(review): the code -> label pairing presumably mirrors the encoding
# applied to 'outcome' during preprocessing - confirm if that changes.
outcome_labels = {0:"euthanized",1:"died",2:"lived"}
submission['outcome'] = test_pred
submission['outcome'] = submission['outcome'].map(outcome_labels)

# A code missing from outcome_labels maps to NaN; stop here rather than
# write a submission file with empty predictions.
if submission['outcome'].isna().any():
    raise ValueError("test_pred contains class codes with no entry in outcome_labels")

submission.to_csv('submission.csv',index=False)
print("Your submission was successfully saved!")
submission.head()

Please leave a comment if this was of some use to you.

## SVM Model

In [None]:
# Hyper-parameter grid and CV splitter for a parameter search.
# NOTE(review): neither svm_param nor cv is used by the fit below; the model
# is trained with SVC's default C/gamma. Wire them into a search or remove.
svm_param = {"C": [.01, .1, 1, 5, 10, 100],             
             "gamma": [.01, .1, 1, 5, 10, 100],
             "kernel": ["rbf"],
             "random_state": [1]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Fit once: fit() returns the estimator itself, so SVM_model and SVM refer to
# the same fitted object. Fitting a second time on identical data only
# repeated the (slow, probability=True) training.
SVM_model = SVC(probability=True)
SVM = SVM_model.fit(x,y)#.best_estimator_
pred = SVM.predict(xtest)

pred