In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mock-test-2-mse-2/sample_submission.csv
/kaggle/input/mock-test-2-mse-2/train.csv
/kaggle/input/mock-test-2-mse-2/test.csv


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
from sklearn.pipeline import Pipeline

# ==================== Load Data ====================
train = pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")
test = pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.csv")

In [12]:
train.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,1481.0,Placebo,12963.0,F,N,Y,N,N,2.4,346.0,3.34,70.0,1212.0,122.45,118.0,117.0,12.2,4.0,C
1,1,2580.0,D-penicillamine,20819.0,F,N,N,N,N,0.6,,4.52,51.0,645.0,74.4,,181.0,10.4,3.0,C
2,2,837.0,D-penicillamine,12307.0,F,N,Y,Y,N,6.1,586.0,2.73,89.0,2045.0,196.85,90.0,228.0,10.4,3.0,D
3,3,3021.0,,17532.0,F,,,,N,0.6,,3.34,,,,,388.0,10.4,3.0,C
4,4,1980.0,,23011.0,F,,,,N,0.8,,3.65,,,,,190.0,11.1,4.0,C


In [13]:
# ==================== Separate Target ====================
y = train["Status"]
X = train.drop(columns=["Status"])

# ==================== Handle Missing Values ====================
for col in X.columns:
    if X[col].dtype == 'object':  # categorical
        X[col] = X[col].fillna(X[col].mode()[0])
        test[col] = test[col].fillna(test[col].mode()[0])
    else:  # numeric
        X[col] = X[col].fillna(X[col].mean())
        test[col] = test[col].fillna(test[col].mean())

In [23]:
# # ==================== Label Encoding (before IsolationForest) ====================
# categorical_cols = X.select_dtypes(include=['object']).columns
# encoder = LabelEncoder()
# for col in categorical_cols:
#     X[col] = encoder.fit_transform(X[col])

# # ==================== Outlier Removal ====================
# iso = IsolationForest(contamination=0.03, random_state=42)
# outliers = iso.fit_predict(X)

# # Keep only non-outliers (1 means inliers, -1 means outliers)
# X = X[outliers == 1]
# y = y[outliers == 1]

# ==================== Train-Test Split ====================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [24]:
# ==================== Preprocessing ====================
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# ==================== Build Pipeline ====================
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [25]:
# ==================== Hyperparameter Tuning ====================
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

grid = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)
model.fit(X_train,y_train)

best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}


In [26]:
# ==================== Validation Performance ====================
y_val_pred = best_model.predict(X_val)
y_val_proba = best_model.predict_proba(X_val)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred, average='weighted'))
print("Recall:", recall_score(y_val, y_val_pred, average='weighted'))
print("F1 Score:", f1_score(y_val, y_val_pred, average='weighted'))
print("Log Loss:", log_loss(y_val, y_val_proba))

Accuracy: 0.8486666666666667
Precision: 0.8265965597848445
Recall: 0.8486666666666667
F1 Score: 0.8349788277231409
Log Loss: 0.39485172210433944


In [28]:
# ==================== Ensure same columns as training ====================
test_processed = test[X.columns]   # Align test with training features

# ==================== Predict for Submission ====================

test_proba = best_model.predict_proba(test_processed)

submission = pd.DataFrame({
    'id': test['id'],
    'Status_C': test_proba[:, list(best_model.classes_).index('C')],
    'Status_CL': test_proba[:, list(best_model.classes_).index('CL')],
    'Status_D': test_proba[:, list(best_model.classes_).index('D')]
})

submission.to_csv("Rafey.csv", index=False)
submission.head()



Unnamed: 0,id,Status_C,Status_CL,Status_D
0,15000,0.908151,0.000707,0.091142
1,15001,0.922292,0.001439,0.076269
2,15002,0.837214,0.05067,0.112116
3,15003,0.30645,0.066064,0.627485
4,15004,0.928375,0.008966,0.062658
