In [None]:
!pip install pandas
!pip install scikit-learn

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read data/train.csv into `df` and preview its first three rows.
train_csv = 'data/train.csv'
df = pd.read_csv(train_csv)
df.head(n=3)

In [None]:
# Show the dtype pandas inferred for each column of `df`; the object columns
# listed here are the ones the encoding cell below converts to integer codes.
df.dtypes

In [None]:
# Remove rows containing NaN values (currently disabled)
# df_bs = df.dropna()

In [None]:
# Encode every object-dtype column of `df` as integer codes.
#
# category_mapping[col] is a dict {original value -> integer code}, with codes
# assigned by enumerating the column's unique values. It is kept so the same
# codes can be applied to other frames and inverted when decoding predictions.
category_mapping = {}
# Take a real (deep) copy: a shallow copy shares its data with `df`, so the raw
# frame could be affected by the column rewrites below.
categorised_df = df.copy()
for col in categorised_df.select_dtypes(include='object').columns:
    as_category = categorised_df[col].astype('category')
    category_mapping[col] = {value: code for code, value in enumerate(as_category.unique())}
    # Replace the column's values with their integer codes.
    categorised_df[col] = as_category.map(category_mapping[col])
print(category_mapping)
categorised_df.head(2)


In [None]:
# Build the model inputs: X holds every column except 'outcome', standardised
# with a StandardScaler; y is the (encoded) 'outcome' column.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in a later cell.
scaler = StandardScaler()
feature_frame = categorised_df.drop(columns=['outcome'])
X = pd.DataFrame(scaler.fit_transform(feature_frame), columns=feature_frame.columns)
y = categorised_df['outcome']
display(X.head(2), y.head(2))


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Shapes are printed in the order: X_train, y_train, X_test, y_test.
print(*[part.shape for part in (X_train, y_train, X_test, y_test)])

In [None]:
# Baseline: a random forest with fixed hyperparameters, scored (accuracy) on
# both the training rows and the held-out rows.
clf = RandomForestClassifier(n_estimators=200, max_depth=50, random_state=0).fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

In [None]:
# Randomised hyperparameter search over a random forest.
#
# Both the estimator and the search are seeded so that re-running the cell
# samples the same candidates and fits the same forests.
model = RandomForestClassifier(random_state=42)
# Candidate values sampled by the search (4 * 4 * 3 * 3 * 2 = 288 combinations).
hp = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    # 'max_features': ['auto', 'sqrt']
}
# 100 sampled candidates, each evaluated with 5-fold CV on accuracy; refit=True
# retrains the best candidate on all of X_train so `random_rf` can predict.
random_rf = RandomizedSearchCV(estimator=model, param_distributions=hp,
                               n_iter=100, scoring='accuracy', cv=5,
                               refit=True, n_jobs=-1, verbose=1,
                               random_state=42)
random_rf.fit(X_train, y_train)
print(f"Final Test accuracy: {random_rf.score(X_test, y_test)}")

In [None]:
# create predictions for the test set
test_df = pd.read_csv('data/test.csv')
# Apply the value -> code mappings built from the training frame; a value that
# is not a key of the mapping becomes NaN.
for col in test_df.select_dtypes(include='object').columns:
    test_df[col] = test_df[col].map(category_mapping[col])
# Scale with the already-fitted scaler (transform only, no re-fit).
scaled_test = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)
print("nan values in test set:", scaled_test.isna().sum().sum())
# show the rows that contain at least one NaN (a bare expression in the middle
# of a cell is not rendered, so display it explicitly)
display(scaled_test[scaled_test.isna().any(axis=1)])
# replace NaN in 'pain' with the mean of the column; assign the result back
# instead of calling fillna(inplace=True) on a column selection, which is a
# chained assignment and does not reliably update the frame
scaled_test['pain'] = scaled_test['pain'].fillna(scaled_test['pain'].mean())
display(scaled_test.head(2))

predictions = random_rf.predict(scaled_test)
ids = test_df['id']
# Invert the 'outcome' mapping (code -> original label) to decode predictions.
reverse_mapping = {v: k for k, v in category_mapping['outcome'].items()}
submission_df = pd.DataFrame({'id': ids, 'outcome': [reverse_mapping[prediction] for prediction in predictions]})
submission_df.head(10)


In [None]:
# Write the id/outcome submission frame to CSV; index=False keeps the
# DataFrame index out of the file.
submission_df.to_csv('data/submission_doug.csv', index=False)

## Trying with XGBoost

In [None]:
!pip install xgboost

In [74]:
from xgboost import XGBRegressor, XGBClassifier

# Hyperparameters for the boosted classifier: up to 1000 trees at a small
# learning rate, with early stopping (5 rounds) driven by the eval_set below.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=2e-2,
    early_stopping_rounds=5,
    random_state=0,
)
model = XGBClassifier(**xgb_params)

# Train on the training split; the held-out split is passed as the evaluation
# set, and verbose=1 logs its metric each round.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=1)

[0]	validation_0-mlogloss:1.08876


[1]	validation_0-mlogloss:1.07936
[2]	validation_0-mlogloss:1.06997
[3]	validation_0-mlogloss:1.06112
[4]	validation_0-mlogloss:1.05245
[5]	validation_0-mlogloss:1.04404
[6]	validation_0-mlogloss:1.03619
[7]	validation_0-mlogloss:1.02841
[8]	validation_0-mlogloss:1.02076
[9]	validation_0-mlogloss:1.01349
[10]	validation_0-mlogloss:1.00663
[11]	validation_0-mlogloss:1.00021
[12]	validation_0-mlogloss:0.99371
[13]	validation_0-mlogloss:0.98756
[14]	validation_0-mlogloss:0.98160
[15]	validation_0-mlogloss:0.97565
[16]	validation_0-mlogloss:0.97013
[17]	validation_0-mlogloss:0.96470
[18]	validation_0-mlogloss:0.95926
[19]	validation_0-mlogloss:0.95374
[20]	validation_0-mlogloss:0.94871
[21]	validation_0-mlogloss:0.94382
[22]	validation_0-mlogloss:0.93907
[23]	validation_0-mlogloss:0.93444
[24]	validation_0-mlogloss:0.92998
[25]	validation_0-mlogloss:0.92550
[26]	validation_0-mlogloss:0.92127
[27]	validation_0-mlogloss:0.91673
[28]	validation_0-mlogloss:0.91240
[29]	validation_0-mlogloss:0.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[65]	validation_0-mlogloss:0.80688
[66]	validation_0-mlogloss:0.80537
[67]	validation_0-mlogloss:0.80376
[68]	validation_0-mlogloss:0.80180
[69]	validation_0-mlogloss:0.80015
[70]	validation_0-mlogloss:0.79850
[71]	validation_0-mlogloss:0.79718
[72]	validation_0-mlogloss:0.79589
[73]	validation_0-mlogloss:0.79459
[74]	validation_0-mlogloss:0.79332
[75]	validation_0-mlogloss:0.79205
[76]	validation_0-mlogloss:0.79116
[77]	validation_0-mlogloss:0.78987
[78]	validation_0-mlogloss:0.78857
[79]	validation_0-mlogloss:0.78705
[80]	validation_0-mlogloss:0.78553
[81]	validation_0-mlogloss:0.78392
[82]	validation_0-mlogloss:0.78274
[83]	validation_0-mlogloss:0.78144
[84]	validation_0-mlogloss:0.78036
[85]	validation_0-mlogloss:0.77931
[86]	validation_0-mlogloss:0.77807
[87]	validation_0-mlogloss:0.77699
[88]	validation_0-mlogloss:0.77611
[89]	validation_0-mlogloss:0.77519
[90]	validation_0-mlogloss:0.77429
[91]	validation_0-mlogloss:0.77326
[92]	validation_0-mlogloss:0.77248
[93]	validation_0-ml

In [78]:
# Predict the scaled test rows with the XGBoost model, decode the integer
# predictions back to outcome labels, preview and save the submission.
predictions = model.predict(scaled_test)
ids = test_df['id']
# code -> original label, i.e. the inverse of category_mapping['outcome']
reverse_mapping = dict((code, label) for label, code in category_mapping['outcome'].items())
decoded_outcomes = [reverse_mapping[code] for code in predictions]
submission_df = pd.DataFrame({'id': ids, 'outcome': decoded_outcomes})
display(submission_df.head(10))
submission_df.to_csv('data/submission_xgb_classifier.csv', index=False)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
5,1240,died
6,1241,died
7,1242,died
8,1243,lived
9,1244,lived
