In [1]:
import pandas as pd
import numpy as np

USE_FEATURE_SUBSET = True

In [3]:
df = pd.read_csv('features_with_embeddings_custom_metric.csv')


# is a subset of features enough
if USE_FEATURE_SUBSET:
    subset = ['user_id', 'training_id', 'type', 'weekday','distance', 'duration', 'speed_std_30s', 'speed_mean_30s', 'hr_diff_to_overall_max', 'hr_std', 'hr_mean', 'ele_std'] + [str(i) for i in range(12)] + [c for c in df.columns if 'recent' in c]
    df = df[subset].copy()

labeled_data = df[~df['type'].isna()].copy()
unlabeled_data = df[df['type'].isna()].copy()

labeled_data.drop('training_id', axis=1, inplace=True)

X_labeled = labeled_data.drop('type', axis=1)
X_labeled.replace(np.nan, 0, inplace=True)

y_labeled = labeled_data['type']

X_unlabeled = unlabeled_data.drop('type', axis=1)
X_unlabeled = X_unlabeled.reset_index(drop=True)
X_unlabeled.replace(np.nan, 0, inplace=True)

unlabeled_ids = X_unlabeled['training_id']
X_unlabeled = X_unlabeled.drop('training_id', axis=1)

In [4]:

# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Placeholder for the data


# Base models for our stacked classifier
base_models = [
    ('xgboost', XGBClassifier()),
    ('lightgbm', LGBMClassifier()),
    ('gradient_boosting', GradientBoostingClassifier()),
    ('svm', SVC(probability=True)),
    ('knn', KNeighborsClassifier()),
    # ('ada', AdaBoostClassifier()),
    # ('random_forest', RandomForestClassifier())
]

final_estimator = XGBClassifier()

# Stacking the classifiers
stacked_classifier = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=5, stack_method='predict_proba')

# Training
k = 4  # Number of folds
kfold = KFold(n_splits=k, shuffle=True, random_state=42)
fold_scores = []

for train_index, test_index in kfold.split(X_labeled, y_labeled):
    X_train, X_test = X_labeled.iloc[train_index], X_labeled.iloc[test_index]
    y_train, y_test = y_labeled.iloc[train_index], y_labeled.iloc[test_index]

    stacked_classifier.fit(X_train, y_train)
    y_pred = stacked_classifier.predict(X_test)
    fold_accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append(fold_accuracy)
    print(f'Fold accuracy: {fold_accuracy}')

average_accuracy = np.mean(fold_scores)
print(f'Average accuracy across {k} folds: {average_accuracy}')



Fold accuracy: 0.8648782911077993


KeyboardInterrupt: 

In [5]:
# Predicting
stacked_classifier.fit(X_labeled, y_labeled)
y_unlabeled_pred = stacked_classifier.predict(X_unlabeled)
y_unlabeled_pred 

array(['LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG', 'RACE', 'RACE',
       'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'INTERVAL', 'LONG JOG', 'STEADY JOG', 'LOW INTENSITY',
       'LOW INTENSITY', 'STEADY JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'INTERVAL', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'STEADY JOG', 'LONG JOG', 'LONG JOG', 'STEADY JOG', 'RACE', 'RACE',
       'LONG JOG', 'INTERVAL', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'LONG JOG', 'INTERVAL', 'LONG JOG', 'RACE', 'LONG JOG',
       'LONG JOG', 'LONG JOG', 'RACE', 'INTERVAL', 'LONG JOG',
       'STEADY JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'LONG JOG', 'RACE', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'RACE', 'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG', 'LONG JOG',
       'STEADY

In [7]:
df_exam = pd.read_csv('../exam_dataset.csv')


X_unlabeled['training_id'] = unlabeled_ids
X_unlabeled['type'] = y_unlabeled_pred
df_exam = df_exam.merge(X_unlabeled[['training_id', 'type']], on='training_id')
df_exam.drop('type_x', axis=1, inplace=True)
df_exam.rename(columns={'type_y': 'type'}, inplace=True)
df_exam

Unnamed: 0,training_id,user_id,type
0,00c64ed702bf204beb1208224cafd8a14fe51d1d968403...,6718565d7063a43aed889d287bbc1813afbf2119eaa4ca...,LONG JOG
1,02258687393f94100ef3a20c1bbbea3c30a01e79ddde6c...,eda0b1a85c94ba628c44bdefff5cf6941afac025eeee8f...,STEADY JOG
2,0230cad3ae88a7d8ac4915366fb9e4e817e1f98bab5205...,db889e243e8ca58f3afe4066c705e2f4f67ce65becf34a...,RACE
3,02cf5e1635db0873d51840170f469dfb2dc731806f05a2...,3f9ab926430606667ec6c7b43dc21f90286d44379bfe74...,LONG JOG
4,05d6ce94c401bfe4b89a2c2b13abce7817e5186cdc9525...,18988f7955b146b015d547d672a86e9223aafc657ceda0...,INTERVAL
...,...,...,...
245,f9e3b39eb14a30a76bccf40afba7e8e5034a409e9597a3...,f7d43e200372ece6f37e4935a6ba7649d4e795b51eb0ce...,LONG JOG
246,faade995b67a9766613301bdef63ebe28e7f94afe26a3c...,4066f060269c78cfcac71c7ad7db3c506ea8f36d9d28a5...,LONG JOG
247,faecd523f726ab5d4de51056443a20e8d92162aedd65f9...,64de340c14c989d38b59d3effaa49b7c4b14ef78a1d87b...,RACE
248,fc98b59aa5e94efdcd5f483db7c84407c2588994b96eaf...,b2abbc2f4dc772978f1124c8122a4d09e11097fb28c24a...,RACE


In [8]:
df_exam.to_csv('submission_custom_metric.csv', index=False)