In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
df = pd.read_csv(r'C:\Users\Ismail\Desktop\GameProjects\train_data_1M.csv')

In [3]:
df.head(
)

Unnamed: 0,focus_switch_rate,idle_time_percent,main_keystrokes,main_mouse_events,helper_keystrokes,helper_mouse_events,entertain_keystrokes,entertain_mouse_events,status
0,16,0.333721,0.264676,0.26466,0.099751,0.206902,0.21875,0.566206,fragmented
1,10,0.085035,0.0,0.031504,0.783111,0.755183,0.16576,0.374156,active
2,17,0.07653,0.298036,0.306229,0.357597,0.525244,0.258461,0.297871,fragmented
3,9,0.326842,0.0,0.163057,0.0,0.004395,0.660024,0.473269,distracted
4,6,0.375505,0.743161,0.62427,0.097316,0.188874,0.156627,0.032622,productive


In [4]:
x = df.iloc[:, :8]
y = df['status'].astype('category').cat.codes #turn categorical data into integer numbers

In [5]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [6]:
xgb_train = xgb.DMatrix(x_train,label = y_train)
xgb_test = xgb.DMatrix(x_test,label = y_test)

In [7]:
number_of_classes = int(y_train.max()) + 1

# params = {
#     'max_depth': 7,
#     'eta': 0.07581258613808053, 
#     'objective': 'multi:softmax',
#     'min_child_weight': 4,
#     'subsample': 0.8473807747215681,
#     'colsample_bytree': 0.7169202844870651, 
#     'num_class': number_of_classes,
#     'eval_metric': 'merror',
#     'gamma':2.694748268157791,
# }
# Best Params: {'max_depth': 10, 'eta': 0.10028528441888343, 'subsample': 0.9174668482084811, 'colsample_bytree': 0.5319515187905608, 'min_child_weight': 10, 'gamma': 0.12754830973696762}

params = {
    'max_depth': 10,
    'eta': 0.10028528441888343, 
    'objective': 'multi:softmax',
    'min_child_weight': 10,
    'subsample': 0.9174668482084811,
    'colsample_bytree': 0.5319515187905608, 
    'num_class': number_of_classes,
    'eval_metric': 'merror',
    'gamma':0.12754830973696762,
}

In [8]:
watchlist = [(xgb_train, 'Train'), (xgb_test, 'Test')] #to monitor model preformance 

model = xgb.train(params,xgb_train,
                  num_boost_round=2000,
                  evals=watchlist,
                  early_stopping_rounds=50,
                  verbose_eval=10
                  )
#num_boost_round=100 tells XGBoost to build 100 decision trees in sequence

[0]	Train-merror:0.20953	Test-merror:0.21196
[10]	Train-merror:0.02157	Test-merror:0.02261
[20]	Train-merror:0.01817	Test-merror:0.01936
[30]	Train-merror:0.01762	Test-merror:0.01903
[40]	Train-merror:0.01679	Test-merror:0.01860
[50]	Train-merror:0.01637	Test-merror:0.01818
[60]	Train-merror:0.01596	Test-merror:0.01810
[70]	Train-merror:0.01559	Test-merror:0.01801
[80]	Train-merror:0.01535	Test-merror:0.01787
[90]	Train-merror:0.01510	Test-merror:0.01782
[100]	Train-merror:0.01489	Test-merror:0.01776
[110]	Train-merror:0.01468	Test-merror:0.01779
[120]	Train-merror:0.01445	Test-merror:0.01780
[130]	Train-merror:0.01433	Test-merror:0.01788
[140]	Train-merror:0.01416	Test-merror:0.01783
[144]	Train-merror:0.01408	Test-merror:0.01784


In [9]:
print(f"Training stopped at round: {model.best_iteration}")
print(f"Best Error Rate: {model.best_score}")

Training stopped at round: 94
Best Error Rate: 0.01774


In [10]:
pred = model.predict(xgb_test)

In [11]:
from sklearn.metrics import accuracy_score,classification_report
accuracy = accuracy_score(y_test,pred)
print(accuracy)
print(classification_report(y_test,pred))

0.982155
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     39999
           1       0.98      0.98      0.98     29976
           2       0.96      0.96      0.96     40063
           3       1.00      1.00      1.00     19986
           4       0.99      0.99      0.99     69976

    accuracy                           0.98    200000
   macro avg       0.98      0.98      0.98    200000
weighted avg       0.98      0.98      0.98    200000



In [12]:
label_names = df['status'].astype('category').cat.categories
predicted_indices = pred.astype(int)  
predicted_labels = [label_names[i] for i in predicted_indices]

import pandas as pd
results_df = pd.DataFrame({
    'Actual (Numeric)': y_test,
    'Predicted (Numeric)': predicted_indices,
    'Predicted (Text)': predicted_labels
})
print(results_df.head(15))

        Actual (Numeric)  Predicted (Numeric) Predicted (Text)
261045                 0                    0           active
432496                 0                    0           active
142251                 4                    4       productive
388312                 3                    3             idle
688013                 3                    3             idle
227313                 0                    0           active
853922                 4                    4       productive
334218                 0                    0           active
732450                 1                    1       distracted
516015                 4                    4       productive
281070                 0                    0           active
175239                 4                    4       productive
363992                 4                    4       productive
747379                 2                    2       fragmented
324739                 4                    4       pro

In [13]:
model.save_model('xgb_3.json')

In [14]:
# Best Params: {'max_depth': 7, 'eta': 0.07581258613808053, 'subsample': 0.8473807747215681, 'colsample_bytree': 0.7169202844870651, 'min_child_weight': 4, 'gamma': 2.694748268157791}