## Training of the 2nd XGBoost model



# Merging all data for training the 2nd XGBoost model

In [None]:
import csv
# Build the list of input files programmatically: one XGBoost training CSV
# per simulation (1000 in total). Make sure to use the correct file path.
files = ["filepath/xxx_{}.csv".format(i) for i in range(1000)]

# Name of the merged file, e.g. merged_result.csv
merged_filename = "merged_result.csv"

# Concatenate every input CSV into a single file with exactly one header row.
# Both the output and the input handles are opened with newline='' as the csv
# module requires; otherwise quoted fields containing line breaks can be
# mangled while reading.
with open(merged_filename, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    header_written = False

    for filename in files:
        with open(filename, 'r', newline='') as infile:
            reader = csv.reader(infile)

            # The first row of every file is treated as its header.
            header = next(reader, None)
            if header is None:
                # Completely empty file: it contributes nothing.
                continue

            # Emit the header once, taken from the first non-empty file, so
            # an empty first file cannot produce a header-less result.
            if not header_written:
                writer.writerow(header)
                header_written = True

            # Copy the remaining data rows to the output file.
            writer.writerows(reader)

# Sorting/Ranking

In [1]:
import pandas as pd

# Load the merged results file into a DataFrame.
df = pd.read_csv("merged_result.csv")

# Rank all rows from the highest balance down to the lowest.
df_sorted = df.sort_values("balance", ascending=False)

# Preview the first 100 rows of the ranking.
df_sorted.head(n=100)

ModuleNotFoundError: No module named 'pandas'

# Top 20% of data


In [None]:
# Row count corresponding to 20% of the ranked data (truncated to an integer).
top_20_percent = int(len(df_sorted) * 0.20)

# Keep that many leading rows of the sorted frame.
top_20_df = df_sorted.iloc[:top_20_percent]

# Write the selection to its own CSV file, leaving out the index column.
top_20_df.to_csv("top_20_percent.csv", index=False)

# Preparing data

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Import the data
data = pd.read_csv("top_20_percent.csv")

# Print how many rows carry each decision label before any preprocessing.
decision_counts = data["decision"].value_counts()
print(decision_counts)

# 2. Preprocessing

# Remove potentially irrelevant columns
data = data.drop(
    columns=["type", "competitorID", "exchange", "agentID", "balance", "odds"]
)

# NOTE(review): the previous note here listed "time, odds, stake, distance,
# rank" as the remaining columns, yet "odds" is dropped above -- confirm which
# of the two is intended.

# Convert 'decision' column to binary: 1 for 'backer' and 0 for 'layer'.
# Series.map turns every value missing from the mapping into NaN, so fail
# loudly here instead of letting NaN labels reach model training.
decision_codes = data["decision"].map({"backer": 1, "layer": 0})
unmapped = decision_codes.isna()
if unmapped.any():
    unexpected = data.loc[unmapped, "decision"].unique().tolist()
    raise ValueError(
        "Unexpected or missing values in 'decision' column: {}".format(unexpected)
    )
data["decision"] = decision_codes

# Split data into training and testing sets: every column except the label is
# a feature; 20% of the rows are held out, with a fixed seed for repeatability.
X = data.drop(columns="decision")
print(X.head())
y = data["decision"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Training

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Hyperparameter values explored by the grid search.
param_grid = dict(
    eta=[0.01, 0.1, 0.3],  # Default: 0.3
    max_depth=[4, 5, 6],  # Default: 6
    subsample=[0.5, 0.75, 1.0],  # Default: 1
    colsample_bytree=[0.5, 0.75, 1.0],  # Default: 1
    gamma=[0, 0.1, 0.2],  # Default: 0
)

# Base estimator: XGBoost's scikit-learn style classifier, so that it can be
# handed to GridSearchCV.
xgb_estimator = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    objective="binary:logistic",
    eval_metric="logloss",
)

# Exhaustive search over param_grid, scored by accuracy with 2-fold
# cross-validation.
grid_search = GridSearchCV(
    xgb_estimator,
    param_grid,
    scoring="accuracy",
    cv=2,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score.
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

accuracy_best = accuracy_score(y_test, y_pred_best)
print("Accuracy (Best Model):", accuracy_best)
print(classification_report(y_test, y_pred_best))