In [7]:
import warnings

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
# Read the raw CSV (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv('vct-data.csv')


In [3]:
def preprocess_data(data, players_per_team=5):
    """Collapse per-player rows into one feature row per match.

    The input is read in consecutive chunks of ``2 * players_per_team`` rows.
    Within each chunk the first ``players_per_team`` rows are treated as
    team A and the remaining rows as team B. Chunks are taken by row
    *position*, so the result does not depend on the frame's index labels
    (a filtered or re-indexed frame is handled the same as a fresh one).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Map'``, ``'Agent'`` and
        ``'Total Wins By Map'``.
    players_per_team : int, default 5
        Number of rows belonging to each team inside a match chunk.

    Returns
    -------
    pandas.DataFrame
        One row per complete match with the columns ``'Map'``, ``'Outcome'``
        and one ``'Team A <agent>'`` / ``'Team B <agent>'`` indicator column
        per agent seen (1 if picked, 0 otherwise). An input without any
        complete match yields an empty frame that still has the ``'Map'``
        and ``'Outcome'`` columns.

    Raises
    ------
    ValueError
        If ``players_per_team`` is smaller than 1.

    Warns
    -----
    UserWarning
        If the row count is not a multiple of the match size; the trailing
        incomplete chunk is skipped instead of being turned into a match.
    """
    if players_per_team < 1:
        raise ValueError("players_per_team must be at least 1")

    rows_per_match = 2 * players_per_team
    leftover = len(data) % rows_per_match
    if leftover:
        warnings.warn(
            f"Ignoring {leftover} trailing row(s): row count {len(data)} is "
            f"not a multiple of {rows_per_match} rows per match.",
            stacklevel=2,
        )
    usable_rows = len(data) - leftover

    processed_data = []
    for start in range(0, usable_rows, rows_per_match):
        # Positional slicing keeps the chunking independent of index labels.
        match = data.iloc[start:start + rows_per_match]
        team_a = match.iloc[:players_per_team]
        team_b = match.iloc[players_per_team:]

        row = {
            'Map': match['Map'].iloc[0],
            # NOTE(review): the label is derived only from the first team-A
            # row's 'Total Wins By Map' being positive -- confirm this column
            # really encodes the result of this match.
            'Outcome': 1 if team_a['Total Wins By Map'].iloc[0] > 0 else 0
        }

        for team, prefix in [(team_a, 'Team A '), (team_b, 'Team B ')]:
            # Only the agent name is needed, so iterate the column directly.
            for agent in team['Agent']:
                row[f"{prefix}{agent}"] = 1

        processed_data.append(row)

    if not processed_data:
        # Keep the fixed columns so downstream code can still drop 'Outcome'.
        return pd.DataFrame(columns=['Map', 'Outcome'])

    # Agents that were not picked in a match are missing from its dict -> 0.
    return pd.DataFrame(processed_data).fillna(0)



In [8]:
# Load the raw rows and collapse them into one feature row per match
data = pd.read_csv('vct-data.csv')
data_processed = preprocess_data(data)

# Split features and target
X = data_processed.drop('Outcome', axis=1)
y = data_processed['Outcome']

# Hold out 20% of the matches; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing steps: scale the numeric columns and one-hot
# encode the map name. Maps unseen during fit are encoded as all zeros
# (handle_unknown='ignore') instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['Map'])
    ])

# Set output for consistent DataFrame
preprocessor.set_output(transform="pandas")

# Create a pipeline so preprocessing is re-fit inside every CV fold
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Set output for consistent DataFrame
pipeline.set_output(transform="pandas")

# Define the parameter grid (3 x 3 x 3 = 27 candidates)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5, 10]
}

# Perform grid search with 5-fold CV on the training split only
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate on the held-out test set. The predictions were previously computed
# but never scored, so no hold-out metric was ever reported.
y_pred = grid_search.predict(X_test)
test_accuracy = (y_pred == y_test).mean()
print("Test set accuracy:", test_accuracy)

# Save the fitted search object (includes the refit best pipeline)
joblib.dump(grid_search, 'best_model.pkl')

Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 300}
Best cross-validation score: 0.5346485411140585


['best_model.pkl']