In [None]:
from data import all_df, df1, df2, df3, df4, df5, features

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Example data loading and preprocessing
# Replace this with your actual data loading
df = pd.read_csv('./data/E2019-2020.csv')  # Assuming your data is in a CSV file

# Work on an explicit copy of the selected columns. Adding or dropping columns
# on a bare column selection of `df` is what makes pandas emit
# SettingWithCopyWarning, because it cannot tell whether `df` was meant to change.
X = df[list(features.keys())].copy()

# Combine the separate date and time text columns into a single timestamp.
X['DateTime'] = pd.to_datetime(X['Date'] + ' ' + X['Time'], format='%d/%m/%Y %H:%M')

# The raw text columns are no longer needed once 'DateTime' exists.
X = X.drop(columns=['Date', 'Time'])

# Feature engineering
# Whole days elapsed since the previous row; the first row has no predecessor
# and is filled with 0.
# NOTE(review): 'HTR' and 'ATR' receive the same value (row-to-row gap, not a
# per-team gap) -- presumably placeholders for per-team rest days; confirm.
X['HTR'] = (X['DateTime'] - X['DateTime'].shift()).dt.days.fillna(0)
X['ATR'] = X['HTR']

# Additional features based on historical performance
# Assuming you have columns 'HomeTeam', 'AwayTeam', 'FTR'
# Replace this with your actual feature engineering steps

# Handling categorical features
# One-hot encode the team names; the original 'HomeTeam'/'AwayTeam' columns are
# replaced by indicator columns.
X = pd.get_dummies(X, columns=['HomeTeam', 'AwayTeam'])

# Splitting the data into training and testing sets
y = X['FTR']  # Assuming 'FTR' is your target variable
X = X.drop(['FTR', 'DateTime'], axis=1)  # Drop target and any non-feature columns

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: learn the scaling statistics on the training
# rows only, then apply that same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter candidates explored exhaustively by the grid search.
param_grid = dict(
    n_estimators=[1, 3, 5, 10, 15, 25, 50],
    max_depth=[None, 2, 5, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over the grid, ranked by accuracy.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# The estimator refitted on the full training set with the winning parameters.
best_model = grid_search.best_estimator_

# Score the held-out rows and report plain accuracy.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Example bet simulation
# Assuming your odds data is available in the test set
# X_test is a plain array after scaling, so this frame gets a fresh 0..n-1 index.
X_test_df = pd.DataFrame(X_test, columns=X.columns)
X_test_df['pred'] = y_pred
# y_test still carries the shuffled row labels of the original frame. Assigning
# the Series directly would align on those labels (yielding NaNs and rows paired
# with the wrong outcome), so assign the values by position instead.
X_test_df['valid'] = np.asarray(y_test)

# Example bet calculation
# Classifier output is already a discrete label. Rounding is kept for numeric
# labels only; non-numeric labels (e.g. string result codes) cannot be rounded
# and are compared as-is.
if pd.api.types.is_numeric_dtype(X_test_df['pred']):
    X_test_df['pred_round'] = np.round(X_test_df['pred'])
else:
    X_test_df['pred_round'] = X_test_df['pred']

# Flat stake per match: a correct prediction wins the stake, a wrong one loses it.
stake = 100
X_test_df['bet_result'] = np.where(
    X_test_df['pred_round'] == X_test_df['valid'], stake, -stake
)

is_win = X_test_df['bet_result'] > 0
is_loss = X_test_df['bet_result'] < 0

total_margin = X_test_df['bet_result'].sum()
total_won = X_test_df.loc[is_win, 'bet_result'].sum()
total_spent = len(X_test_df) * stake
win_count = int(is_win.sum())
lose_count = int(is_loss.sum())

print(f'Margin: {total_margin}, Won: {total_won}, Spent: {total_spent} ({win_count} wins, {lose_count} loses)')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['DateTime'] = pd.to_datetime(X.Date + ' ' + X.Time, format='%d/%m/%Y %H:%M')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('Date', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('Time', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http