In [1]:
import pandas as pd
import numpy as np
import joblib
import sys
sys.path.append('../')

from premier_league import constants
if constants.RUN_DATA_EXPECTATIONS:
    from expectations_helpers import (
        AutoGreatExpectations,
        view_full_suite,
        view_suite_summary,
        save_expectations,
        load_expectations,
        validate_data
    )
from premier_league import (
    preprocessing,
    preprocessing_helpers,
    training,
    evaluation,
    prediction,
    data_extraction,
    visualisations,
    s3_helpers,
    postgres
)

In [2]:
# Re-execute the constants module so edits made to constants.py are picked up
# without restarting the kernel (development convenience).
# NOTE(review): this import sits outside the top import cell — consider moving it there.
import importlib
importlib.reload(constants)


<module 'premier_league.constants' from '/Users/david@inawisdom.com/Documents/Training/premier_league/notebooks/../premier_league/constants.py'>

In [3]:
# Timestamp for current run
# NOTE(review): current_timestamp is not referenced again in the visible cells —
# confirm it is still needed.
current_timestamp = s3_helpers.get_current_date_time()
current_timestamp

'2023-11-16 08:06:55'

In [4]:
# Pull the training data from S3 and keep only the columns the pipeline requires.
df = s3_helpers.grab_data_s3(constants.TRAINING_DATA_LOCATION)[constants.COLUMNS_REQ]

In [5]:
# Preview the first rows to sanity-check the selected columns.
df.head()

Unnamed: 0,season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,1995-96,19/08/95,Aston Villa,Man United,3.0,1.0,H
1,1995-96,19/08/95,Blackburn,QPR,1.0,0.0,H
2,1995-96,19/08/95,Chelsea,Everton,0.0,0.0,D
3,1995-96,19/08/95,Liverpool,Sheffield Weds,1.0,0.0,H
4,1995-96,19/08/95,Man City,Tottenham,1.0,1.0,D


### Set up data validation

In [6]:
# Build the expectations helper from the loaded frame, only when data validation
# is switched on in constants.
if constants.RUN_DATA_EXPECTATIONS:
    ge_class = AutoGreatExpectations(df)

In [7]:
# Generate the expectation suite from ge_class (verbose output suppressed).
if constants.RUN_DATA_EXPECTATIONS:
    ge_data = ge_class.generate_expectations(verbose=False)

In [8]:
# Display a summary of the generated expectation suite.
if constants.RUN_DATA_EXPECTATIONS:
    view_suite_summary(ge_data)

### Validate data

In [9]:
# Persist the generated expectations to the location configured in constants.exp_loc.
if constants.RUN_DATA_EXPECTATIONS:
    save_expectations(ge_data, constants.exp_loc)

In [10]:
# Read the expectations back from constants.exp_loc so that validation below
# uses the persisted copy rather than the in-memory ge_data.
if constants.RUN_DATA_EXPECTATIONS:
    data_expectations = load_expectations(constants.exp_loc)

In [11]:
# On the very first run, build the dataset from scratch; on later runs, reuse the
# copy already stored at TRAINING_DATA_LOCATION.
full_data = (
    data_extraction.load_all_data(
        constants.TRAINING_DATA_LOCATION,
        constants.COLUMNS_REQ,
    )
    if constants.INITIAL_DATA_LOAD
    else s3_helpers.grab_data_s3(constants.TRAINING_DATA_LOCATION)
)


In [12]:
# Refresh full_data via data_extraction.add_new_data (presumably appends matches
# not yet present — confirm in data_extraction). The recorded output shows the
# result being saved to app_data/training_data_full.csv.
full_data = data_extraction.add_new_data(
    full_data, 
    constants.COLUMNS_REQ,
    constants.TRAINING_DATA_LOCATION
)

Data saved at app_data/training_data_full.csv


In [13]:
# Validate the refreshed dataset against the stored expectations.
# NOTE(review): validation_results is never displayed or asserted on in the
# visible cells, so a failed validation would go unnoticed — consider checking it.
if constants.RUN_DATA_EXPECTATIONS:
    validation_results = validate_data(full_data, data_expectations)

### Preprocessing

In [14]:
# Reload the (now refreshed) training data from S3, restricted to the required
# columns, and preview it before preprocessing.
df = s3_helpers.grab_data_s3(constants.TRAINING_DATA_LOCATION)[constants.COLUMNS_REQ]
df.head()

Unnamed: 0,season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,1995-96,19/08/95,Aston Villa,Man United,3.0,1.0,H
1,1995-96,19/08/95,Blackburn,QPR,1.0,0.0,H
2,1995-96,19/08/95,Chelsea,Everton,0.0,0.0,D
3,1995-96,19/08/95,Liverpool,Sheffield Weds,1.0,0.0,H
4,1995-96,19/08/95,Man City,Tottenham,1.0,1.0,D


In [15]:
# Fit the preprocessing transformers on the loaded training frame.
transformers = preprocessing.fit_transformers(df)

In [16]:
# Persist the fitted transformers to S3 at constants.TRANSFORMER_PATH so they can
# be reused outside this notebook (the recorded output shows a .pkl written to S3).
s3_helpers.save_transformer_s3_pickle(
    transformers, 
    constants.TRANSFORMER_PATH
)

Transformer object is saved to S3 bucket premier-league-app at app_data/transformers/transformer_v2_20231116.pkl


In [17]:
# Apply the fitted transformers to produce the model-ready feature frame.
transformed_data = preprocessing.transform_data(df, transformers)

In [18]:
# Disabled step: dropping the team/season identifier columns before the split.
# NOTE(review): this cell contains only commented-out code — delete it if the
# columns are meant to stay as features.
#transformed_data = transformed_data.drop(['HomeTeam', 'AwayTeam', 'season'], axis=1)

In [19]:
# Split the transformed frame into training and testing sets.
# NOTE(review): the training preview below shows 2000-01 and 2022-23 rows side by
# side, which suggests a shuffled (non-chronological) split — confirm this is
# intended for match data and that the split is seeded for reproducibility.
training_data, testing_data = preprocessing.split_data(transformed_data)

In [20]:
# Row/column count of the training split.
training_data.shape

(8608, 33)

In [21]:
# Row/column count of the testing split.
testing_data.shape

(2152, 33)

In [22]:
# Peek at two training rows to check the engineered feature columns.
training_data.head(2)

Unnamed: 0,season,HomeTeam,AwayTeam,FTR,HTGS,ATGS,HTGC,ATGC,HTP,ATP,HM1,AM1,HM2,AM2,HM3,AM3,HM4,AM4,HM5,AM5,MW,HTFormPts,ATFormPts,HTGD,ATGD,DiffPts,DiffFormPts,HomeLeaguePosition,AwayLeaguePosition,LeaguePositionDiff,MatchDayDay,MatchDayMonth,MatchDayDate
0,2000-01,Middlesbrough,Liverpool,0,20.0,37.0,28.0,24.0,0.75,1.65,D,W,W,W,L,L,L,W,D,L,20.0,5,9,-0.4,0.65,-0.9,-0.2,18,4,14,Tuesday,December,26
1,2022-23,West Ham,Man City,2,0.0,0.0,0.0,0.0,0.0,0.0,M,M,M,M,M,M,M,M,M,M,1.0,0,0,0.0,0.0,0.0,0.0,19,13,6,Sunday,August,7


### Model Training

In [23]:
# Name of the label column used as the target for the match-result classifier.
target_column = 'FTR'

In [24]:
# Search for classifier hyperparameters; the number of search trials is capped
# by constants.MAX_EVALS.
hyperparameters = training.optimise_hyperparameters(
    training_data, target_column, max_evals=constants.MAX_EVALS
)

Entering fmin
CatBoost initialized                                                                                                
Fitting model                                                                                                       
Loss: -0.5261324041811847                                                                                           
CatBoost initialized                                                                                                
Fitting model                                                                                                       
Loss: -0.5267131242740999                                                                                           
100%|██████████████████████████████████████████████| 2/2 [00:08<00:00,  4.31s/trial, best loss: -0.5267131242740999]


In [25]:
# Show the hyperparameters selected by the search.
print(hyperparameters)

{'learning_rate': 0.1, 'iterations': 200, 'max_depth': 4}


In [34]:
# Development-only reload of the training module.
# NOTE(review): the recorded output shows this reload attempting a Postgres
# connection and retrying until interrupted, and the execution count (34) is out
# of order. The logging database must be reachable before this runs; remove or
# reorder this cell so the notebook survives Restart & Run All.
importlib.reload(training)

(psycopg2.OperationalError) connection to server at "premier-league-logging.cmqq2c3bdwm0.eu-west-2.rds.amazonaws.com" (13.43.76.57), port 5432 failed: FATAL:  database "premier-league-logging" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)
Operation will be retried in 0.1 seconds
(psycopg2.OperationalError) connection to server at "premier-league-logging.cmqq2c3bdwm0.eu-west-2.rds.amazonaws.com" (13.43.76.57), port 5432 failed: FATAL:  database "premier-league-logging" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)
Operation will be retried in 0.3 seconds
(psycopg2.OperationalError) connection to server at "premier-league-logging.cmqq2c3bdwm0.eu-west-2.rds.amazonaws.com" (13.43.76.57), port 5432 failed: FATAL:  database "premier-league-logging" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)
Operation will be retried in 0.7 seconds
(psycopg2.OperationalError) connection to server at "premier-league

KeyboardInterrupt: 

In [32]:
# Development-only reload of the postgres helper module.
# NOTE(review): executed out of order (count 32); drop once postgres.py is stable.
importlib.reload(postgres)

<module 'premier_league.postgres' from '/Users/david@inawisdom.com/Documents/Training/premier_league/notebooks/../premier_league/postgres.py'>

In [30]:
# Start postgresDB for model logging
# NOTE(review): the recorded output shows the start call failing because the
# instance was not in a startable state (InvalidDBInstanceState). This cell also
# appears after cells that already need the database — it likely belongs before
# the training module is (re)loaded; confirm and reorder.
postgres.start_rds_instance(constants.POSTGRES_DB_ID)

Error starting RDS instance: An error occurred (InvalidDBInstanceState) when calling the StartDBInstance operation: Instance premier-league-logging cannot be started as it is not in one of the following statuses: 'stopped, inaccessible-encryption-credentials-recoverable, incompatible-network (only valid for non-SqlServer instances)'.


In [None]:
# Train the match-result classifier on every column except the target,
# using the hyperparameters found by the search.
classifier_feature_columns = [col for col in training_data if col != target_column]
classifier = training.train_model(
    training_data[classifier_feature_columns],
    training_data[target_column],
    hyperparameters=hyperparameters,
)

In [None]:
# Persist the trained classifier to S3 under constants.CLASS_MODEL_NAME.
# is_transformer=False tells the shared pickle helper that this is a model, not a
# transformer (exact effect is defined in s3_helpers).
s3_helpers.save_transformer_s3_pickle(
    classifier, 
    constants.CLASS_MODEL_NAME,
    is_transformer=False
)

### Prediction

In [None]:
# Separate the held-out set into features (everything but the target) and labels.
test_feature_columns = [col for col in testing_data if col != target_column]
x_test = testing_data[test_feature_columns]
y_test = testing_data[target_column]

In [None]:
# Predict match results for the held-out features with the trained classifier.
predictions = prediction.predict(x_test, classifier)

### Evaluation

In [None]:
# Score the classifier's predictions against the held-out labels.
evaluation_metrics = evaluation.evaluate_model(predictions, y_test)

In [None]:
# Display the classifier evaluation metrics.
evaluation_metrics

### Part 2: Predict Score

In [None]:
# Enrich the transformed frame using the trained classifier and the raw frame
# (details live in prediction.add_match_result).
# NOTE(review): this overwrites transformed_data with a function of itself, so
# re-running the cell is not idempotent — rerun the transform cell first.
transformed_data = prediction.add_match_result(transformed_data, classifier, df)

In [None]:
# Persist the enriched feature frame to S3 at constants.TRANSFORMED_DATA_LOCATION.
s3_helpers.save_data_s3(
    transformed_data,
    constants.TRANSFORMED_DATA_LOCATION
)

In [None]:
# Row/column count after adding the match-result information.
transformed_data.shape

In [None]:
# Re-split the enriched frame for the goal regressors (overwrites the earlier
# training_data / testing_data used for the classifier).
# NOTE(review): if split_data shuffles without a fixed seed, rows the classifier
# was trained on may land in this test set — confirm the split is deterministic.
training_data, testing_data = preprocessing.split_data(transformed_data)

In [None]:
# Hyperparameter search for the home-goals regressor: FTHG is the target, so the
# other two outcome columns (FTR, FTAG) are removed from the search frame.
home_search_frame = training_data.drop(columns=['FTR', 'FTAG'])
hyperparameters = training.optimise_hyperparameters(
    home_search_frame,
    'FTHG',
    classification=False,
    max_evals=constants.MAX_EVALS,
)

In [None]:
# Fit the home-goals regressor on every column except the three outcome columns.
home_features = training_data.drop(columns=['FTR', 'FTHG', 'FTAG'])
regressor_1 = training.train_model(
    home_features,
    training_data['FTHG'],
    model_type='home',
    verbose=False,
    hyperparameters=hyperparameters,
)

In [None]:
# Persist the home-goals regressor to S3 under constants.HOME_MODEL_NAME.
s3_helpers.save_transformer_s3_pickle(
    regressor_1, 
    constants.HOME_MODEL_NAME,
    is_transformer=False
)

In [None]:
# Hyperparameter search for the away-goals regressor: FTAG is the target, so the
# other two outcome columns (FTR, FTHG) are removed from the search frame.
away_search_frame = training_data.drop(columns=['FTR', 'FTHG'])
hyperparameters = training.optimise_hyperparameters(
    away_search_frame,
    'FTAG',
    classification=False,
    max_evals=constants.MAX_EVALS,
)

In [None]:
# Fit the away-goals regressor on every column except the three outcome columns.
away_features = training_data.drop(columns=['FTR', 'FTHG', 'FTAG'])
regressor_2 = training.train_model(
    away_features,
    training_data['FTAG'],
    model_type='away',
    verbose=False,
    hyperparameters=hyperparameters,
)

In [None]:
# Persist the away-goals regressor to S3 under constants.AWAY_MODEL_NAME.
s3_helpers.save_transformer_s3_pickle(
    regressor_2, 
    constants.AWAY_MODEL_NAME,
    is_transformer=False
)

### Predict Home Goals

In [None]:
# Actual home goals for the held-out matches.
y_test = testing_data['FTHG']
# Work on a copy so prediction columns added later do not touch testing_data.
# NOTE(review): x_test still contains FTR/FTHG/FTAG, which were dropped when the
# regressors were trained — confirm prediction.predict selects the model's own
# feature columns.
x_test = testing_data.copy()

In [None]:
# Predict home goals for the held-out matches with the home-goals regressor.
predictions_1 = prediction.predict(x_test, regressor_1)

In [None]:
# Score the home-goal predictions with regression metrics and display them.
evaluation_metrics = evaluation.evaluate_model(
    predictions_1,
    y_test,
    classification=False,
)
evaluation_metrics

### Predict Away Goals

In [None]:
# Actual away goals for the held-out matches.
y_test = testing_data['FTAG']
# Fresh copy of the test frame (replaces the x_test used for the home model).
x_test = testing_data.copy()

In [None]:
# Predict away goals with the away-goals regressor (regressor_2, trained on FTAG).
# The match-result classifier predicts FTR, not a goal count; using it here made
# the regression metrics and the 'Away Prediction' column meaningless.
predictions_2 = prediction.predict(x_test, regressor_2)

In [None]:
# Score the away-goal predictions with regression metrics and display them.
evaluation_metrics = evaluation.evaluate_model(
    predictions_2,
    y_test,
    classification=False,
)
evaluation_metrics

### Visualisations of results

In [None]:
# Attach both sets of goal predictions to the test frame so the plotting helpers
# can read predictions and the other columns from one place.
x_test['Home Prediction'] = predictions_1
x_test['Away Prediction'] = predictions_2

In [None]:
# Distribution plot for the home side.
visualisations.histoplot(x_test, ha='Home')

In [None]:
# Distribution plot for the away side.
visualisations.histoplot(x_test, ha='Away')

In [None]:
# Actual vs predicted plot for the home side.
visualisations.actuals_predicted(x_test, ha='Home')

In [None]:
# Actual vs predicted plot for the away side.
visualisations.actuals_predicted(x_test, ha='Away')

In [None]:
# Feature plot for the home-goals regressor, with n=20.
visualisations.plot_features(regressor_1, 'Home',n=20)

In [None]:
# Feature plot for the away-goals regressor.
# NOTE(review): the home plot passes n=20 while this call relies on the default n —
# confirm the two are meant to differ.
visualisations.plot_features(regressor_2, 'Away')