# Soccer Prediction Model

#### Importing needed libs and setting correct system paths

In [1]:
import sys
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.cm as cm
from sklearn.metrics import f1_score 
# Show matplotlib plots inline (nicely formatted in the notebook)
%matplotlib inline

# Directory that contains the project's `stats` package.  It can be overridden
# with the STATS_SRC_DIR environment variable so the notebook is not tied to a
# single machine; the default is the original author's location.
STATS_SRC_DIR = os.environ.get('STATS_SRC_DIR', '/Users/senzari/Machine_Learning/stats/src')
os.chdir(STATS_SRC_DIR)
#print(os.getcwd())

# Project-local modules; this import has to run after the chdir above.
from stats import form_data, match_stats, model_libs, form_model, predict_matches

# Variables
round_number = 27 # for MLS only
# Columns that identify a match/team and are dropped before modelling.
ignore_cols = ['match_id', 'team_id', 'team_name', 'opp_id', 'opp_name', 'scheduled', 'games_played', 'round', 'current_formation']
# Model keys, in the order they are tuned and later loaded.
all_models = ['knn', 'log', 'svc', 'gnb', 'randomForest']
leagues = model_libs.get_leagues_country_codes() # = { "epl": 'ENG' }
teams = form_data.get_teams()
league_rounds = model_libs.get_leagues_rounds()

# Change depending on which model you want to run
target = "converted_goals" # converted_goals or points

# 'testing' should be False when using the CSVs instead of pulling from the database.
#   - Testers won't have the database.
#   - When True, all data is rebuilt from scratch, which takes a long time.
# NOTE(review): every later cell that branches on `testing` re-assigns it to
# False first, so the value set here is never actually read.
testing = True

INITIALIZED...


#### Loading the raw data from either the database or the CSV (testers use the CSV). Testers do not strictly need this step.

In [None]:
data_csv = 'raw_data_10_12_16.csv'
testing = False
if not testing:
    # A CSV written by to_csv() carries the old index as its first column,
    # so that column is dropped straight after reading.
    raw_data = pd.read_csv(data_csv).pipe(
        lambda frame: frame.drop(frame.columns[[0]], axis=1)
    )
else:
    # Rebuild the raw data via form_data.run_data() and cache it as a CSV.
    raw_data = form_data.run_data()
    raw_data.to_csv(data_csv)
    print("Raw Data Saved to CSV")

pd.set_option("display.max_columns", 85)
print('Data Loaded...')
print("Dataset size :: {}".format(raw_data.shape))
display(raw_data.head())

## FORMATTING

#### Computing the quartile rankings and writing them to the ranked-data CSV

In [2]:
# Helper Function - Removes Columns to Ignore and Splits the Target Column
def split_target(data):
    """Drop the ignored columns from `data` and separate the target column.

    Uses the notebook-level `ignore_cols` list and `target` name.

    Parameters
    ----------
    data : the frame handed to model_libs._clone_and_drop.

    Returns
    -------
    (X, y) : the features and the target, as produced by
        model_libs._extract_target (which returns them as (y, X)).
    """
    td = model_libs._clone_and_drop(data, ignore_cols)
    # The notebook defines `target`; the previous `target_col` was never
    # defined anywhere and raised NameError on the first call.
    (y, X) = model_libs._extract_target(td, target)
    return X, y
testing = False
# The data needs some formatting before we run the models
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

if not testing:
    # Read the cached rankings; the first column is the index saved by to_csv().
    ranked_data = pd.read_csv('ranked_data_10_12_16.csv').pipe(
        lambda frame: frame.drop(frame.columns[[0]], axis=1)
    )
else:
    # Recompute the rankings from the raw data and cache them as a CSV.
    ranked_data = form_data.get_rankings(leagues, teams, league_rounds, raw_data, False)
    ranked_data.to_csv('ranked_data_10_12_16.csv')

print('Ranked Data Loaded...')

Ranked Data Loaded...


### BUILDING CLASSIFICATION MODEL

- If 'converted_goals' is the target, the goals are converted to a binary class (0-1 goals or 2+ goals)
- Also the area where we transform the data as desired whether using ratios or the numbers themselves
- Then we remove the unwanted data from our DF
- Build a simple model to confirm it's working and then check the accuracy of that model

In [3]:
def run_features(data, drop_data, target, models):
    """Train the requested models on `data` after removing `drop_data` columns.

    Parameters
    ----------
    data : frame holding the features and the target column.
    drop_data : column labels removed from `data` before training.
    target : name of the target column to split off.
    models : model keys forwarded to form_model.train_models.

    Returns
    -------
    Whatever form_model.train_models returns for the notebook-level
    `round_number`, the extracted features and target, and `models`.
    """
    trimmed = data.drop(drop_data, axis=1)
    # _extract_target hands back the target first, then the features.
    y, X = model_libs._extract_target(trimmed, target)
    return form_model.train_models(round_number, X, y, models)

# Positional indices of the rows that contain at least one missing value.
# np.flatnonzero replaces Series.nonzero(), and axis is passed by keyword,
# because both older spellings are gone from current pandas.
inds = np.flatnonzero(pd.isnull(ranked_data).any(axis=1))
print(inds)

# Formatting data to convert goals scored to the correct category
formatted_data = ranked_data.copy()

if target == "converted_goals":
    # Not using points as a target for this version, using goals
    formatted_data['converted_goals'] = formatted_data.apply(lambda row: model_libs.set_group(row['goals']), axis=1)
    formatted_data = formatted_data.drop(['points', 'goals'], axis=1)
else:
    formatted_data = formatted_data.drop(['goals'], axis=1)

# This is where you manipulate the features as desired.
# ////////////////////////////////////////////////////////////////////////////
# Using diff_squared methods for features
# ////////////////////////////////////////////////////////////////////////////
# (new feature, current-team column, opponent column) - applied in this order.
diff_feature_sources = [
    ("diff_goals_for", "goals_for", "opp_goals_for"),
    ("diff_goals_allowed", "goals_against", "opp_goals_against"),
    ("diff_attacks", "current_team_attacks", "opp_team_attacks"),
    ("diff_dangerous_attacks", "current_team_dangerous_attacks", "opp_team_dangerous_attacks"),
    ("diff_goal_attempts", "current_team_goal_attempts", "opp_team_goal_attempts"),
    ("diff_ball_safe", "current_team_ball_safe", "opp_team_ball_safe"),
    ("diff_possession", "current_team_possession", "opp_team_possession"),
    # Disabled features:
    #("diff_corner_kicks", "current_team_corner_kicks", "opp_team_corner_kicks"),
    #("diff_goal_kicks", "current_team_goal_kicks", "opp_team_goal_kicks"),
    #("diff_saves", "current_team_saves", "opp_team_saves"),
]
for feature, team_col, opp_col in diff_feature_sources:
    # Default arguments pin the column names for this iteration's lambda.
    formatted_data[feature] = formatted_data.apply(
        lambda row, a=team_col, b=opp_col: model_libs.diff_square(row[a], row[b]),
        axis=1,
    )

columns_to_drop = ['current_record', 'opp_record', 'goals_for', 'opp_goals_for', 'goals_against', 'opp_goals_against', 'rpi']

_, stats = form_data.get_columns()

# Remove identifiers, the raw inputs of the diff features and the raw stats.
formatted_data = formatted_data.drop(ignore_cols + columns_to_drop + stats, axis=1)

#### Running ALL Features
# Any target other than "converted_goals" is modelled on the 'points' column.
target_column = 'converted_goals' if target == "converted_goals" else 'points'
models_test_1 = run_features(formatted_data, [], target_column, ["knn"])
(formatted_y, formatted_X) = model_libs._extract_target(formatted_data, target_column)

print(formatted_X.columns)

# Quick accuracy check for a fitted model.  Not the final evaluation function
def check_accuracy(model, data_X, y):
    """Print the mean of model_libs.predictions_diff over all rows.

    Parameters
    ----------
    model : fitted estimator exposing predict().
    data_X : features passed to model.predict.
    y : actual targets; only y.values is used, so its index is ignored.

    Prints the summed per-row score divided by the number of rows; returns None.
    """
    observed = pd.DataFrame(y.values, columns=['actual'])
    predicted = pd.DataFrame(model.predict(data_X), columns=['predictions'])
    predictions = pd.concat([predicted, observed], axis=1)
    predictions['accuracy'] = predictions.apply(
        lambda r: model_libs.predictions_diff(r['predictions'], r['actual']),
        axis=1,
    )
    row_count = float(len(predictions['accuracy']))
    accuracy = np.divide(predictions['accuracy'].sum(), row_count)
    print(accuracy)

# Report the accuracy of each model returned by run_features on the full data set.
for fitted_model in models_test_1:
    check_accuracy(fitted_model, formatted_X, formatted_y)

[]
-----------------------------------
Training K Neighbors Classifier Model
KNN Score on Training Set :: 0.7572649572649572
KNN Score on Test Set:: 0.6258503401360545
Finished K-Means Modeling
Index(['is_home', 'offensive_ranking', 'opp_defensive_ranking', 'rpi_ranking',
       'opp_rpi_ranking', 'diff_goals_for', 'diff_goals_allowed',
       'diff_attacks', 'diff_dangerous_attacks', 'diff_goal_attempts',
       'diff_ball_safe', 'diff_possession'],
      dtype='object')
0.73087431694


### Run, tune, and save all models 

In [4]:
# Tune every model listed in all_models and collect the results in order.
model_results = []
for model_key in all_models:
    tuned = form_model.build_tuned_model(formatted_X, formatted_y, model_key)
    model_results.append(tuned)
    print('Accuracy :: ')
    # The first element of the result is the estimator passed to check_accuracy.
    check_accuracy(tuned[0], formatted_X, formatted_y)

-----------------------------------
Training K-Means Model
KNN Score :: 0.7623931623931623 for Training
KNN Score :: 0.5510204081632653 for Testing
KNN Score :: 0.7675213675213676 for Training
KNN Score :: 0.5782312925170068 for Testing
KNN Score :: 0.7794871794871795 for Training
KNN Score :: 0.6190476190476191 for Testing
KNN Score :: 0.7440273037542662 for Training
KNN Score :: 0.6438356164383562 for Testing
KNN Score :: 0.768313458262351 for Training
KNN Score :: 0.5793103448275863 for Testing
Finished K-Means Modeling
Accuracy :: 
0.73087431694
Training LOG REG Model
Score on Training Set :: 0.582905982905983
Score on Test Set :: 0.6190476190476191
Finished LOG REG Modeling
Accuracy :: 
0.590163934426
Training and Tuning SVC Model
[ 0.59385666  0.59589041]
Accuracy: 0.59 (+/- 0.00)
Finished SVC Modeling
Accuracy :: 
0.91393442623
-----------------------------------
Training Gaussian NB Model
[ 0.62711864  0.65254237  0.63247863  0.62931034  0.56034483]
Accuracy: 0.62 (+/- 0.06)
Fi

### Models have been saved so let's import them into one object.

In [5]:
# Load the models for every key in all_models into a single object.
# NOTE(review): later cells index this result positionally (0..4), which
# presumes it follows the order of all_models - confirm in form_model.load_models.
prediction_models = form_model.load_models(all_models)

Success :: Loaded - knn
Success :: Loaded - log
Success :: Loaded - svc
Success :: Loaded - gnb
Success :: Loaded - randomForest


### Pulling the upcoming matches for the week and then ranking the quartiles like we did on the previous data

In [6]:
print('Upcoming matches')
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)

#upcoming_matches, match_details = predict_matches.get_upcoming_matches()
#upcoming_matches.to_csv('upcoming_matches_10_12_16.csv')
# Read the cached upcoming matches; the first column is the index saved by to_csv().
upcoming_matches = pd.read_csv('upcoming_matches_10_12_16.csv').pipe(
    lambda frame: frame.drop(frame.columns[[0]], axis=1)
)
upcoming_data = predict_matches.predictions(upcoming_matches)
display(upcoming_data.head())

testing = False
if not testing:
    # Read the cached rankings for the upcoming matches, again without the saved index.
    upcoming_ranked_data = pd.read_csv('upcoming_ranked_data_10_12_16.csv').pipe(
        lambda frame: frame.drop(frame.columns[[0]], axis=1)
    )
else:
    # Recompute the rankings for the upcoming matches and cache them as a CSV.
    upcoming_ranked_data = form_data.get_rankings(leagues, teams, league_rounds, upcoming_data, True)
    upcoming_ranked_data.to_csv('upcoming_ranked_data_10_12_16.csv')

print('Loaded Upcoming Data...')
#display(upcoming_ranked_data.head())

Upcoming matches


Unnamed: 0,match_id,team_id,team_name,opp_id,opp_name,scheduled,round,games_played,is_home,current_formation,current_record,opp_record,goals_for,opp_goals_for,goals_against,opp_goals_against,rpi,goals,points,current_team_possession,current_team_yellow_cards,current_team_goal_attempts,current_team_dangerous_attacks,current_team_sec_half_goals,current_team_saves,current_team_corner_kicks,current_team_ball_safe,current_team_first_half_goals,current_team_shots_on_target,current_team_attacks,current_team_goal_attempts_allowed,current_team_goal_kicks,current_team_shots_total,opp_team_possession,opp_team_yellow_cards,opp_team_goal_attempts,opp_team_dangerous_attacks,opp_team_sec_half_goals,opp_team_saves,opp_team_corner_kicks,opp_team_ball_safe,opp_team_first_half_goals,opp_team_shots_on_target,opp_team_attacks,opp_team_goal_attempts_allowed,opp_team_goal_kicks,opp_team_shots_total
0,79.0,41,Southampton FC,53.0,Burnley FC,2016-10-16 15:00:00,8,3.0,1,4-3-3,0.833333,0.285714,4,2,0,4,0.595238,0,0,173.0,3.0,36.0,147.0,3,5.0,15.0,309.0,1,20.0,339.0,19.0,30.0,36.0,113.0,,19.0,89.0,1,7.0,10.0,297.0,1,8.0,293.0,26.0,28.0,19.0
1,72.0,42,Manchester City,44.0,Everton FC,2016-10-15 14:00:00,8,3.0,1,4-3-3,0.714286,0.5,7,4,3,3,0.763799,0,0,172.0,7.0,37.0,152.0,4,13.0,15.0,286.0,3,23.0,310.0,28.0,19.0,37.0,157.0,9.0,25.0,136.0,0,2.0,19.0,293.0,4,11.0,312.0,15.0,17.0,25.0
2,75.0,43,Arsenal FC,45.0,Swansea City,2016-10-15 14:00:00,8,3.0,1,4-2-3-1,1.0,0.0,8,2,1,6,0.565476,0,0,179.0,,33.0,174.0,4,4.0,19.0,274.0,4,16.0,322.0,17.0,16.0,33.0,112.0,5.0,26.0,109.0,0,8.0,6.0,280.0,2,11.0,261.0,34.0,35.0,26.0
3,72.0,44,Everton FC,42.0,Manchester City,2016-10-15 14:00:00,8,3.0,0,4-3-3,0.5,0.714286,4,7,3,3,0.625,0,0,157.0,9.0,25.0,136.0,0,2.0,19.0,293.0,4,11.0,312.0,15.0,17.0,25.0,172.0,7.0,37.0,152.0,4,13.0,15.0,286.0,3,23.0,310.0,28.0,19.0,37.0
4,75.0,45,Swansea City,43.0,Arsenal FC,2016-10-15 14:00:00,8,3.0,0,4-2-3-1,0.0,1.0,2,8,6,1,0.53125,0,0,112.0,5.0,26.0,109.0,0,8.0,6.0,280.0,2,11.0,261.0,34.0,35.0,26.0,179.0,,33.0,174.0,4,4.0,19.0,274.0,4,16.0,322.0,17.0,16.0,33.0


Loaded Upcoming Data...


### Formatting the features as needed, the same way we did for the training data earlier in the build

In [7]:
# Build the same diff features for the upcoming matches as for the training data.
upcoming_formatted_data = upcoming_ranked_data.copy()

# ////////////////////////////////////////////////////////////////////////////
# Using diff_squared methods for features
# ////////////////////////////////////////////////////////////////////////////
# (new feature, current-team column, opponent column) - applied in this order.
upcoming_diff_feature_sources = [
    ("diff_goals_for", "goals_for", "opp_goals_for"),
    ("diff_goals_allowed", "goals_against", "opp_goals_against"),
    ("diff_attacks", "current_team_attacks", "opp_team_attacks"),
    ("diff_dangerous_attacks", "current_team_dangerous_attacks", "opp_team_dangerous_attacks"),
    ("diff_goal_attempts", "current_team_goal_attempts", "opp_team_goal_attempts"),
    ("diff_ball_safe", "current_team_ball_safe", "opp_team_ball_safe"),
    ("diff_possession", "current_team_possession", "opp_team_possession"),
    # Disabled features:
    #("diff_corner_kicks", "current_team_corner_kicks", "opp_team_corner_kicks"),
    #("diff_goal_kicks", "current_team_goal_kicks", "opp_team_goal_kicks"),
    #("diff_saves", "current_team_saves", "opp_team_saves"),
]
for feature, team_col, opp_col in upcoming_diff_feature_sources:
    # Default arguments pin the column names for this iteration's lambda.
    upcoming_formatted_data[feature] = upcoming_formatted_data.apply(
        lambda row, a=team_col, b=opp_col: model_libs.diff_square(row[a], row[b]),
        axis=1,
    )

columns_to_drop = ['current_record', 'opp_record', 'goals_for', 'opp_goals_for', 'goals_against', 'opp_goals_against', 'rpi']
_, stats = form_data.get_columns()

# axis is passed by keyword: the positional form (`drop(labels, 1)`) is no
# longer accepted by current pandas.  'points' and 'goals' are dropped as well,
# so only feature columns remain.
upcoming_formatted_data = upcoming_formatted_data.drop(
    ignore_cols + columns_to_drop + stats + ['points', 'goals'], axis=1
)
print(upcoming_formatted_data.columns)

Index(['is_home', 'offensive_ranking', 'opp_defensive_ranking', 'rpi_ranking',
       'opp_rpi_ranking', 'diff_goals_for', 'diff_goals_allowed',
       'diff_attacks', 'diff_dangerous_attacks', 'diff_goal_attempts',
       'diff_ball_safe', 'diff_possession'],
      dtype='object')


### Using the prediction_models object to predict the upcoming matches

In [8]:
# Models we'll use to predict on upcoming matches.
# upcoming_formatted_data holds all the X values.

# Positional indices of rows with at least one missing feature.
# np.flatnonzero replaces Series.nonzero(), and axis is passed by keyword,
# because both older spellings are gone from current pandas.
inds = np.flatnonzero(pd.isnull(upcoming_formatted_data).any(axis=1))
print(inds)

# NOTE(review): the positional indices below presume prediction_models follows
# the order of all_models (knn, log, svc, gnb, randomForest) - confirm.
rf_preds = prediction_models[4].predict(upcoming_formatted_data)
print(rf_preds)

knn_preds = prediction_models[0].predict(upcoming_formatted_data)
print(knn_preds)

svc_preds = prediction_models[2].predict(upcoming_formatted_data)
print(svc_preds)

log_preds = prediction_models[1].predict(upcoming_formatted_data)
print(log_preds)

# Per-class probabilities from the same model that produced log_preds.
log_prob = prediction_models[1].predict_proba(upcoming_formatted_data)
probs = pd.DataFrame(log_prob)
# Only the columns that actually exist are renamed (two for a binary target).
probs = probs.rename(columns = {0:'Probability 0', 1: 'Probability 1', 2: 'Probability 2'})
#display(probs)
#probs.to_csv('probs.csv')

gnb_preds = prediction_models[3].predict(upcoming_formatted_data)
print(gnb_preds)


[]
[1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 1 0 1 1 0
 0 0 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0
 1 1 1 0]
[0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]
[1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0
 1 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0
 0 1 1 0]
[1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0
 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0
 1 1 1 1]


## Verifying the results

 - Takes the upcoming matches and adds the previous predictions to that DF.  
 - Also adds a random series to that DF as a baseline
 - Since the upcoming_matches DF is ordered by Team Id, we reorder it so that the Current Team row and the Opponent Team Row are next to each other for easier visualisation.
 - Adds the actual_results dataframe so we can compare the results side by side
 - Spits the results out into a CSV for visualisation

In [9]:
columns = ['team_name', 'opp_name', 'scheduled', 'is_home']
# Remove all columns except the ones above
upcoming_matches = upcoming_data[columns]

# Random baseline: 0/1 for the binary target, otherwise 0/1/3 (the 2 is remapped to 3).
# NOTE(review): no random seed is set, so this column differs on every run.
if target == 'converted_goals':
    random_preds = pd.Series(np.random.randint(2, size=len(upcoming_matches.index)), upcoming_matches.index)
else:
    random_preds = pd.Series(np.random.randint(3, size=len(upcoming_matches.index)), upcoming_matches.index)
    random_preds[random_preds == 2] = 3

# Add predictions to the end of that DF
results = pd.DataFrame({'KNN': knn_preds, 'RandomForest': rf_preds, 'SVC': svc_preds, 'GNB': gnb_preds, 'log': log_preds, 'random': random_preds})
upcoming_matches = pd.concat([upcoming_matches, results, probs], axis = 1)

# Reorder so that each team's row is directly followed by its opponent's row.
# The order is built from row positions in a single pass and the frame is
# sliced once.  This replaces growing a frame with DataFrame.append inside a
# nested loop, which was quadratic and is not available in pandas 2.x.
positions_by_team = {}
for position, team in enumerate(upcoming_matches['team_name'].tolist()):
    if pd.notnull(team):
        positions_by_team.setdefault(team, []).append(position)

ordered_positions = []
already_placed = set()
for position, opponent in enumerate(upcoming_matches['opp_name'].tolist()):
    opponent_positions = positions_by_team.get(opponent)
    if not opponent_positions:
        # The opponent has no row of its own, so this row is left out (as before).
        continue
    for candidate in [position] + opponent_positions:
        if candidate not in already_placed:
            already_placed.add(candidate)
            ordered_positions.append(candidate)

# drop_duplicates() is kept so rows with identical values are still collapsed.
reordered_matches = upcoming_matches.iloc[ordered_positions].drop_duplicates()
if target == 'points':
    columns = ['scheduled', 'team_name', 'opp_name', 'is_home', 'KNN', 'RandomForest', 'SVC', 'GNB', 'log', 'random', 'Probability 0', 'Probability 1', 'Probability 2']
else:
    columns = ['scheduled', 'team_name', 'opp_name', 'is_home', 'KNN', 'RandomForest', 'SVC', 'GNB', 'log', 'random', 'Probability 0', 'Probability 1']

reordered_matches = reordered_matches[columns]
# To compare when we have actual results
#actual_results = pd.read_csv('actual_results.csv')
#actual_results = actual_results.rename(columns = {'Unnamed: 0':'idx'})
#indexed_results = actual_results.set_index('idx')
#reordered_matches = pd.concat([reordered_matches, indexed_results], axis=1)

reordered_matches = reordered_matches.reset_index(drop=True)
display(reordered_matches.head(5))
# NOTE(review): the file name says "converted_goals" even when target == 'points'.
reordered_matches.to_csv('predictions_10_12_16_converted_goals.csv')
print('Prediction CSV saved')

Unnamed: 0,scheduled,team_name,opp_name,is_home,KNN,RandomForest,SVC,GNB,log,random,Probability 0,Probability 1
0,2016-10-16 15:00:00,Southampton FC,Burnley FC,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.27668,0.72332
1,2016-10-16 15:00:00,Burnley FC,Southampton FC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.834455,0.165545
2,2016-10-15 14:00:00,Manchester City,Everton FC,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.378294,0.621706
3,2016-10-15 14:00:00,Everton FC,Manchester City,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.692062,0.307938
4,2016-10-15 14:00:00,Arsenal FC,Swansea City,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.173981,0.826019


Prediction CSV saved


## F1 Score

In [None]:
from sklearn.metrics import confusion_matrix
column_model = "GNB"  #'KNN', 'RandomForest', 'SVC', 'GNB', 'log', 'random'

y_pred = reordered_matches[column_model]

# The column holding the actual outcome depends on the target being modelled.
truth_column = "actual_converted_goals" if target == 'converted_goals' else "actual"
y_true = reordered_matches[truth_column]
display(reordered_matches[[truth_column, column_model]].head(10))

# Weighted F1 first, then the raw confusion matrix for the chosen model.
print(f1_score(y_true, y_pred, average='weighted'))
#import seaborn as sns
cnf_matrix = confusion_matrix(y_true, y_pred)
#_ = sns.heatmap(cnf_matrix, annot=True, cmap='YlGnBu')
print(cnf_matrix)

#### For converted_goals this just checks the accuracy of the binary classifier. For points it gives more detail on the result. Change `column_model` to choose which classifier's results you want to see

In [None]:
# Accuracy report for the classifier named by `column_model`.
#   - converted_goals: reloads the predictions CSV and prints the mean of
#     model_libs.predictions_diff(actual, predicted).
#   - otherwise (points): walks the rows in pairs and counts actual vs predicted
#     home wins and draws, then prints the per-row accuracy.
if target == 'converted_goals':
    actual = "actual_converted_goals"
    # NOTE(review): the earlier cell writes 'predictions_10_12_16_converted_goals.csv';
    # this reads a different file name, presumably one that already holds the
    # "actual_converted_goals" column - confirm.
    reordered_matches = pd.read_csv('predictions_converted_goals.csv')
    # Drop the first column (the index saved by to_csv()).
    reordered_matches = reordered_matches.drop(reordered_matches.columns[[0]], axis=1)
    reordered_matches['accuracy'] = reordered_matches.apply(lambda r: model_libs.predictions_diff(r[actual], r[column_model]), axis=1)
    accuracy = np.divide(reordered_matches['accuracy'].sum(), float(len(reordered_matches['accuracy'])))
    #display(reordered_matches[[column_model, actual]])
    print(accuracy)
else:
    actual = "actual"
    results_data = reordered_matches.copy()
    home_actual_win = 0
    home_predicted_win = 0
    predicted_draws = 0
    actual_draws = 0
    valid_matches = 0
    total_matches = 0
    invalid_predictions = []
    correct_predictions = 0

    for r, rows in results_data.iterrows():
        # Every even index label starts a pair of rows (labels r and r+1).
        # Assumes the index is 0..n-1 with the two rows of a match adjacent - TODO confirm.
        if r % 2 == 0:
            total_matches += 1
            predictions = results_data.loc[r:r+1, ["is_home", column_model, actual]]
            # NOTE(review): .iloc[0] below raises IndexError when neither row has is_home == 1.
            home_team = predictions[predictions["is_home"] == 1]
            #print(predictions)
            if home_team.iloc[0]["actual"] == 3:
                home_actual_win += 1
            elif home_team.iloc[0]["actual"] == 1:
                actual_draws += 1

            # Check if it's a valid prediction (0-3, 3-0, 1-1)
            # NOTE(review): is_valid is assigned but never read.
            is_valid = False
            if ((predictions.iloc[0][column_model] == 1) & (predictions.iloc[1][column_model] == 1)) or ((predictions.iloc[0][column_model] == 3) & (predictions.iloc[1][column_model] == 0)) or ((predictions.iloc[0][column_model] == 0) & (predictions.iloc[1][column_model] == 3)):
                is_valid = True
                valid_matches += 1

                if home_team.iloc[0][column_model] == 3:
                    home_predicted_win += 1

                    if home_team.iloc[0]["actual"] == 3:
                        correct_predictions += 1

                if (predictions.iloc[0][column_model] == 1) & (predictions.iloc[1][column_model] == 1):
                    predicted_draws += 1

                    if home_team.iloc[0]["actual"] == 1:
                        correct_predictions += 1
                # NOTE(review): a predicted away win (home row predicted 0) never
                # increments correct_predictions, even when it is right - confirm intent.
            else:
               invalid_predictions.append(predictions) 

    print(column_model)        
    print('Total Matches :: {}'.format(total_matches))
    print('Valid Predicted Matches :: {}'.format(valid_matches))
    print('Actual Home Team Wins :: {}'.format(home_actual_win))
    print('Home Predicted Wins :: {}'.format(home_predicted_win))
    print('Actual Draws :: {}'.format(actual_draws))
    print('Predicted Draws :: {}'.format(predicted_draws))
    print('Correct Predictions :: {}'.format(correct_predictions))
    
    # Per-row accuracy: each row is scored on its own, independent of the pairing above.
    results_data['accuracy'] = results_data.apply(lambda r: model_libs.predictions_diff(r[column_model], r[actual]), axis=1)
    accuracy = np.divide(results_data['accuracy'].sum(), float(len(results_data['accuracy'])))
    print('Individual Accuracy :: {}'.format(accuracy))

    #print(invalid_predictions)
    

#### Removes half of the matches and verifies the accuracy of those predictions

In [None]:
if target == "points":
    # Keep every second row (positions 1, 3, 5, ...).  The explicit .copy()
    # gives an independent frame, so adding the 'accuracy' column below does not
    # write into a slice of reordered_matches (SettingWithCopyWarning).
    abbreviated_matches = reordered_matches.iloc[1::2, :].copy()
    #abbreviated_matches = reordered_matches.iloc[::2, :].copy()
    # NOTE(review): this scores the model against the "random" baseline column,
    # not against the actual results - confirm that is intended.
    abbreviated_matches['accuracy'] = abbreviated_matches.apply(lambda r: model_libs.predictions_diff(r[column_model], r["random"]), axis=1)
    accuracy = np.divide(abbreviated_matches['accuracy'].sum(), float(len(abbreviated_matches['accuracy'])))
    print('Accuracy :: {}'.format(accuracy))

In [None]:
# Score the 'log' predictions against the actual results, row by row.
reordered_matches['accuracy'] = reordered_matches.apply(
    lambda match: model_libs.predictions_diff(match['log'], match['actual']),
    axis=1,
)

# Home/away (rows) against actual outcome (columns); plotted before totals are added.
home_score = pd.crosstab(reordered_matches['is_home'], reordered_matches['actual'])
home_score.plot.bar(stacked=True)

# Add the row totals first, then a bottom row of column totals (which includes them).
home_score['Total'] = home_score.sum(axis=1)
home_score.loc['Total'] = home_score.sum()
display(home_score)