# Run feature analysis on match data

## Import Packages

In [1]:
import os
import pickle
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

## Query statistics table from Postgres for analysis

In [None]:
# Load the whole `match_statistics` table from PostgreSQL into the DataFrame `df`.
#
# Connection settings come from the standard libpq environment variables so that
# real credentials never have to be typed into (and committed with) the notebook.
# The fallbacks are the notebook's original placeholder values.
db_user = os.environ.get('PGUSER', 'db_user')              # PostgreSQL username
db_password = os.environ.get('PGPASSWORD', 'db_password')  # PostgreSQL password
db_host = os.environ.get('PGHOST', 'localhost')            # PostgreSQL host (name or IP)
db_port = os.environ.get('PGPORT', '5432')                 # PostgreSQL port (default is 5432)
db_name = os.environ.get('PGDATABASE', 'db_name')          # Database to connect to

# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters and corrupt the connection string.
connection_string = (
    f"postgresql+psycopg2://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)

# Create the SQLAlchemy engine
engine = create_engine(connection_string)

# Read-only query: fetch every row and column of the statistics table
query = "SELECT * FROM match_statistics"

try:
    # Run the query and materialise the result as a DataFrame
    df = pd.read_sql(query, engine)

    # NOTE: nothing is written to the database here; the message text is kept
    # unchanged so it still matches the output recorded in the notebook.
    print("Table created successfully.")

finally:
    # Release the engine's pooled connections whether or not the read succeeded
    engine.dispose()
    print("Connection closed.")

Table created successfully.
Connection closed.


### See percentage of games that have each result

In [4]:
#checking class proportions
class_p = (df.winner.value_counts(normalize = True) *100).round(2).reset_index()
class_p.columns = ['winner', '%']
class_p

Unnamed: 0,winner,%
0,HOME_TEAM,45.79
1,AWAY_TEAM,29.79
2,DRAW,24.43


## See correlation between fields and winners

In [5]:
#creating dummy dependent variables and set of columns to check their correlation with the dependent variables
num_cols = df.dtypes[df.dtypes != 'object'].index.tolist()

cols_to_drop = ['season', 'match_name','date', 'home_team', 'away_team', 'home_score', 'away_score',
                'home_match_points', 'away_match_points']


corr_cols = list(set(num_cols) - set(cols_to_drop))

df['winner_h'] = np.where(df.winner == 'HOME_TEAM', 1, 0)
df['winner_a'] = np.where(df.winner == 'AWAY_TEAM', 1, 0)
df['winner_d'] = np.where(df.winner == 'DRAW', 1, 0)

df[corr_cols + ['winner_h']].corr()['winner_h'].sort_values(ascending = False).reset_index()

Unnamed: 0,index,winner_h
0,winner_h,1.0
1,home_goals,0.639718
2,away_odds,0.361542
3,home_away_points_interaction,0.206301
4,away_rank,0.19917
5,draw_odds,0.192867
6,home_similar_rank_goals,0.185756
7,home_similar_rank_win_ratio,0.17068
8,home_wins,0.167659
9,home_points,0.15229


In [42]:
# Correlation of every candidate feature with an away win, strongest positive first.
(
    df.loc[:, [*corr_cols, 'winner_a']]
    .corr()
    .loc[:, 'winner_a']
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,index,winner_a
0,winner_a,1.0
1,away_goals,0.63628
2,home_odds,0.378431
3,home_rank,0.187634
4,away_similar_rank_goals,0.167347
5,away_similar_rank_win_ratio,0.155771
6,away_wins,0.137152
7,away_win_streak,0.134682
8,away_last_3_avg_pts,0.123761
9,away_similar_rank_goal_ratio,0.12183


In [11]:
# Correlation of every candidate feature with a draw, strongest positive first.
(
    df.loc[:, [*corr_cols, 'winner_d']]
    .corr()
    .loc[:, 'winner_d']
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,index,winner_d
0,winner_d,1.0
1,home_rank,0.049404
2,home_last_3_wavg_goals_against,0.032554
3,home_odds,0.031402
4,home_last_3_goals_against,0.02878
5,away_last_3_avg_pts,0.022725
6,away_last_3_wavg_pts,0.022049
7,away_last_3_goals,0.020944
8,home_loss_streak,0.020356
9,away_last_3_wavg_goals,0.017219
