# Football Match Predictor - Data pipeline and SQL database

## Library Imports

In [1]:
# Add all the library imports required
import json

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

import data_cleaning
import feature_engineering

## Generate Pipeline

In [2]:
# Import all the data and merge into one data frame.
# The pipeline is called with no arguments, so the location of the raw data is
# decided inside data_cleaning.import_and_merge_data_pipeline (not visible here).
merged_data_df = data_cleaning.import_and_merge_data_pipeline()

In [3]:
# Run the cleaning pipeline over the merged frame
cleaned_df = data_cleaning.clean_data_pipeline(merged_data_df)

# Sanity check: total number of missing cells left after cleaning
# (per-column null counts, then summed; displayed as the cell output)
pd.isna(cleaned_df).sum().sum()

0

## Feature Engineering

In [4]:
# Build the engineered features from the cleaned match data:
#   * one frame of home/away features
#   * a pair of frames from generate_team_season_features (home side, away side)
home_away_features_df = feature_engineering.generate_home_away_features(cleaned_df)
home_data_transformed, away_data_transformed = feature_engineering.generate_team_season_features(cleaned_df)

# DataFrame.join aligns on the index and defaults to a left join, so the
# team-season columns are attached to the rows of home_away_features_df.
transformed_df = (home_away_features_df
    .join(home_data_transformed)
    .join(away_data_transformed)
)

# Fixed seed so the preview shows the same five rows on every Restart & Run All.
transformed_df.sample(5, random_state=42)

Unnamed: 0_level_0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,...,home_team_home_form,away_team_away_form,home_team_home_total_goals,away_team_away_total_goals,home_form,home_total_goals,home_discipline,away_form,away_total_goals,away_discipline
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bayern-munchen/karlsruher-sc/1992,Bayern München,Karlsruher SC,1-0,https://www.besoccer.com/match/bayern-munchen/...,1992,19,bundesliga,1,0,home_win,...,4.0,4.0,14.0,18.0,6.0,23.0,2.2,7.0,24.0,2.4
salernitana-calcio-1919/palermo/2018,Salernitana,Palermo FC,0-2,https://www.besoccer.com/match/salernitana-cal...,2018,42,serie_b,0,2,away_win,...,6.0,4.0,28.0,25.0,7.0,51.0,3.6,8.0,57.0,1.2
sv-sandhausen/eintracht-brau-hellip/2017,Sandhausen,Braunschweig,0-1,https://www.besoccer.com/match/sv-sandhausen/e...,2017,22,2_liga,0,1,away_win,...,4.0,3.0,15.0,7.0,7.0,29.0,2.4,4.0,33.0,2.6
rm-castilla/lorca-deportiva-cf-sad/2006,RM Castilla,Lorca Deportiva,1-2,https://www.besoccer.com/match/rm-castilla/lor...,2006,8,segunda_division,1,2,away_win,...,3.0,3.0,6.0,5.0,4.0,12.0,2.0,6.0,7.0,2.0
toulouse-fc/paris-saint-germain-fc/2006,Toulouse,PSG,1-0,https://www.besoccer.com/match/toulouse-fc/par...,2006,21,ligue_1,1,0,home_win,...,6.0,4.0,10.0,8.0,7.0,19.0,1.8,7.0,26.0,1.8


In [5]:
# Sanity check: total number of missing cells in the joined feature frame
# (per-column null counts, then summed; displayed as the cell output)
pd.isna(transformed_df).sum().sum()

0

In [6]:
# List every column of the joined frame: the home/away feature columns plus the
# columns brought in by the two team-season joins.
transformed_df.columns

Index(['home_team', 'away_team', 'score', 'link', 'season_year', 'match_round',
       'league', 'home_goals', 'away_goals', 'result', 'home_points',
       'away_points', 'capacity', 'missing_cards', 'missing_elo',
       'home_yellow', 'home_red', 'home_elo', 'away_yellow', 'away_red',
       'away_elo', 'home_team_home_form', 'away_team_away_form',
       'home_team_home_total_goals', 'away_team_away_total_goals', 'home_form',
       'home_total_goals', 'home_discipline', 'away_form', 'away_total_goals',
       'away_discipline'],
      dtype='object')

## SQL Database

In [6]:
# Load the database credentials from a local JSON file so they are not
# hard-coded in the notebook. The context manager closes the file even if
# json.load raises on malformed content; JSON text is read as UTF-8 rather than
# whatever the platform's locale encoding happens to be.
with open('sql_details.json', encoding='utf-8') as sql_details_file:
    sql_details = json.load(sql_details_file)

In [7]:
# Connection settings for the local PostgreSQL instance; the password and
# database name are read from the sql_details mapping loaded from sql_details.json.
DATABASE_TYPE = 'postgresql'
DBAPI = 'psycopg2'
HOST = 'localhost'
USER = 'postgres'
PASSWORD = sql_details['password']
DATABASE = sql_details['database']
PORT = 5432

# Build the URL from its components instead of string formatting: URL.create
# escapes special characters (e.g. '@', ':', '/') in the password, which would
# otherwise corrupt a hand-assembled connection string. str() keeps the
# previous f-string behaviour for non-string JSON values such as numbers.
connection_url = URL.create(
    drivername=f"{DATABASE_TYPE}+{DBAPI}",
    username=USER,
    password=str(PASSWORD),
    host=HOST,
    port=PORT,
    database=str(DATABASE),
)
engine = create_engine(connection_url)

# Write the cleaned data to the 'cleaned_dataset' table, replacing any existing
# table of that name; the return value is displayed as the cell output.
cleaned_df.to_sql('cleaned_dataset', engine, if_exists='replace')

619