# **ORGANIZE DYNAMIC AND ODDS DATA**

## Imports

In [None]:
# Import needed packages
import pandas as pd
import numpy as np
import json
import collections
# Change Pandas rows and columns' options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Functions

In [None]:
# Turn a nested dictionary into a flattened dictionary
def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are joined to the key of their parent with
    ``sep``, e.g. ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``.

    Args:
        d: Mapping to flatten. Values that are themselves mutable mappings
            are flattened recursively; any other value is kept as is.
        parent_key: Prefix prepended (followed by ``sep``) to every key of
            ``d``. An empty prefix leaves top-level keys untouched.
        sep: Separator inserted between a parent key and a child key.

    Returns:
        A new flat ``dict``. If two paths produce the same joined key, the
        one encountered last wins.
    """
    # The ABC aliases living directly in ``collections`` were removed in
    # Python 3.10; ``collections.abc`` is the supported location.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat

In [None]:
# Extract data from a dataframe's column (which contains list objects) and organize it in a new dataframe
def listcolumn_to_df(starting_df, column_toextract):
    """Expand a column of list literals into a wide DataFrame.

    Every cell of ``starting_df[column_toextract]`` is evaluated into a
    sequence. Its elements are numbered from 1 and stored under keys made of
    the first four letters of the column name in upper case, followed by
    ``pl`` for the player-level columns ('substitutions', 'lineup', 'bench',
    'sidelined'), followed by the element number. The resulting dictionary
    is flattened with ``flatten`` and becomes one row of the output.

    Args:
        starting_df: DataFrame holding the column to expand.
        column_toextract: Name of the column to expand.

    Returns:
        A DataFrame with one row per input row, minus the rows in which
        every value is missing.
    """
    # The key prefix does not depend on the row, so it is computed once
    if column_toextract in ['substitutions', 'lineup', 'bench', 'sidelined']:
        key_prefix = column_toextract[:4].upper() + "pl"
    else:
        key_prefix = column_toextract[:4].upper()

    flattened_rows = []
    for raw_cell in starting_df[column_toextract]:
        # NOTE(review): eval() executes whatever expression the cell contains.
        # Only use this on trusted input; if the cells are plain Python
        # literals, ast.literal_eval would be a safer drop-in - TODO confirm.
        entries = eval(raw_cell)
        numbered = {}
        for position, entry in enumerate(entries, start=1):
            numbered[key_prefix + "{0}".format(position)] = entry
        flattened_rows.append(flatten(numbered))

    expanded = pd.DataFrame(flattened_rows)
    # Rows that ended up without a single value are discarded
    expanded.dropna(how='all', inplace=True)
    return expanded

In [None]:
# Keeps only bets from either a JSON file or a Python object that appear more than a certain threshold value (n_tokeep)
def keep_n_mostcommon_odds(jsonfilepath_or_object, n_tokeep):
    """Keep only the keys that occur in more than ``n_tokeep`` fixtures.

    Args:
        jsonfilepath_or_object: Either the path (``str``) of a JSON file, or
            an already loaded object. In both cases the data must be a
            sequence of dictionaries (one per fixture).
        n_tokeep: Threshold; a key is kept only when it appears in strictly
            more than ``n_tokeep`` dictionaries.

    Returns:
        A list with one dictionary per input fixture, restricted to the
        frequent keys and keeping each fixture's original key order.

    Side effects:
        Prints the outcome of a consistency check and the number of keys
        that were kept.
    """
    if isinstance(jsonfilepath_or_object, str):
        # The context manager closes the file even if json.load() raises
        with open(jsonfilepath_or_object) as f:
            odds_data = json.load(f)
    else:
        odds_data = jsonfilepath_or_object

    # Number of fixtures in which every key appears
    feat_count = collections.Counter(key for fix in odds_data for key in fix)
    frequent_feats = {key for key, count in feat_count.items() if count > n_tokeep}
    # Walk each fixture's own keys (not the set) so that the key order of the
    # output, and therefore the column order of a DataFrame built from it,
    # does not depend on set iteration order
    new_odds = [{k: v for k, v in fix.items() if k in frequent_feats} for fix in odds_data]

    # Sanity check: the keys left in the output must be exactly the frequent ones
    check = {key for fix in new_odds for key in fix}
    if check == frequent_feats:
        print('Check for correct keys: OK')
        print('Features with more than ', n_tokeep, 'observations: ', len(frequent_feats))
    else:
        print('Check for correct keys: ERROR')
    return new_odds

In [None]:
# Add columns to the Dataframe for both the captain of team1 and team2, and drop all other captain variables in the Df 
def handling_captains(initial_df):
    """Replace the per-player captain flags with two summary columns.

    For every row, the columns whose name ends in ``captain`` are scanned in
    column order and the 1-based positions of those holding a boolean true
    are collected. When exactly two are found, the first goes to
    ``captain_team1`` and the second to ``captain_team2``; otherwise both are
    set to NaN.

    Args:
        initial_df: DataFrame containing the captain flag columns.
            NOTE(review): the result is joined back on the index, so index
            labels are presumably unique - confirm against the caller.

    Returns:
        A new DataFrame with ``captain_team1`` and ``captain_team2`` appended
        and every column whose name ends in ``_captain`` removed.
    """
    # The set of captain columns is the same for every row: compute it once
    captain_cols = initial_df.filter(regex='captain$').columns

    cap1_temp = {}
    cap2_temp = {}
    for pos, row_label in enumerate(initial_df.index):
        match_caps = []
        for num, col in enumerate(captain_cols, start=1):
            value = initial_df[col].iloc[pos]
            # A bool-dtype column yields numpy.bool_, for which "is True" is
            # always False, so both boolean types are accepted. Missing
            # values (None / NaN) are not booleans and are skipped.
            if isinstance(value, (bool, np.bool_)) and bool(value):
                match_caps.append(num)
        if len(match_caps) == 2:
            cap1_temp[row_label], cap2_temp[row_label] = match_caps
        else:
            cap1_temp[row_label] = cap2_temp[row_label] = np.nan

    # Attach the two summary columns by index label
    initial_df = initial_df.merge(pd.Series(cap1_temp).rename('captain_team1'), left_index=True, right_index=True).merge(
        pd.Series(cap2_temp).rename('captain_team2'), left_index=True, right_index=True)
    # The merge returned a new frame, so dropping here leaves the caller's frame untouched
    initial_df.drop(initial_df.filter(regex='_captain$').columns, axis = 1, inplace=True)
    return initial_df

# **ALL DYNAMIC DATA**

### **Starting Data**

In [None]:
# Load leagues_more from csv and index it by the 'id' column
leagues_more = pd.read_csv('Input Data/leagues_more.csv', low_memory=False).set_index('id')
# Preview the last rows (rendered as the cell output)
leagues_more.tail()

### **Lineup Data**

In [None]:
# Build the lineup Dataframe from the 'lineup' column
lineups_df = listcolumn_to_df(leagues_more, 'lineup')
# Team ids are read from entries 1 and 17, the fixture id from entry 1
# NOTE(review): entry 17 is presumably the first player of the second team - confirm
lineups_df['team_id_TEAM1'] = lineups_df['LINEpl1_team_id']
lineups_df['team_id_TEAM2'] = lineups_df['LINEpl17_team_id']
lineups_df['Fix_ID'] = lineups_df['LINEpl1_fixture_id'].astype('int')
# Index by fixture id, then discard the columns holding only NAs
lineups_df = lineups_df.set_index('Fix_ID')
lineups_df = lineups_df.dropna(axis=1, how='all')
# Discard the columns whose name ends with one of these suffixes
for suffix in ['posx', 'posy', 'number']:
    lineups_df = lineups_df.loc[:, ~lineups_df.columns.str.endswith(suffix)]
# Discard the columns whose name matches one of these patterns
for pattern in ['_type$', '_team_id$', 'formation_position$', '_fixture_id$']:
    lineups_df = lineups_df.drop(lineups_df.filter(regex=pattern).columns, axis=1)
# Collapse the per-player captain flags into captain_team1 / captain_team2
lineups_df = handling_captains(lineups_df)
# Round every numeric column and store it as nullable integer
m = lineups_df.select_dtypes(np.number)
lineups_df[m.columns] = m.round().astype('Int64')
# Cast the rating columns to float
ratings = lineups_df.filter(regex='_rating$')
lineups_df[ratings.columns] = ratings.astype('float64')
# Move the last four columns to the front
# (presumably the two team-id and the two captain columns - verify)
cols = lineups_df.columns.tolist()
cols = cols[-4:] + cols[:-4]
lineups_df = lineups_df[cols]

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(lineups_df.shape)
lineups_df.tail()

### **Subs Data**

In [None]:
# Create subs Dataframe from the 'substitutions' column
subs_df = listcolumn_to_df(leagues_more, 'substitutions')
# Take the fixture id from the first substitution entry
subs_df['Fix_ID'] = subs_df['SUBSpl1_fixture_id'].astype('int')
# Set Fixture_id as index
subs_df.set_index('Fix_ID', inplace=True)
# Drop all columns that have only NAs
subs_df.dropna(axis=1, how='all', inplace=True)
# Drop all columns whose name matches certain patterns
# (raw string: '\d' inside a plain literal is an invalid escape sequence)
subs_df.drop(subs_df.filter(regex=r'SUBSpl\d{1,2}_id$').columns, axis = 1, inplace=True)
subs_df.drop(subs_df.filter(regex='_type$').columns, axis = 1, inplace=True)
subs_df.drop(subs_df.filter(regex='_fixture_id$').columns, axis = 1, inplace=True)
subs_df.drop(subs_df.filter(regex='_extra_minute$').columns, axis = 1, inplace=True)
# Transform the injured columns into a binary variable (True -> 1, False/None -> 0)
# NOTE(review): the '_injuried' spelling presumably mirrors the source data keys - do not "fix" it
for i in subs_df.filter(regex='_injuried$').columns:
    subs_df[i] = subs_df[i].map({True: 1, None: 0, False: 0})
# Round every numeric column and store it as nullable integer
m1 = subs_df.select_dtypes(np.number)
subs_df[m1.columns]= m1.round().astype('Int64')

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(subs_df.shape)
subs_df.tail()

### **Bench Data**

In [None]:
# Build the bench Dataframe from the 'bench' column
bench_df = listcolumn_to_df(leagues_more, 'bench')
# Take the fixture id from the first bench entry
bench_df['Fix_ID'] = bench_df['BENCpl1_fixture_id'].astype('int')
# Index by fixture id, then discard the columns holding only NAs
bench_df = bench_df.set_index('Fix_ID')
bench_df = bench_df.dropna(axis=1, how='all')
# Discard the columns whose name ends with one of these suffixes
for suffix in ['posx', 'posy', 'number', 'captain']:
    bench_df = bench_df.loc[:, ~bench_df.columns.str.endswith(suffix)]
# Discard the columns whose name matches one of these patterns
for pattern in ['_type$', 'formation_position$', '_fixture_id$']:
    bench_df = bench_df.drop(bench_df.filter(regex=pattern).columns, axis=1)
# Round every numeric column and store it as nullable integer
m2 = bench_df.select_dtypes(np.number)
bench_df[m2.columns] = m2.round().astype('Int64')
# Cast the rating columns to float
bench_ratings = bench_df.filter(regex='_rating$')
bench_df[bench_ratings.columns] = bench_ratings.astype('float64')

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(bench_df.shape)
bench_df.tail()

### **Sidelined Data**

In [None]:
# Build the sidelined Dataframe from the 'sidelined' column
side_df = listcolumn_to_df(leagues_more, 'sidelined')
# Take the fixture id from the first sidelined entry
side_df['Fix_ID'] = side_df['SIDEpl1_fixture_id'].astype('int')
# Index by fixture id, then discard the columns holding only NAs
side_df = side_df.set_index('Fix_ID').dropna(axis=1, how='all')
# Discard the remaining per-entry fixture id columns
fixture_id_cols = side_df.filter(regex='_fixture_id$').columns
side_df = side_df.drop(fixture_id_cols, axis=1)
# Round every numeric column and store it as nullable integer
m3 = side_df.select_dtypes(np.number)
side_df[m3.columns] = m3.round().astype('Int64')

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(side_df.shape)
side_df.tail()

### **Corners Data**

In [None]:
# Create corners Dataframe from the 'corners' column
corners_df = listcolumn_to_df(leagues_more, 'corners')
# Take the fixture id from the first corner entry
corners_df['Fix_ID'] = corners_df['CORN1_fixture_id'].astype('int')
# Set Fixture_id as index
corners_df.set_index('Fix_ID', inplace=True)
# Drop all columns that have only NAs
corners_df.dropna(axis=1, how='all', inplace=True)
# Drop all columns whose name matches certain patterns
corners_df.drop(corners_df.filter(regex='_fixture_id$').columns, axis = 1, inplace=True)
corners_df.drop(corners_df.filter(regex='_comment$').columns, axis = 1, inplace=True)
corners_df.drop(corners_df.filter(regex='_extra_minute$').columns, axis = 1, inplace=True)
# (raw string: '\d' inside a plain literal is an invalid escape sequence)
corners_df.drop(corners_df.filter(regex=r'CORN\d{1,2}_id$').columns, axis = 1, inplace=True)
# Round every numeric column and store it as nullable integer
m4 = corners_df.select_dtypes(np.number)
corners_df[m4.columns]= m4.round().astype('Int64')

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(corners_df.shape)
corners_df.tail()

### **Cards Data**

In [None]:
# Create cards Dataframe from the 'cards' column
cards_df = listcolumn_to_df(leagues_more, 'cards')
# Take the fixture id from the first card entry
cards_df['Fix_ID'] = cards_df['CARD1_fixture_id'].astype('int')
# Set Fixture_id as index
cards_df.set_index('Fix_ID', inplace=True)
# Drop all columns that have only NAs
cards_df.dropna(axis=1, how='all', inplace=True)
# Drop all columns whose name matches certain patterns
cards_df.drop(cards_df.filter(regex='_fixture_id$').columns, axis = 1, inplace=True)
cards_df.drop(cards_df.filter(regex='_extra_minute$').columns, axis = 1, inplace=True)
# (raw string: '\d' inside a plain literal is an invalid escape sequence)
cards_df.drop(cards_df.filter(regex=r'CARD\d{1,2}_id$').columns, axis = 1, inplace=True)
# Transform the type columns to shorter codes; values missing from the mapping become NaN
for i in cards_df.filter(regex='_type$').columns:
    cards_df[i] = cards_df[i].map({'yellowcard': 'Y', 'redcard': 'R', 'yellowred': 'YR'})
# Round every numeric column and store it as nullable integer
m5 = cards_df.select_dtypes(np.number)
cards_df[m5.columns]= m5.round().astype('Int64')

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(cards_df.shape)
cards_df.tail()

### **Goals Data**

In [None]:
# Create goals Dataframe from the 'goals' column
goals_df = listcolumn_to_df(leagues_more, 'goals')
# Take the fixture id from the first goal entry
goals_df['Fix_ID'] = goals_df['GOAL1_fixture_id'].astype('int')
# Set Fixture_id as index
goals_df.set_index('Fix_ID', inplace=True)
# Drop all columns that have only NAs
goals_df.dropna(axis=1, how='all', inplace=True)
# Drop all columns whose name matches certain patterns
# (raw string: '\d' inside a plain literal is an invalid escape sequence)
goals_df.drop(goals_df.filter(regex=r'GOAL\d{1,2}_id$').columns, axis = 1, inplace=True)
goals_df.drop(goals_df.filter(regex='_extra_minute$').columns, axis = 1, inplace=True)
goals_df.drop(goals_df.filter(regex='_fixture_id$').columns, axis = 1, inplace=True)
goals_df.drop(goals_df.filter(regex='_reason$').columns, axis = 1, inplace=True)
# Transform the type columns to shorter codes; values missing from the mapping become NaN
for i in goals_df.filter(regex='_type$').columns:
    goals_df[i] = goals_df[i].map({'goal': 'G', 'penalty': 'P', 'own-goal': 'O-G'})
# Round every numeric column and store it as nullable integer
m6 = goals_df.select_dtypes(np.number)
goals_df[m6.columns]= m6.round().astype('Int64')

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(goals_df.shape)
goals_df.tail()

# **ODDS DATA**

### Get Data

In [None]:
# Read each odds file once; the `with` block already closes the file,
# so no explicit close() is needed afterwards
with open('Input Data/final_leagues_odds.json') as f:
    odds_data_leagues = json.load(f)
with open('Input Data/final_cups_odds.json') as g:
    odds_data_cups = json.load(g)

# Keep only the bets that appear in more than the given number of fixtures.
# keep_n_mostcommon_odds accepts an already loaded object as well as a path,
# which avoids parsing the same files a second time.
only_leagues_odds = keep_n_mostcommon_odds(odds_data_leagues, 3500)
only_cups_odds = keep_n_mostcommon_odds(odds_data_cups, 2000)

# Same filtering on cups and leagues taken together
complete_odds = odds_data_cups + odds_data_leagues
merged_odds = keep_n_mostcommon_odds(complete_odds, 5000)

### **Leagues**

In [None]:
# Build the leagues odds Dataframe indexed by 'id', without all-NA rows and columns
leagues_odds_df = (
    pd.DataFrame(only_leagues_odds)
    .set_index('id')
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
# Discard the columns whose name starts with 'ToQualify_'
to_qualify_cols = leagues_odds_df.filter(regex='^ToQualify_').columns
leagues_odds_df = leagues_odds_df.drop(to_qualify_cols, axis=1)

In [None]:
# Print the (rows, columns) shape, then show the last rows as the cell output
print(leagues_odds_df.shape)
leagues_odds_df.tail()