In [31]:
import pandas as pd
import numpy as np
import helper

In [32]:
# Read data from the CSV into a dataframe

datasets_loc = "~/Developer/AI/robin_hood/datasets/LaLiga/"

raw_data_FMEL = pd.read_csv(datasets_loc + 'FMEL_Dataset.csv')

First of all, we'll create two dictionaries: one to save the values of all the teams available and the other one to save the integer values corresponding to the FinalResult of the game -> HOME, AWAY, DRAW. 
The localTeam column is enough to list all the teams available.

In [33]:
def create_teams_dictionary(df):
    teams_dict = {}
    for i in range(len(df)):
        teams_dict[i] = df.loc[i]
    return teams_dict

teams = raw_data_FMEL['localTeam']

teams_without_duplicates = teams.drop_duplicates(keep='first')
teams_without_duplicates.reset_index(inplace=True, drop=True)

teams_dict = create_teams_dictionary(teams_without_duplicates)

helper.save(teams_dict, 'teams_dict.p')

In [34]:
final_result_dict = {0 : 'DRAW',
                     1 : 'HOME',
                     2 : 'AWAY'}

helper.save(final_result_dict, 'final_result_dict.p')

In [82]:
# Gets statistics for first try of model
                      
columns_req = ['season','division','round','localTeam','visitorTeam','localGoals','visitorGoals']

games_df = raw_data_FMEL[columns_req]

In [83]:
games_df.head()

Unnamed: 0,season,division,round,localTeam,visitorTeam,localGoals,visitorGoals
0,1970-71,1,1,Athletic Club,Barcelona,1,1
1,1970-71,1,1,Las Palmas,Atletico de Madrid,1,1
2,1970-71,1,1,Real Madrid,Valencia,2,0
3,1970-71,1,1,Celta de Vigo,Sporting de Gijon,2,0
4,1970-71,1,1,Elche,Granada,1,1


In [84]:
# Filter by division

def filter_by_division(df, division):
    for i in range(len(df)):
        if i % 5000 == 0:
            print('Filtered ', i,' records')
        if df.loc[i, 'division'] != division:
            df = df.drop(i)
    df.reset_index(inplace=True, drop=True)
    return df
        
first_division_games_df = filter_by_division(games_df, 1)
second_division_games_df = filter_by_division(games_df, 2)

Filtered  0  records
Filtered  5000  records
Filtered  10000  records
Filtered  15000  records
Filtered  20000  records
Filtered  25000  records
Filtered  30000  records
Filtered  35000  records
Filtered  0  records
Filtered  5000  records
Filtered  10000  records
Filtered  15000  records
Filtered  20000  records
Filtered  25000  records
Filtered  30000  records
Filtered  35000  records


In [86]:
first_division_games_df.head()

Unnamed: 0,season,division,round,localTeam,visitorTeam,localGoals,visitorGoals
0,1970-71,1,1,Athletic Club,Barcelona,1,1
1,1970-71,1,1,Las Palmas,Atletico de Madrid,1,1
2,1970-71,1,1,Real Madrid,Valencia,2,0
3,1970-71,1,1,Celta de Vigo,Sporting de Gijon,2,0
4,1970-71,1,1,Elche,Granada,1,1


In [87]:
# Save dataframes to files by division

helper.save(first_division_games_df, 'first_division_games.p')
helper.save(second_division_games_df, 'second_division_games.p')

# first_division_games_df.to_csv("/Users/ericmassip/Developer/AI/robin_hood/predict-scores-laliga/tmpfiles/first_division_games.csv", index=False)
# second_division_games_df.to_csv("/Users/ericmassip/Developer/AI/robin_hood/predict-scores-laliga/tmpfiles/second_division_games.csv", index=False)

# Checkpoint

Dataframes for every division are now filtered and saved into files.

In [57]:
import pandas as pd
import numpy as np
import helper

# Read from dataframe files filtered by division

# tmpfiles_loc = "/Users/ericmassip/Developer/AI/robin_hood/predict-scores-laliga/tmpfiles/"

#first_division_games_df = pd.read_csv(tmpfiles_loc + 'first_division_games.csv')
#second_division_games_df = pd.read_csv(tmpfiles_loc + 'second_division_games.csv')

first_division_games = helper.load('first_division_games.p')
second_division_games = helper.load('second_division_games.p')

In [58]:
first_division_games.head()

Unnamed: 0,season,division,round,localTeam,visitorTeam,localGoals,visitorGoals
0,1970-71,1,1,Athletic Club,Barcelona,1,1
1,1970-71,1,1,Las Palmas,Atletico de Madrid,1,1
2,1970-71,1,1,Real Madrid,Valencia,2,0
3,1970-71,1,1,Celta de Vigo,Sporting de Gijon,2,0
4,1970-71,1,1,Elche,Granada,1,1


In [59]:
# Process local and visitor goals and set FinalResult column

def process_finalresult(df):
    data = {'FinalResult' : np.arange(len(df))}
    final_result_column_df = pd.DataFrame(data)
    final_result_column_df = final_result_column_df.astype(str, copy=False)
    for i in range(len(df)):
        localGoals = df.loc[i, 'localGoals']
        visitorGoals = df.loc[i, 'visitorGoals']
        final_result_column_df.at[i, 'FinalResult'] = get_final_result(localGoals, visitorGoals)
    df = df.assign(FinalResult = final_result_column_df)
    return df

def get_final_result(localGoals, visitorGoals):
    if localGoals > visitorGoals:
        return final_result_dict['HOME']
    elif localGoals < visitorGoals:
        return final_result_dict['AWAY']
    else:
        return final_result_dict['DRAW']

final_result_dict = helper.load('final_result_dict.p')
final_result_dict = helper.invert_dict(final_result_dict)

first_division_games_df = process_finalresult(first_division_games)
second_division_games_df = process_finalresult(second_division_games)

In [61]:
first_division_games_df.head()

Unnamed: 0,season,division,round,localTeam,visitorTeam,localGoals,visitorGoals,FinalResult
0,1970-71,1,1,Athletic Club,Barcelona,1,1,0
1,1970-71,1,1,Las Palmas,Atletico de Madrid,1,1,0
2,1970-71,1,1,Real Madrid,Valencia,2,0,1
3,1970-71,1,1,Celta de Vigo,Sporting de Gijon,2,0,1
4,1970-71,1,1,Elche,Granada,1,1,0


In [62]:
second_division_games_df.head()

Unnamed: 0,season,division,round,localTeam,visitorTeam,localGoals,visitorGoals,FinalResult
0,1970-71,2,1,Cadiz,Villarreal,1,2,2
1,1970-71,2,1,Deportivo,Mallorca,2,1,1
2,1970-71,2,1,Moscardo,Rayo Vallecano,0,3,2
3,1970-71,2,1,Calvo Sotelo,Burgos,0,1,2
4,1970-71,2,1,Ontinyent,Pontevedra,2,2,0


In [63]:
helper.save(first_division_games_df, 'first_division_games_df.p')
helper.save(second_division_games_df, 'second_division_games_df.p')

# Checkpoint

Dataframes processed to include FinalResult column

In [53]:
import pandas as pd
import numpy as np
import helper
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper

first_division_games_df = helper.load('first_division_games_df.p')
second_division_games_df = helper.load('second_division_games_df.p')
teams_dict = helper.load('teams_dict.p')

In [54]:
# Preprocess features to be available for prediction
def preprocessing_teams(df):
    inverse_teams_dict = helper.invert_dict(teams_dict)
    for i in range(len(df)):
        df.at[i, 'localTeam'] = inverse_teams_dict[df.loc[i, 'localTeam']]
        df.at[i, 'visitorTeam'] = inverse_teams_dict[df.loc[i, 'visitorTeam']]
    return df

first_division_games_df = preprocessing_teams(first_division_games_df)
second_division_games_df = preprocessing_teams(second_division_games_df)

mapper = DataFrameMapper([('season', preprocessing.LabelEncoder())], default=None, df_out=True)

In [55]:
first_division_games_df = mapper.fit_transform(first_division_games_df)
second_division_games_df = mapper.fit_transform(second_division_games_df)

In [56]:
first_division_games_df.head()

Unnamed: 0,season,division,round,localTeam,visitorTeam,localGoals,visitorGoals,FinalResult
0,0,1,1,0,10,1,1,0
1,0,1,1,1,9,1,1,0
2,0,1,1,2,8,2,0,1
3,0,1,1,3,11,2,0,1
4,0,1,1,4,12,1,1,0


In [None]:
second_division_games_df.head()

In [None]:
helper.save(first_division_games_preprocessed, 'first_division_games_preprocessed.p')
helper.save(second_division_games_df, 'second_division_games_df.p')