In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
attendance_path = "../../Resources/La_Liga/La_Liga_att_v1"
scores_city_file = "../../Resources/La_Liga/spain_city.csv"
output_file = "../../final-resources/LIG/LIG_scores_attendances.csv"

In [3]:
LIG_teams=["Alaves",  
"Ath_Bilbao",  
"Ath_Madrid",  
"Barcelona",  
"Betis",  
"Celta",  
"Eibar",  
"Espanol",  
"Getafe",  
"Granada",  
"Leganes",  
"Levante",  
"Mallorca",  
"Osasuna",  
"Real_Madrid",  
"Sevilla",  
"Sociedad",  
"Valencia",  
"Valladolid",  
"Villarreal"]

## Combine attendances data

* Home attendance dataframe
* Away atttendance dataframe

In [4]:
def Format_Date(df):
    # df is series with the key “Date”
    # expressed as month/day/year
    df2 = df.copy()
    one_digit = [str(i) for i in range(10)]
    for i, val in df.items():
        dates = val.split("/")
        if dates[0] in one_digit:
            dates[0] = "0"+dates[0]
        if dates[1] in one_digit:
            dates[1] = "0"+dates[1]
        if dates[2] == "19" or dates[2] == "20":
            dates[2] = "20"+dates[2]
        df2.loc[i] = dates[2]+"-"+dates[1]+"-"+dates[0]
    return df2

In [5]:
home_all_df = pd.DataFrame()
away_all_df = pd.DataFrame()
cols = ["Date", "Opponent", "Attendance"]
all_cols = ["Div", "Team ID", "Team"] + cols
for team in LIG_teams:
    
    # Read csv file
    att_file = os.path.join(attendance_path, team+".csv")
    att_df = pd.read_csv(att_file, encoding="ISO-8859-1")
    
    # Home attendances
    home_att_df = att_df.loc[att_df["H/A"]=="H", cols]
    home_att_df["Date"] = Format_Date(home_att_df["Date"])
    home_att_df["Div"] = "E0"
    home_att_df["Team"] = " ".join(team.split("_"))
    home_att_df["Team ID"] = "LIG"+str(LIG_teams.index(team))
    home_all_df = home_all_df.append(home_att_df[all_cols], ignore_index=True)

    # Away attendances
    away_att_df = att_df.loc[att_df["H/A"]=="H", cols]
    away_att_df["Date"] = Format_Date(away_att_df["Date"])
    away_att_df["Div"] = "E0"
    away_att_df["Team"] = " ".join(team.split("_"))
    away_att_df["Team ID"] = "LIG"+str(LIG_teams.index(team))
    away_all_df = away_all_df.append(away_att_df[all_cols], ignore_index=True)

In [6]:
home_all_df

Unnamed: 0,Div,Team ID,Team,Date,Opponent,Attendance
0,E0,LIG0,Alaves,2019-08-18,Levante,12029
1,E0,LIG0,Alaves,2019-08-25,Espanyol,14567
2,E0,LIG0,Alaves,2019-09-15,Sevilla,16309
3,E0,LIG0,Alaves,2019-09-29,Mallorca,17135
4,E0,LIG0,Alaves,2019-10-20,Celta Vigo,16584
...,...,...,...,...,...,...
395,E0,LIG19,Villarreal,2020-06-16,Mallorca,
396,E0,LIG19,Villarreal,2020-06-22,Sevilla,
397,E0,LIG19,Villarreal,2020-06-28,Valencia,
398,E0,LIG19,Villarreal,2020-07-05,Barcelona,


## Combine attendances and scores

In [7]:
scores_df = pd.read_csv(scores_city_file)

# Data cleaning, formating
scores_df = scores_df.drop(columns=["Unnamed: 11"])
scores_df["Date"] = Format_Date(scores_df["Date"])
scores_df

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,HomeScore,AwayScore,Result,HomeTeam ID,AwayTeam ID,City
0,SP1,2019-08-16,20:00,Ath Bilbao,Barcelona,1,0,H,LIG1,LIG3,Bilbao
1,SP1,2019-08-17,16:00,Celta,Real Madrid,1,3,A,LIG5,LIG14,Vigo
2,SP1,2019-08-17,18:00,Valencia,Sociedad,1,1,D,LIG17,LIG16,Valencia
3,SP1,2019-08-17,19:00,Mallorca,Eibar,2,1,H,LIG12,LIG6,Palma
4,SP1,2019-08-17,20:00,Leganes,Osasuna,0,1,A,LIG10,LIG13,Leganes
...,...,...,...,...,...,...,...,...,...,...,...
355,SP1,2020-07-12,18:30,Leganes,Valencia,1,0,H,LIG10,LIG17,Leganes
356,SP1,2020-07-12,21:00,Sevilla,Mallorca,2,0,H,LIG15,LIG12,Seville
357,SP1,2020-07-13,18:30,Alaves,Getafe,0,0,D,LIG0,LIG8,Vitoria-Gasteiz
358,SP1,2020-07-13,18:30,Villarreal,Sociedad,1,2,A,LIG19,LIG16,Villarreal


In [8]:
# Groupby team: We will need to merge on Date.
# But, the score frame has several same values on Date column.
# To have unique values, we work on each team.
att_cols = ["Team ID", "Date", "Opponent", "Attendance"]
gp_scores = scores_df.groupby("HomeTeam ID")
gp_att = home_all_df[att_cols].groupby("Team ID")

# The number of teams in this league
L = len(gp_scores.indices)

# For each team, merge two dataframes on Date and then append them.
scores_att = pd.DataFrame()

for i in range(L):
    gp_id = "LIG"+str(i)
    sc_df = gp_scores.get_group(gp_id)
    at_df = gp_att.get_group(gp_id)
    all_df = pd.merge(sc_df, at_df, on="Date", how="left")
    scores_att = scores_att.append(all_df)

In [9]:
scores_att

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,HomeScore,AwayScore,Result,HomeTeam ID,AwayTeam ID,City,Team ID,Opponent,Attendance
0,SP1,2019-08-18,16:00,Alaves,Levante,1,0,H,LIG0,LIG11,Vitoria-Gasteiz,LIG0,Levante,12029
1,SP1,2019-08-25,16:00,Alaves,Espanol,0,0,D,LIG0,LIG7,Vitoria-Gasteiz,LIG0,Espanyol,14567
2,SP1,2019-09-15,13:00,Alaves,Sevilla,0,1,A,LIG0,LIG15,Vitoria-Gasteiz,LIG0,Sevilla,16309
3,SP1,2019-09-29,15:00,Alaves,Mallorca,2,0,H,LIG0,LIG12,Vitoria-Gasteiz,LIG0,Mallorca,17135
4,SP1,2019-10-20,11:00,Alaves,Celta,2,0,H,LIG0,LIG5,Vitoria-Gasteiz,LIG0,Celta Vigo,16584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,SP1,2020-06-16,18:30,Villarreal,Mallorca,1,0,H,LIG19,LIG12,Villarreal,LIG19,Mallorca,
14,SP1,2020-06-22,18:30,Villarreal,Sevilla,2,2,D,LIG19,LIG15,Villarreal,LIG19,Sevilla,
15,SP1,2020-06-28,16:00,Villarreal,Valencia,2,0,H,LIG19,LIG17,Villarreal,LIG19,Valencia,
16,SP1,2020-07-05,21:00,Villarreal,Barcelona,1,4,A,LIG19,LIG3,Villarreal,LIG19,Barcelona,


In [10]:
scores_att.sort_values(by="Date", inplace=True)
scores_att.to_csv(output_file, index=False)