In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory holding one attendance CSV per team, named "<team>.csv".
attendance_path = "../../Resources/Serie_A/Italian_Serie_A_att_v1"
# CSV of match scores (one row per match) including team IDs and the host city.
scores_city_file = "../../Resources/Serie_A/Italy_city.csv"
# Destination CSV for the merged scores + home attendance table.
output_file = "../../final-resources/SER/SER_scores_attendances.csv"

In [3]:
# Teams of the league. The position of a team in this list defines its
# Team ID ("SER<position>"), so the order must not be changed.
SER_teams = [
    "Atalanta",
    "Bologna",
    "Brescia",
    "Cagliari",
    "Fiorentina",
    "Genoa",
    "Inter",
    "Juventus",
    "Lazio",
    "Lecce",
    "Milan",
    "Napoli",
    "Parma",
    "Roma",
    "Sampdoria",
    "Sassuolo",
    "Spal",
    "Torino",
    "Udinese",
    "Verona",
]

## Combine attendances data

* Home attendance dataframe
* Away attendance dataframe

In [4]:
def Format_Date(df):
    """Convert day/month/year date strings to ISO-style "YYYY-MM-DD" strings.

    Parameters
    ----------
    df : pandas.Series
        Series of date strings (despite the parameter name), typically a
        "Date" column. Each value is "/"-separated in day/month/year order,
        e.g. "1/9/19" or "01/09/2019".

    Returns
    -------
    pandas.Series
        A new Series with the same index, name and dtype as ``df`` whose
        string values are rewritten as "year-month-day". Single-digit days
        and months are zero-padded, and the two-digit years "19" and "20"
        are expanded to "2019" and "2020". Non-string values (e.g. missing
        dates) are passed through unchanged.

    Raises
    ------
    IndexError
        If a string value has fewer than three "/"-separated parts.
    """
    single_digits = set("0123456789")

    def _to_iso(val):
        # Leave missing / non-text entries alone instead of crashing on .split.
        if not isinstance(val, str):
            return val
        dates = val.split("/")
        if dates[0] in single_digits:
            dates[0] = "0" + dates[0]
        if dates[1] in single_digits:
            dates[1] = "0" + dates[1]
        if dates[2] == "19" or dates[2] == "20":
            dates[2] = "20" + dates[2]
        return dates[2] + "-" + dates[1] + "-" + dates[0]

    # Build the result positionally: assigning through .loc[label] would
    # overwrite every row sharing that label when the index is not unique.
    return pd.Series(
        [_to_iso(val) for val in df],
        index=df.index,
        name=df.name,
        dtype=df.dtype,
    )

In [5]:
# Build two league-wide attendance tables, one row per match:
#   home_all_df - matches each team played at home ("H/A" == "H")
#   away_all_df - matches each team played away    ("H/A" == "A")
cols = ["Date", "Opponent", "Attendance"]
all_cols = ["Div", "Team ID", "Team"] + cols


def _venue_attendance(att_df, venue, team, team_id):
    """Return the rows of one team's attendance file for a given venue.

    `venue` is the value to match in the "H/A" column ("H" or "A").
    The result has the columns listed in `all_cols`, with "Date"
    reformatted by Format_Date.
    """
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of att_df (avoids SettingWithCopyWarning).
    venue_df = att_df.loc[att_df["H/A"] == venue, cols].copy()
    venue_df["Date"] = Format_Date(venue_df["Date"])
    # NOTE(review): "E0" does not match the "I1" division code used in the
    # scores file; kept unchanged here - confirm which code is intended.
    venue_df["Div"] = "E0"
    venue_df["Team"] = " ".join(team.split("_"))
    venue_df["Team ID"] = team_id
    return venue_df[all_cols]


# Collect per-team frames and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
home_frames = []
away_frames = []
for team_idx, team in enumerate(SER_teams):

    # Read csv file
    att_file = os.path.join(attendance_path, team + ".csv")
    att_df = pd.read_csv(att_file, encoding="ISO-8859-1")

    # The Team ID is "SER" followed by the team's position in SER_teams.
    team_id = "SER" + str(team_idx)

    # Home attendances
    home_frames.append(_venue_attendance(att_df, "H", team, team_id))

    # Away attendances
    away_frames.append(_venue_attendance(att_df, "A", team, team_id))

home_all_df = (
    pd.concat(home_frames, ignore_index=True)
    if home_frames
    else pd.DataFrame(columns=all_cols)
)
away_all_df = (
    pd.concat(away_frames, ignore_index=True)
    if away_frames
    else pd.DataFrame(columns=all_cols)
)

In [6]:
# Display the combined home-attendance table (all teams) for a visual check.
home_all_df

Unnamed: 0,Div,Team ID,Team,Date,Opponent,Attendance
0,E0,SER0,Atalanta,2019-09-01,Torino,8182
1,E0,SER0,Atalanta,2019-09-22,Fiorentina,
2,E0,SER0,Atalanta,2019-10-01,Shakhtar Donetsk,26022
3,E0,SER0,Atalanta,2019-10-06,Lecce,20771
4,E0,SER0,Atalanta,2019-10-27,Udinese,
...,...,...,...,...,...,...
351,E0,SER19,Verona,2020-02-08,Juventus,
352,E0,SER19,Verona,2020-06-20,Cagliari,
353,E0,SER19,Verona,2020-06-23,Napoli,
354,E0,SER19,Verona,2020-07-01,Parma,


## Combine attendances and scores

In [7]:
# Load the scores/city table, then clean and format it:
#   - drop the "Unnamed: 11" column
#   - rewrite the "Date" column with Format_Date
scores_df = (
    pd.read_csv(scores_city_file)
    .drop(columns=["Unnamed: 11"])
    .assign(Date=lambda frame: Format_Date(frame["Date"]))
)
scores_df

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,HomeScore,AwayScore,Result,HomeTeam ID,AwayTeam ID,City
0,I1,2019-08-24,17:00,Parma,Juventus,0,1,A,SER12,SER7,Parma
1,I1,2019-08-24,19:45,Fiorentina,Napoli,3,4,A,SER4,SER11,Florence
2,I1,2019-08-25,17:00,Udinese,Milan,1,0,H,SER18,SER10,Udine
3,I1,2019-08-25,19:45,Cagliari,Brescia,0,1,A,SER3,SER2,Cagliari
4,I1,2019-08-25,19:45,Roma,Genoa,3,3,D,SER13,SER5,Rome
...,...,...,...,...,...,...,...,...,...,...,...
315,I1,2020-07-12,18:30,Fiorentina,Verona,1,1,D,SER4,SER19,Florence
316,I1,2020-07-12,18:30,Parma,Bologna,2,2,D,SER12,SER1,Parma
317,I1,2020-07-12,18:30,Udinese,Sampdoria,1,3,A,SER18,SER14,Udine
318,I1,2020-07-12,20:45,Napoli,Milan,2,2,D,SER11,SER10,Naples


In [8]:
# Group by team: we need to merge scores and attendances on Date, but the
# scores frame contains the same Date for several matches. Within a single
# home team the dates are expected to be unique, so we merge team by team.
att_cols = ["Team ID", "Date", "Opponent", "Attendance"]
gp_scores = scores_df.groupby("HomeTeam ID")
gp_att = home_all_df[att_cols].groupby("Team ID")

# The number of teams in this league
L = len(gp_scores.indices)

# Empty attendance frame (same columns/dtypes) used when a team has no home
# attendance rows, so the left merge yields NaN instead of raising KeyError.
no_attendance = home_all_df[att_cols].iloc[0:0]

# For each team, left-merge its home scores with its home attendances on
# Date. The frames are collected and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0).
merged_frames = []

for i in range(L):
    # Team IDs are expected to be "SER0" ... "SER<L-1>".
    gp_id = "SER" + str(i)
    sc_df = gp_scores.get_group(gp_id)
    if gp_id in gp_att.groups:
        at_df = gp_att.get_group(gp_id)
    else:
        at_df = no_attendance
    all_df = pd.merge(sc_df, at_df, on="Date", how="left")
    merged_frames.append(all_df)

# Each team's rows keep their own 0-based index, as produced by the merge.
scores_att = pd.concat(merged_frames) if merged_frames else pd.DataFrame()

In [9]:
# Display the merged scores + home attendance table for a visual check.
scores_att

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,HomeScore,AwayScore,Result,HomeTeam ID,AwayTeam ID,City,Team ID,Opponent,Attendance
0,I1,2019-09-01,19:45,Atalanta,Torino,2,3,A,SER0,SER17,Bergamo,SER0,Torino,8182
1,I1,2019-09-22,17:00,Atalanta,Fiorentina,2,2,D,SER0,SER4,Bergamo,SER0,Fiorentina,
2,I1,2019-10-06,14:00,Atalanta,Lecce,3,1,H,SER0,SER9,Bergamo,SER0,Lecce,20771
3,I1,2019-10-27,14:00,Atalanta,Udinese,7,1,H,SER0,SER18,Bergamo,SER0,Udinese,
4,I1,2019-11-03,11:30,Atalanta,Cagliari,0,2,A,SER0,SER3,Bergamo,SER0,Cagliari,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,I1,2020-02-08,19:45,Verona,Juventus,2,1,H,SER19,SER7,Verona,SER19,Juventus,
12,I1,2020-06-20,20:45,Verona,Cagliari,2,1,H,SER19,SER3,Verona,SER19,Cagliari,
13,I1,2020-06-23,18:30,Verona,Napoli,0,2,A,SER19,SER11,Verona,SER19,Napoli,
14,I1,2020-07-01,20:45,Verona,Parma,3,2,H,SER19,SER12,Verona,SER19,Parma,


In [10]:
# Order the merged table chronologically, then write it out without the index.
scores_att = scores_att.sort_values(by="Date")
scores_att.to_csv(output_file, index=False)