## Combine attendances & scores dataframes

In [1]:
import pandas as pd
import os

In [2]:
epl_path = "../../Resources/EPL_teams/"
score_file = os.path.join(epl_path, "england_test_city.csv")
home_att_file = os.path.join(epl_path, "England_home_allmatches_att.csv")
output_file = os.path.join(epl_path, "EPL_scores_attendances.csv")

In [7]:
# Date string format
def Format_Date(df):
    # df is series with the key "Date"
    # expressed as month/day/year
    df2 = df.copy()
    one_digit = [str(i) for i in range(10)]
    for i, val in df.items():
        dates = val.split("/")
        if dates[0] in one_digit:
            dates[0] = "0"+dates[0]
        if dates[1] in one_digit:
            dates[1] = "0"+dates[1]
        if dates[2] == "19" or dates[2] == "20":
            dates[2] = "20"+dates[2]
        df2.loc[i] = dates[2]+"-"+dates[1]+"-"+dates[0]
    return df2

In [4]:
scores_df = pd.read_csv(score_file)
scores_df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,HomeScore,AwayScore,Result,HomeTeam ID,AwayTeam ID,City
0,E0,9/8/19,20:00,Liverpool,Norwich,4,1,H,EPL9,EPL13,Liverpool
1,E0,10/8/19,12:30,West Ham,Man City,0,5,A,EPL18,EPL10,London
2,E0,10/8/19,15:00,Bournemouth,Sheffield United,1,1,D,EPL2,EPL14,Bournemouth
3,E0,10/8/19,15:00,Burnley,Southampton,3,0,H,EPL4,EPL15,Burnley
4,E0,10/8/19,15:00,Crystal Palace,Everton,0,0,D,EPL6,EPL7,London


In [5]:
home_att_df = pd.read_csv(home_att_file)
att_df = home_att_df[["Team ID", "Date", "Opponent", "Attendance"]]

In [9]:
#Convert 'Date' column to unified format
scores_df["Date"] = Format_Date(scores_df["Date"])
att_df["Date"] = Format_Date(att_df["Date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# Groupby team: We will need to merge on Date.
# But, the score frame has several same values on Date column.
# To have unique values, we work on each team.
gp_scores = scores_df.groupby("HomeTeam ID")
gp_att = att_df.groupby("Team ID")

# The number of teams in this league
L = len(gp_scores.indices)

# For each team, merge two dataframes on Date and then append them.
scores_att = pd.DataFrame()
for i in range(L):
    gp_id = "EPL"+str(i)
    sc_df = gp_scores.get_group(gp_id)
    at_df = gp_att.get_group(gp_id)
    all_df = pd.merge(sc_df, at_df, on="Date", how="left")
    scores_att = scores_att.append(all_df)

In [15]:
scores_att.sort_values(by="Date", inplace=True)

In [17]:
scores_att.to_csv(output_file, index=False)