## Data cleaning and combinng

* Combine each team's attendance table to build England league attendance table

* Combine attendance table & score table

In [1]:
import pandas as pd
import os

In [2]:
# The path for EPL team data
epl_path = "../../Resources/EPL_teams/"

# The path for each team's home attendance data
home_path = epl_path + "Home"

# The path for each team's away attendance data
away_path = epl_path + "Away"

# scores data for all teams in EPL
score_file = os.path.join(epl_path, "england_test_city.csv")


home_att_file = os.path.join(epl_path, "England_home_allmatches_att.csv")

# output : all scores and attendances in EPL
output_file = os.path.join("../../final-resources/EPL", "EPL_scores_attendances.csv")

### One attendance table for England league

In [3]:
# This order gives the team ID.   EPL{index}, i.e. Arsenal=EPL0
EPL_teams = [
 'Arsenal',
 'Aston',
 'Bournemouth',
 'Brighton',
 'Burnley',
 'Chelsea',
 'Crystal',
 'Everton',
 'Leicester',
 'Liverpool',
 'Man_City',
 'Man_United',
 'Newcastle',
 'Norwich',
 'Sheffield',
 'Southampton',
 'Tottenham',
 'Watford',
 'WestHam',
 'Wolves']

In [4]:
# Initialize home dataframe and away dataframe
cols = ["Div", "Team", "Team ID", "Date", "Opponent", "Attendance"]
home_all_df = pd.DataFrame(columns=cols)
away_all_df = pd.DataFrame(columns=cols)

# Read the team's attendance csv file and append to the 'all' dataframe
for team in EPL_teams:
    home_file = os.path.join(home_path, team+"_home_attendances.csv")
    away_file = os.path.join(away_path, team+"_away_attendances.csv")
    home_df = pd.read_csv(home_file)
    away_df = pd.read_csv(away_file)
    #print(team, home_df["Attendance"].sum())
    #print(team, away_df["Attendance"].sum())
    home_df["Div"] = "E0"
    home_df["Team"] = team
    home_df["Team ID"] = "EPL"+str(EPL_teams.index(team))
    away_df["Div"] = "E0"
    away_df["Team"] = team
    away_df["Team ID"] = "EPL"+str(EPL_teams.index(team))
    home_all_df = home_all_df.append(home_df[cols], ignore_index=True)
    away_all_df = away_all_df.append(away_df[cols], ignore_index=True)

In [5]:
# Save the dataframes as csv files
home_all_df.to_csv(epl_path+"England_home_allmatches_att.csv")
away_all_df.to_csv(epl_path+"England_away_allmatches_att.csv")

## Cleaning and combine

* combine attendance data and score data

In [6]:
# Date string format
def Format_Date(df):
    # df is series with the key "Date"
    # expressed as month/day/year
    df2 = df.copy()
    one_digit = [str(i) for i in range(10)]
    print(df)
    for i, val in df.items():
        dates = val.split("/")
        print(dates)
        if dates[0] in one_digit:
            dates[0] = "0"+dates[0]
        if dates[1] in one_digit:
            dates[1] = "0"+dates[1]
        if dates[2] == "19" or dates[2] == "20":
            dates[2] = "20"+dates[2]
        df2.loc[i] = dates[2]+"-"+dates[1]+"-"+dates[0]
    return df2

In [7]:
att_df = home_all_df[["Team ID", "Date", "Opponent", "Attendance"]]
scores_df = pd.read_csv(score_file)
scores_df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,HomeScore,AwayScore,Result,HomeTeam ID,AwayTeam ID,City
0,E0,9/8/19,20:00,Liverpool,Norwich,4,1,H,EPL9,EPL13,Liverpool
1,E0,10/8/19,12:30,West Ham,Man City,0,5,A,EPL18,EPL10,London
2,E0,10/8/19,15:00,Bournemouth,Sheffield United,1,1,D,EPL2,EPL14,Bournemouth
3,E0,10/8/19,15:00,Burnley,Southampton,3,0,H,EPL4,EPL15,Burnley
4,E0,10/8/19,15:00,Crystal Palace,Everton,0,0,D,EPL6,EPL7,London


In [8]:
#Convert 'Date' column to unified format
scores_df["Date"] = Format_Date(scores_df["Date"])
att_df["Date"] = Format_Date(att_df["Date"])

0          9/8/19
1         10/8/19
2         10/8/19
3         10/8/19
4         10/8/19
          ...    
345       12/7/20
346       12/7/20
347       12/7/20
348       12/7/20
349    13/07/2020
Name: Date, Length: 350, dtype: object
['9', '8', '19']
['10', '8', '19']
['10', '8', '19']
['10', '8', '19']
['10', '8', '19']
['10', '8', '19']
['10', '8', '19']
['11', '8', '19']
['11', '8', '19']
['11', '8', '19']
['17', '08', '2019']
['17', '08', '2019']
['17', '08', '2019']
['17', '08', '2019']
['17', '08', '2019']
['17', '08', '2019']
['17', '08', '2019']
['18', '08', '2019']
['18', '08', '2019']
['19', '08', '2019']
['23', '08', '2019']
['24', '08', '2019']
['24', '08', '2019']
['24', '08', '2019']
['24', '08', '2019']
['24', '08', '2019']
['24', '08', '2019']
['25', '08', '2019']
['25', '08', '2019']
['25', '08', '2019']
['31', '08', '2019']
['31', '08', '2019']
['31', '08', '2019']
['31', '08', '2019']
['31', '08', '2019']
['31', '08', '2019']
['31', '08', '2019']
['31', '08', '201

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
# Groupby team: We will need to merge on Date.
# But, the score frame has several same values on Date column.
# To have unique values, we work on each team.
gp_scores = scores_df.groupby("HomeTeam ID")
gp_att = att_df.groupby("Team ID")

# The number of teams in this league
L = len(gp_scores.indices)

# For each team, merge two dataframes on Date and then append them.
scores_att = pd.DataFrame()
for i in range(L):
    gp_id = "EPL"+str(i)
    sc_df = gp_scores.get_group(gp_id)
    at_df = gp_att.get_group(gp_id)
    all_df = pd.merge(sc_df, at_df, on="Date", how="left")
    scores_att = scores_att.append(all_df)

In [10]:
scores_att.sort_values(by="Date", inplace=True)

In [11]:
scores_att.to_csv(output_file, index=False)