In [None]:
def first_clean(df):
    '''Initial tidy-up of a freshly imported frame.

    ``df`` is modified in place and also returned:
      * rows whose 'Player' text contains 'Players' are removed,
      * the 'Won%', 'Cmp%' and 'Int_y' columns are dropped,
      * commas are stripped from the 'Attendance' strings.
    '''
    is_summary_row = df['Player'].str.contains('Players')
    summary_labels = df.loc[is_summary_row].index
    df.drop(summary_labels, inplace=True)

    unwanted_columns = ['Won%', 'Cmp%', 'Int_y']
    df.drop(labels=unwanted_columns, axis=1, inplace=True)

    df['Attendance'] = df['Attendance'].str.replace(',', '')

    return df


def extract_teams_overall(teamlist,df,fifa_df):
    '''Extract the fifa rows for the most-used players of each listed team.

    For every team in ``teamlist``:
      1. take the rows of ``df`` where the team is 'Home' or 'Away',
      2. keep the 18 most frequent 'Player' values in those rows,
      3. collect the distinct '#' (jersey number) values of those players,
      4. select the ``fifa_df`` rows with that 'club_name' whose
         'team_jersey_number' is one of those numbers.

    Returns the selected rows of all teams ordered by 'club_name' (rows of a
    club keep their ``fifa_df`` order), with exact duplicate rows removed and
    a fresh 0..n-1 index.  When nothing matches, an empty frame with the
    columns of ``fifa_df`` is returned.
    '''
    matched = []
    for team in teamlist:
        fixtures = df.loc[(df['Home'] == team) | (df['Away'] == team)]
        regulars = fixtures['Player'].value_counts().index[:18]

        # dropna(): a missing jersey number must not match missing fifa numbers
        # (isin treats NaN as equal to NaN, unlike ==).
        squad_numbers = fixtures.loc[fixtures['Player'].isin(regulars), '#'].dropna().unique()

        club_rows = fifa_df.loc[fifa_df['club_name'] == team]
        picked = club_rows.loc[club_rows['team_jersey_number'].isin(squad_numbers)]
        if not picked.empty:
            matched.append(picked)

    if not matched:
        return fifa_df.iloc[0:0].reset_index(drop=True)

    # Build the result with one concat (DataFrame.append no longer exists in
    # pandas 2.x); mergesort is stable, so the row order is deterministic.
    teamstats = pd.concat(matched).sort_values(by='club_name', kind='mergesort')
    teamstats = teamstats.drop_duplicates(ignore_index=True)

    return teamstats

def combine_teams(teamstats):
    '''Create one row of team stats per club by combining its players.

    Per 'club_name' (clubs keep their order of first appearance):
      * 'value_eur' and 'wage_eur' are summed,
      * the numeric columns among 'age', 'dob', 'height_cm', 'weight_kg',
        'nationality', 'overall', 'potential', 'international_reputation'
        and 'skill_moves' are averaged and rounded to 2 decimals; the
        non-numeric ones are left out of the result.

    Returns a frame whose first column is 'club', with the totals renamed to
    'total_value_eur' / 'total_wage_eur' and age, height and weight renamed
    to 'average_age' / 'average_height_cm' / 'average_weight_kg'.
    '''
    total_columns = ['value_eur', 'wage_eur']
    mean_columns = ['age', 'dob', 'height_cm', 'weight_kg', 'nationality',
                    'overall', 'potential', 'international_reputation', 'skill_moves']

    # Columns are selected with lists: tuple selection on a groupby is rejected
    # by pandas 2.x.
    per_club = teamstats.groupby('club_name', sort=False, as_index=False)
    totals = per_club[total_columns].sum()
    # numeric_only=True skips text/date columns instead of raising on them.
    means = per_club[mean_columns].mean(numeric_only=True)
    means = means.drop(columns='club_name').round(2)

    # Both frames list the clubs in the same order, so they pair up by position.
    teamstats1 = pd.merge(totals, means, left_index=True, right_index=True)

    teamstats1 = teamstats1.rename(columns={'club_name': 'club',
                                            'value_eur': 'total_value_eur',
                                            'wage_eur': 'total_wage_eur',
                                            'age': 'average_age',
                                            'height_cm': 'average_height_cm',
                                            'weight_kg': 'average_weight_kg'})

    return teamstats1

In [None]:
def combine_fixtures(df_list):
    '''Stack the fixtures frames, parse 'Date' and order the rows.

    The frames in ``df_list`` are concatenated row-wise with a fresh index,
    'Date' is converted with ``pd.to_datetime`` and the rows are sorted by
    'Date', 'Match', 'Home' and 'Away'.  The input frames are not modified.
    '''
    sort_keys = ['Date', 'Match', 'Home', 'Away']
    return (
        pd.concat(df_list, axis=0, ignore_index=True)
        .assign(Date=lambda stacked: pd.to_datetime(stacked['Date']))
        .sort_values(sort_keys)
    )

def average_last_5_stats(features,df):
    '''
    Description: For every column named in features, add a column with the mean of that column over
    the previous (at most 5) rows that share the same 'Home' value. The current row is not included.

    Input:
        - features: iterable of column names (Str)
        - df: DataFrame with a 'Home' column; it is modified in place
    Output:
        - None. Each result is stored in df as "last_5_avrg_" + feature name;
          the first row of every 'Home' group has no previous rows and gets NaN.
    '''
    def mean_of_previous_rows(values):
        window_mean = values.rolling(5, min_periods=1).mean()
        # Move everything one row down so a row never sees its own value (no data leak)
        return window_mean.shift(1)

    for name in features:
        per_home_values = df.groupby('Home')[name]
        df["last_5_avrg_" + name] = per_home_values.transform(mean_of_previous_rows)

In [None]:
def create_fixtures(df):
    '''Combine the rows of each fixture into one row of summed stats.

    Rows are grouped by 'Date', 'Match', 'Home', 'Away', 'Stadium' and
    'Attendance' (groups keep their order of first appearance) and every stat
    column is summed.  The result has the six key columns first, followed by
    the summed stats.

    Afterwards, any row whose 'Attendance' is not made up of digits only has
    that value moved into 'Stadium' and its 'Attendance' set to 0.
    NOTE(review): presumably this repairs rows where the stadium name was
    scraped into the attendance field - confirm against the data source.
    '''
    fixture_keys = ['Date', 'Match', 'Home', 'Away', 'Stadium', 'Attendance']
    stat_columns = ['Gls', 'Ast', 'PK', 'PKatt',
                    'Sh', 'SoT', 'CrdY', 'CrdR', 'Touches', 'Press', 'Tkl', 'Int_x',
                    'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA', 'Cmp', 'Att', 'Prog',
                    'Carries', 'Prog.1', 'Succ', 'Att.1', '2CrdY', 'Fls', 'Fld', 'Off',
                    'Crs', 'TklW', 'PKwon', 'PKcon', 'OG', 'Recov', 'Won', 'Lost']

    # Only the stat columns are selected (as a list; tuple selection is rejected
    # by pandas 2.x).  The keys come back as columns through as_index=False and
    # must not be summed themselves.
    combined_df_new = df.groupby(fixture_keys, sort=False, as_index=False)[stat_columns].sum()

    combined_df_new.drop_duplicates(inplace=True)

    # One masked .loc update instead of per-cell chained assignment, which is
    # unreliable and depended on the index being exactly 0..n-1.
    misplaced = ~combined_df_new['Attendance'].astype(str).str.isnumeric()
    if misplaced.any():
        # object dtype lets the integer 0 sit next to the attendance strings
        combined_df_new['Attendance'] = combined_df_new['Attendance'].astype(object)
        combined_df_new.loc[misplaced, 'Stadium'] = combined_df_new.loc[misplaced, 'Attendance']
        combined_df_new.loc[misplaced, 'Attendance'] = 0

    return combined_df_new

In [None]:
def combine_home_away(df):
    '''Pair alternating rows of ``df`` into one row per ('Date', 'Match').

    Rows at even positions (0, 2, ...) form the left side and rows at odd
    positions the right side of an outer merge on 'Date' and 'Match'; pandas
    adds the '_x' / '_y' suffixes to the remaining shared columns.  The result
    is sorted by 'Date', its column names are lower-cased and 'away_y',
    'away_x', 'attendance_y' and 'stadium_y' are removed.
    NOTE(review): presumably even rows hold the home side and odd rows the
    away side of the same match - verify against the caller.
    '''
    even_rows = df.iloc[::2]
    odd_rows = df.iloc[1::2]

    paired = even_rows.merge(odd_rows, how='outer', on=['Date', 'Match'])
    paired = paired.sort_values('Date')
    paired.columns = [str.lower(label) for label in paired.columns]

    redundant = ['away_y', 'away_x', 'attendance_y', 'stadium_y']
    return paired.drop(columns=redundant)

In [None]:
def add_result(df):
    '''Add the result class of every row to ``df`` as an int column 'result'.

    The two totals compared are ``h_gls + a_og`` and ``a_gls + h_og``
    (presumably each side's goals plus the opponent's own goals):
        0 - the first total is larger,
        2 - the second total is larger,
        1 - otherwise (equal totals, or a total that is missing).

    ``df`` is modified in place - the column is added and the rows are sorted
    by 'date' with a fresh 0..n-1 index - and is also returned.
    '''
    home_total = df['h_gls'] + df['a_og']
    away_total = df['a_gls'] + df['h_og']

    # Computed for all rows at once: no per-match rescans of the frame and no
    # dependence on the type of the 'match' identifiers.
    outcome = pd.Series(1, index=df.index)
    outcome = outcome.mask(home_total > away_total, 0)
    outcome = outcome.mask(home_total < away_total, 2)

    df['result'] = outcome.astype(int)
    df.sort_values(by='date', inplace=True, ignore_index=True)
    return df

In [None]:
def match_goals(df):
    '''Add per-row goal totals to ``df`` in place (nothing is returned).

    Columns written:
        h_total_goals = int(h_gls) + int(a_og)
        a_total_goals = int(a_gls) + int(h_og)
        total_goals   = h_total_goals + a_total_goals

    The sums are computed column-wise, so they stay attached to the right
    rows whatever index ``df`` has.
    '''
    home_goals = df['h_gls'].astype(int) + df['a_og'].astype(int)
    away_goals = df['a_gls'].astype(int) + df['h_og'].astype(int)

    df['h_total_goals'] = home_goals
    df['a_total_goals'] = away_goals
    df['total_goals'] = df['h_total_goals'] + df['a_total_goals']
    

    
def matchtilldate(team,df):
    '''Write ``team``'s running record before each of its matches into ``df``.

    The rows of ``df`` where ``team`` is 'home' or 'away' are walked in frame
    order.  For each such row the totals accumulated over the team's EARLIER
    rows (all zero for its first row) are stored in that row, under the
    'h_' prefix when the team is the 'home' side and 'a_' otherwise:

        <prefix>matchplayed, <prefix>matchwon, <prefix>matchlost,
        <prefix>matchdrawn, <prefix>goalsscored, <prefix>goalsconceded

    'result' is read as 0 = home side won, 2 = away side won, anything else =
    draw.  Goals for the home side are ``h_gls + a_og`` and for the away side
    ``a_gls + h_og``.

    ``df`` is modified in place; nothing is returned.  Every row of the team is
    written exactly once, including the row of a team with a single match.
    '''
    team_matches = df.loc[(df['home'] == team) | (df['away'] == team)]

    stat_names = ('matchplayed', 'matchwon', 'matchlost', 'matchdrawn',
                  'goalsscored', 'goalsconceded')
    running = dict.fromkeys(stat_names, 0)

    rows = zip(team_matches.index, team_matches['result'], team_matches['home'],
               team_matches['h_gls'], team_matches['a_gls'],
               team_matches['h_og'], team_matches['a_og'])

    for label, result, home, hg, ag, ho, ao in rows:
        playing_home = home == team
        prefix = 'h_' if playing_home else 'a_'

        # Store the record as it stood BEFORE this match, so a row never
        # contains its own outcome.
        for stat, value in running.items():
            df.loc[label, prefix + stat] = value

        # Then fold this match into the running totals.
        if playing_home:
            scored, conceded = hg + ao, ag + ho
            win_code, loss_code = 0, 2
        else:
            scored, conceded = ag + ho, hg + ao
            win_code, loss_code = 2, 0

        running['matchplayed'] += 1
        running['goalsscored'] += scored
        running['goalsconceded'] += conceded
        if result == win_code:
            running['matchwon'] += 1
        elif result == loss_code:
            running['matchlost'] += 1
        else:
            running['matchdrawn'] += 1

In [None]:
def combine_fixtures_fifa_data(date1,date2,fifa_df,fixtures_data):
    '''Attach the fifa rows of the home and away clubs to each fixture.

    Fixtures with ``date1 < date <= date2`` are selected from
    ``fixtures_data``.  For each of them the ``fifa_df`` row whose 'club'
    equals the fixture's 'home' club, and the one for its 'away' club, are
    placed alongside the fixture columns.  Column names that would clash get
    the pandas '_x' / '_y' merge suffixes; with no clash against the fixture
    columns, the home club's columns end in '_x' and the away club's in '_y'.

    A club that is missing from ``fifa_df`` yields a row of missing values
    (only 'club' is filled in), so the rows of the other fixtures stay
    correctly paired.
    NOTE(review): assumes ``fifa_df`` holds at most one row per club - a club
    listed twice would add an extra row and shift the pairing; confirm.
    '''
    in_window = (fixtures_data['date'] > date1) & (fixtures_data['date'] <= date2)
    fixtures = fixtures_data.loc[in_window].reset_index(drop=True)

    def club_stats_for(clubs):
        # A left merge keeps one output row per fixture, in fixture order.
        keys = clubs.rename('club').reset_index(drop=True).to_frame()
        return keys.merge(fifa_df, on='club', how='left')[list(fifa_df.columns)]

    homestats = club_stats_for(fixtures['home'])
    awaystats = club_stats_for(fixtures['away'])

    fixtures_test = pd.merge(fixtures, homestats, left_index=True, right_index=True)
    fixtures_final = pd.merge(fixtures_test, awaystats, left_index=True, right_index=True)

    return fixtures_final

In [None]:
def head_to_head(x,y,df):
    '''Return the rows of ``df`` played between ``x`` and ``y``, newest first.

    A row qualifies when ('home', 'away') is (x, y) or (y, x).  The selected
    rows keep their original index labels and are sorted by 'date' in
    descending order.  ``df`` itself is not modified.
    '''
    x_at_home = (df['home'] == x) & (df['away'] == y)
    y_at_home = (df['home'] == y) & (df['away'] == x)

    # One combined mask selects both directions; no frame appending is needed
    # (DataFrame.append no longer exists in pandas 2.x).
    return df.loc[x_at_home | y_at_home].sort_values('date', ascending=False)