In [209]:
import pandas as pd
import numpy as np
import pymysql
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 150)

In [5]:
connection = sqlite3.connect("/Users/danielgilberg/data_science/soccer_matches_project/database.sqlite")

In [6]:
c = connection.cursor()

In [7]:
match_query = '''SELECT * FROM match'''
match_df = pd.read_sql(match_query, connection)

In [8]:
country_query = '''SELECT * FROM country'''
country_df = pd.read_sql(country_query, connection)

In [9]:
league_query = '''SELECT * FROM league'''
league_df = pd.read_sql(league_query, connection)

In [10]:
team_query = '''SELECT * FROM team'''
team_df = pd.read_sql(team_query, connection)

In [11]:
player_query = '''SELECT * FROM player'''
player_df = pd.read_sql(player_query, connection)

In [12]:
player_atts_query = '''SELECT * FROM Player_Attributes'''
player_atts_df = pd.read_sql(player_atts_query, connection)

In [14]:
team_atts_query = '''SELECT * FROM team_attributes'''
team_atts_df = pd.read_sql(team_atts_query, connection)

In [363]:
class soccerData(object):
    def __init__(self, match_query, country_df, team_df, player_df, team_atts_df, player_atts_df, league_df):
        self.match_df = match_df
        self.country_df = country_df
        self.team_df = team_df
        self.teams = team_df.team_long_name.unique()
        self.player_df = player_df
        self.team_atts_df = team_atts_df
        self.player_atts_df = player_atts_df
        self.league_df = league_df
        
    def get_scores(self):
        home_team = temp.match_df[["id", "home_team_api_id", "home_team_goal"]]
        away_team = temp.match_df[["id", "away_team_api_id", "away_team_goal"]]
        teams = temp.team_df[["team_api_id", "team_long_name"]]
        home_df = pd.merge(home_team, teams,left_on="home_team_api_id", right_on="team_api_id")
        home_df.rename(columns={"team_long_name": "home_team"}, inplace=True)
        home_df.drop(["team_api_id", "home_team_api_id"], axis=1, inplace=True)
        away_df = pd.merge(away_team, teams,left_on="away_team_api_id", right_on="team_api_id")
        away_df.rename(columns={"team_long_name": "away_team"}, inplace=True)
        away_df.drop(["away_team_api_id", "team_api_id"], axis=1, inplace=True)
        df = pd.merge(home_df, away_df,on="id")
        df["Winner"] = df.apply(lambda row: self.get_winner(row["home_team_goal"],
                                                           row["home_team"], row["away_team_goal"],
                                                           row["away_team"]), axis=1)
        df["Result"] = df.apply(lambda row: self.get_home_result(row["home_team_goal"],
                                                           row["home_team"], row["away_team_goal"],
                                                           row["away_team"]), axis=1)
        other_info = self.match_df[["id", "country_id", "league_id", "season", "date"]]
        df = pd.merge(df, other_info, on="id")
        return df
    

    def get_winner(self, home_score, home_team, away_score, away_team):
        if home_score > away_score:
            return home_team
        elif away_score > home_score:
            return away_team
        else:
            return "Draw"
        
    def get_home_result(self, home_score, home_team, away_score, away_team):
        if home_score > away_score:
            return "Home"
        elif away_score > home_score:
            return "Away"
        else:
            return "Draw"
        
    def get_home_win_pct_by_season(self):
        df = self.get_scores()
        test = df[df.Result == "Home"]
        home_win_count = test.groupby(["country_id", "season"], as_index=False).agg({"date": len})
        home_win_count.rename(columns={"date": "home_wins"}, inplace=True)
        game_count = df.groupby(["country_id", "season"], as_index=False).agg({"date": len})
        game_count.rename(columns={"date": "total_games"}, inplace=True)
        final = pd.merge(game_count, home_win_count, on=["country_id", "season"])
        final["home_win_pct"] = final.apply(lambda row: round(row["home_wins"]/row["total_games"], 3), axis=1)
        final = self.country_df.merge(final, left_on="id", right_on="country_id")
        final.drop(["id", "country_id"], axis=1, inplace=True)
        return final
    
    def get_win_pct_by_team(self):
        df = self.get_scores()
        teams = self.team_df.team_long_name.unique()
        rows = list()
        for team in teams:
            mask = (df.home_team == team) | (df.away_team==team)
            games = len(df[mask])
            wins = len(df[df.Winner == team])
            pct = round(wins/games, 3)
            row = [team, pct]
            rows.append(row)

        df = pd.DataFrame(rows, columns=["Team", "Win_Pct"])
        return df
    
    def get_win_pct_by_team_and_season(self):
        df = self.get_scores()
        teams = self.team_df.team_long_name.unique()
        rows = list()
        for team in teams:
            mask = (df.home_team == team) | (df.away_team==team)
            test = df[mask]
            seasons = test.season.unique()
            for season in seasons: 
                temp_df = test[test.season == season]
                games = len(temp_df)
                wins = len(temp_df[temp_df.Winner == team])
                pct = round(wins/games, 3)
                row = [team, season, pct]
                rows.append(row)
        final = pd.DataFrame(rows, columns= ["Team", "Season", "Win_Pct"])
        return final
    
    def get_team_home_goal_differential_by_season(self):
        arr = temp.match_df.home_team_api_id.unique()
        rows = list()
        for team in arr:
            df = temp.match_df[temp.match_df.home_team_api_id == team]
            df = df.groupby("season", as_index=False).agg({"home_team_goal": np.sum, "away_team_goal": np.sum, "date":len})
            df.rename(columns={"home_team_goal": "home_goals_scored", "away_team_goal": "home_goals_conceded",
                              "date": "home_matches_played"}, inplace=True)
            seasons = list(df.season)
            goals_for = list(df.home_goals_scored)
            goals_against = list(df.home_goals_conceded)
            matches_played = list(df.home_matches_played)
            for i in range(0, len(seasons)):
                row = [team, seasons[i], goals_for[i], goals_against[i], matches_played[i]]
                rows.append(row)


        df = pd.DataFrame(rows, columns=["Team_ID", "Season", "Home_Goals_For", "Home_Goals_Against", "Home_Matches_Played"])



        teams = temp.team_df.copy()[["team_api_id", "team_long_name"]]

        final  = pd.merge(teams, df, left_on="team_api_id", right_on="Team_ID")
        final.drop("Team_ID", axis=1, inplace=True)
        final.rename(columns={"team_long_name": "Team"}, inplace=True)
        final["Home_GD"] = final.apply(lambda row: row["Home_Goals_For"]-row["Home_Goals_Against"], axis=1)
        return final
    
    def get_team_away_goal_differential_by_season(self):
        arr = temp.match_df.home_team_api_id.unique()
        rows = list()
        for team in arr:
            df = temp.match_df[temp.match_df.away_team_api_id == team]
            df = df.groupby("season", as_index=False).agg({"home_team_goal": np.sum, "away_team_goal": np.sum, "date":len})
            df.rename(columns={"away_team_goal": "away_goals_scored", "home_team_goal": "away_goals_conceded",
                              "date": "away_matches_played"}, inplace=True)
            seasons = list(df.season)
            goals_for = list(df.away_goals_scored)
            goals_against = list(df.away_goals_conceded)
            matches_played = list(df.away_matches_played)
            for i in range(0, len(seasons)):
                row = [team, seasons[i], goals_for[i], goals_against[i], matches_played[i]]
                rows.append(row)


        df = pd.DataFrame(rows, columns=["Team_ID", "Season", "Away_Goals_For", "Away_Goals_Against", "Away_Matches_Played"])



        teams = temp.team_df.copy()[["team_api_id", "team_long_name"]]

        final  = pd.merge(teams, df, left_on="team_api_id", right_on="Team_ID")
        final.drop("Team_ID", axis=1, inplace=True)
        final.rename(columns={"team_long_name": "Team"}, inplace=True)
        final["Away_GD"] = final.apply(lambda row: row["Away_Goals_For"]-row["Away_Goals_Against"], axis=1)
        return final
    
    def get_goal_diff_df_by_team_and_season(self):
        away = temp.get_team_away_goal_differential_by_season()
        home = temp.get_team_home_goal_differential_by_season()
        df = pd.merge(home, away, on=["team_api_id", "Team","Season"])
        df["Goals_For"] = df.apply(lambda row: row["Home_Goals_For"] + row["Away_Goals_For"], axis=1)
        df["Goals_Against"] = df.apply(lambda row: row["Home_Goals_Against"] + row["Away_Goals_Against"], axis=1)
        df["Goal_Differential"] = df.apply(lambda row: row["Goals_For"] - row["Goals_Against"],axis=1)
        df["Matches_Played"] = df.apply(lambda row: row["Home_Matches_Played"] + row["Away_Matches_Played"], axis=1)
        df["Home_Away_Diff"] = df.apply(lambda row: row["Home_GD"] - row["Away_GD"], axis=1)
        df.drop(["Home_Goals_For", "Home_Goals_Against", "Home_Matches_Played", "Away_Goals_For",
       "Away_Goals_Against", "Away_Matches_Played"], axis=1, inplace=True)
        leagues = self.get_team_leagues()
        df = pd.merge(df, leagues, on="Team")
        return df
    
    def get_team_leagues(self):
        df1 = self.match_df[["league_id", "home_team_api_id"]].drop_duplicates()
        df1.rename(columns={"home_team_api_id": "team_api_id"}, inplace=True)
        df2 = self.league_df[["id", "name"]]
        df2.rename(columns={"id": "league_id"}, inplace=True)
        df3 = self.team_df[["team_api_id", "team_long_name"]]
        df3.rename(columns={"team_long_name": "Team"}, inplace=True)
        final = pd.merge(df1, df2, on="league_id")
        final = pd.merge(final, df3, on="team_api_id")
        final.rename(columns={"name": "League"}, inplace=True)
        return final[["League", "Team"]]
        
        
        
    


temp = soccerData(match_df, country_df, team_df, player_df, team_atts_df, player_atts_df, league_df)

In [364]:
a = temp.get_team_leagues()
b = temp.get_goal_diff_df_by_team_and_season()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [366]:
b.groupby()

Unnamed: 0,team_api_id,Team,Season,Home_GD,Away_GD,Goals_For,Goals_Against,Goal_Differential,Matches_Played,Home_Away_Diff,League
0,9987,KRC Genk,2008/2009,2,-5,48,51,-3,34,7,Belgium Jupiler League
1,9987,KRC Genk,2009/2010,3,-1,33,31,2,28,4,Belgium Jupiler League
2,9987,KRC Genk,2010/2011,22,15,64,27,37,30,7,Belgium Jupiler League
3,9987,KRC Genk,2011/2012,25,-9,60,44,16,30,34,Belgium Jupiler League
4,9987,KRC Genk,2012/2013,18,5,63,40,23,30,13,Belgium Jupiler League
5,9987,KRC Genk,2014/2015,13,-3,38,28,10,30,16,Belgium Jupiler League
6,9987,KRC Genk,2015/2016,16,-4,42,30,12,30,20,Belgium Jupiler League
7,9993,Beerschot AC,2008/2009,13,-11,44,42,2,34,24,Belgium Jupiler League
8,9993,Beerschot AC,2009/2010,-2,-11,30,43,-13,28,9,Belgium Jupiler League
9,9993,Beerschot AC,2010/2011,4,-20,24,40,-16,30,24,Belgium Jupiler League


In [308]:
temp.team_df.columns

Index(['id', 'team_api_id', 'team_fifa_api_id', 'team_long_name',
       'team_short_name'],
      dtype='object')

In [306]:
temp.league_df

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [305]:
temp.match_df[["league_id", "home_team_api_id"]].drop_duplicates()

Unnamed: 0,league_id,home_team_api_id
0,1,9987
1,1,10000
2,1,9984
3,1,9991
4,1,7947
5,1,8203
6,1,9999
7,1,4049
8,1,10001
9,1,8342
