In [1]:
import pandas as pd
import numpy as np
# Read the match records from cricket_data.csv (path is relative to the
# working directory) into the frame every later cell works on.
df = pd.read_csv('cricket_data.csv')
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,ground,date,winner,host_country,home_team,away_team,neutral
0,Melbourne,"Jan 5, 1971",Australia,Australia,Australia,England,False
1,Manchester,"Aug 24, 1972",England,England,England,Australia,False
2,Lord's,"Aug 26, 1972",Australia,England,England,Australia,False
3,Birmingham,"Aug 28, 1972",England,England,England,Australia,False
4,Christchurch,"Feb 11, 1973",New Zealand,New Zealand,New Zealand,Pakistan,False


In [2]:
# Entities naming
# Every entity gets a type prefix so that a match, a ground, a country and a
# team can never collide on the same string.
df["match_id"] = "Match" + pd.Series(df.index.values.astype(str), index=df.index)

# id column -> (prefix, column holding the raw name). Names are title-cased and
# stripped of spaces before the prefix is attached.
id_sources = {
    "ground_id": ("Ground", "ground"),
    "country_id": ("Country", "host_country"),
    "home_team_id": ("Team", "home_team"),
    "away_team_id": ("Team", "away_team"),
}
for id_column, (prefix, source_column) in id_sources.items():
    df[id_column] = prefix + df[source_column].str.title().str.replace(" ", "")

# Store the neutral flag as text, like every other entity value.
df["neutral"] = df["neutral"].astype(str)

In [3]:
# Size of the training split: 80% of the rows, rounded to a whole number.
rows_for_training = 0.8 * len(df)
num_train = int(np.round(rows_for_training))
num_train

5891

In [4]:
# Flag rows whose index label is below num_train as training rows.
# NOTE(review): assumes the index is the default 0..n-1 range in file order - confirm.
df["train"] = num_train > df.index

In [5]:
# How many rows landed on each side of the train flag.
df.train.value_counts()

True     5891
False    1473
Name: train, dtype: int64

In [6]:
# Turn every training match into (subject, predicate, object) triples.
# Per match the order is: home side, away side, winner, loser, ground,
# country, neutral flag, year.
triples = []
for _, row in df[df["train"]].iterrows():
    match = row["match_id"]
    home, away = row["home_team_id"], row["away_team_id"]

    # Home and away information
    triples.append((home, "isHomeTeamIn", match))
    triples.append((away, "isAwayTeamIn", match))

    # Match results
    # The away side is only credited with the win when it really is the
    # recorded winner. A winner that matches neither side (presumably a tie or
    # a no-result - verify against the data) yields no result triples instead
    # of a false "away team won" fact.
    if row["winner"] == row["home_team"]:
        triples.append((home, "winnerOf", match))
        triples.append((away, "loserOf", match))
    elif row["winner"] == row["away_team"]:
        triples.append((away, "winnerOf", match))
        triples.append((home, "loserOf", match))

    # Match characteristics
    triples.append((match, "inGround", row["ground_id"]))
    triples.append((match, "inCountry", row["country_id"]))
    triples.append((match, "isNeutral", row["neutral"]))
    # The year is the last four characters of the date string
    # (the previewed rows look like "Jan 5, 1971").
    triples.append((match, "atYear", row["date"][-4:]))

In [7]:
# One row per triple, with the three positions named.
triple_columns = ["subject", "predicate", "object"]
triples_df = pd.DataFrame(data=triples, columns=triple_columns)
triples_df.head()

Unnamed: 0,subject,predicate,object
0,TeamAustralia,isHomeTeamIn,Match0
1,TeamEngland,isAwayTeamIn,Match0
2,TeamAustralia,winnerOf,Match0
3,TeamEngland,loserOf,Match0
4,Match0,inGround,GroundMelbourne


In [8]:
# Every triple that mentions Match555, whether as subject or as object.
as_subject = triples_df["subject"] == "Match555"
as_object = triples_df["object"] == "Match555"
match555 = triples_df[as_subject | as_object]
match555

Unnamed: 0,subject,predicate,object
4440,TeamIndia,isHomeTeamIn,Match555
4441,TeamSriLanka,isAwayTeamIn,Match555
4442,TeamIndia,winnerOf,Match555
4443,TeamSriLanka,loserOf,Match555
4444,Match555,inGround,GroundAhmedabad
4445,Match555,inCountry,CountryIndia
4446,Match555,isNeutral,False
4447,Match555,atYear,1989


In [9]:
# Preview the frame again to inspect the id columns and the train flag added
# since it was loaded.
df.head()

Unnamed: 0,ground,date,winner,host_country,home_team,away_team,neutral,match_id,ground_id,country_id,home_team_id,away_team_id,train
0,Melbourne,"Jan 5, 1971",Australia,Australia,Australia,England,False,Match0,GroundMelbourne,CountryAustralia,TeamAustralia,TeamEngland,True
1,Manchester,"Aug 24, 1972",England,England,England,Australia,False,Match1,GroundManchester,CountryEngland,TeamEngland,TeamAustralia,True
2,Lord's,"Aug 26, 1972",Australia,England,England,Australia,False,Match2,GroundLord'S,CountryEngland,TeamEngland,TeamAustralia,True
3,Birmingham,"Aug 28, 1972",England,England,England,Australia,False,Match3,GroundBirmingham,CountryEngland,TeamEngland,TeamAustralia,True
4,Christchurch,"Feb 11, 1973",New Zealand,New Zealand,New Zealand,Pakistan,False,Match4,GroundChristchurch,CountryNewZealand,TeamNewZealand,TeamPakistan,True


In [10]:
def helper(row):
    """Label a match from the home side's point of view.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys ``'home_team'`` and ``'winner'``.

    Returns
    -------
    str
        ``"home_team_wins"`` when the two values compare equal, otherwise
        ``"home_team_loses"``. Every other outcome falls into the second
        label, whatever ``'winner'`` holds.
    """
    home_won = row['home_team'] == row['winner']
    return "home_team_wins" if home_won else "home_team_loses"

In [11]:
# Apply helper to each row to get the outcome label for every match.
df["results"] = df.apply(helper, axis="columns")

In [12]:
# Share of matches per outcome label (fractions rather than raw counts).
df["results"].value_counts(normalize=True)

home_team_wins     0.556898
home_team_loses    0.443102
Name: results, dtype: float64

In [15]:
# Distinct away-team ids, in order of first appearance.
df["away_team_id"].unique()

array(['TeamEngland', 'TeamAustralia', 'TeamPakistan', 'TeamNewZealand',
       'TeamWestIndies', 'TeamIndia', 'TeamSriLanka', 'TeamEastAfrica',
       'TeamCanada', 'TeamZimbabwe', 'TeamBangladesh', 'TeamSouthAfrica',
       'TeamU.A.E.', 'TeamKenya', 'TeamNetherlands', 'TeamScotland',
       'TeamNamibia', 'TeamHongKong', 'TeamU.S.A.', 'TeamIreland',
       'TeamBermuda', 'TeamAfghanistan', 'TeamP.N.G.'], dtype=object)

In [13]:
# Number of matches each team played as the home side, most frequent first.
df["home_team_id"].value_counts()

TeamAustralia      964
TeamIndia          855
TeamNewZealand     712
TeamPakistan       695
TeamSriLanka       678
TeamWestIndies     676
TeamEngland        656
TeamSouthAfrica    586
TeamZimbabwe       497
TeamBangladesh     373
TeamKenya          161
TeamIreland        123
TeamScotland        99
TeamCanada          77
TeamNetherlands     74
TeamU.A.E.          46
TeamAfghanistan     38
TeamBermuda         20
TeamHongKong        17
TeamNamibia          7
TeamP.N.G.           4
TeamU.S.A.           3
TeamEastAfrica       3
Name: home_team_id, dtype: int64

In [18]:
# Count the distinct team ids on each side and show them as one
# (home, away) tuple. A cell only displays its last expression, so written as
# two separate statements the home count was computed and then thrown away.
len(df.home_team_id.unique()), len(df.away_team_id.unique())

23

In [26]:
# Collect the distinct home-team ids as a plain Python list.
# NOTE(review): despite the name this holds every home team id in the data
# (23 in the recorded output), and the name is reassigned to a hand-written
# list further down - confirm which one later cells are meant to use.
top20teams = df["home_team_id"].unique().tolist()
top20teams

['TeamAustralia',
 'TeamEngland',
 'TeamNewZealand',
 'TeamEastAfrica',
 'TeamSriLanka',
 'TeamPakistan',
 'TeamIndia',
 'TeamWestIndies',
 'TeamCanada',
 'TeamBangladesh',
 'TeamSouthAfrica',
 'TeamZimbabwe',
 'TeamU.A.E.',
 'TeamNetherlands',
 'TeamKenya',
 'TeamScotland',
 'TeamNamibia',
 'TeamHongKong',
 'TeamBermuda',
 'TeamIreland',
 'TeamAfghanistan',
 'TeamP.N.G.',
 'TeamU.S.A.']

In [29]:
# Scratch check: confirm that .unique().tolist() yields a built-in list.
type(df.home_team_id.unique().tolist())

list

In [30]:
# Hand-picked list of 20 team ids; it replaces any earlier value of top20teams.
# NOTE(review): TeamNepal, TeamOman and TeamSingapore are not among the team
# ids printed earlier in this notebook - confirm they are intended.
top20teams = [
    "TeamAustralia",
    "TeamEngland",
    "TeamIndia",
    "TeamPakistan",
    "TeamSouthAfrica",
    "TeamNewZealand",
    "TeamSriLanka",
    "TeamBangladesh",
    "TeamWestIndies",
    "TeamAfghanistan",
    "TeamZimbabwe",
    "TeamIreland",
    "TeamU.A.E.",
    "TeamScotland",
    "TeamNepal",
    "TeamP.N.G.",
    "TeamNetherlands",
    "TeamOman",
    "TeamNamibia",
    "TeamSingapore",
]

In [31]:
# Scratch check: confirm top20teams is a built-in list.
type(top20teams)

list