In [1]:
import json
import os
from collections import Counter

import numpy as np

In [13]:
# Walk open-data/data/matches (including subfolders) and collect every match
# record found in its .json files into one flat list. Each file's top-level
# JSON value is iterated and its items are added to `matches`.
matches = []

for root, dirs, files in os.walk("open-data/data/matches"):
    # Sort in place so the traversal order (and therefore the order of
    # `matches`) does not depend on the filesystem's directory listing order.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".json"):
            # Read as UTF-8 explicitly instead of relying on the platform's
            # default encoding, which can mis-decode non-ASCII names.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                matches.extend(json.load(f))

2886


In [14]:
# Total number of match records loaded into `matches` (one list entry per record)
len(matches)

2886

In [15]:
# How many distinct match ids appear (equal to len(matches) when there are no duplicates)
len({record['match_id'] for record in matches})

2886

In [16]:
# How many distinct competition ids appear across all matches
len({record['competition']['competition_id'] for record in matches})

19

In [18]:
# How many distinct season ids appear across all matches
len({record['season']['season_id'] for record in matches})

44

In [19]:
# Distinct team ids, counting a team whether it appeared at home or away
teams = {
    team_id
    for match in matches
    for team_id in (match['home_team']['home_team_id'], match['away_team']['away_team_id'])
}
len(teams)

276

In [22]:
# Count how often each "home-away" scoreline occurs.
# Counter replaces the manual "if key in dict" tally; it is a dict subclass,
# so `scorelines` can still be used like the plain dict it was before.
scorelines = Counter(
    f"{match['home_score']}-{match['away_score']}" for match in matches
)

# most_common() orders by count, highest first; scorelines with equal counts
# keep the order in which they were first encountered.
sorted_scorelines = dict(scorelines.most_common())
sorted_scorelines

{'1-1': 285,
 '1-0': 259,
 '2-1': 244,
 '2-0': 216,
 '0-1': 208,
 '1-2': 180,
 '0-0': 180,
 '0-2': 141,
 '2-2': 132,
 '3-1': 127,
 '3-0': 119,
 '1-3': 90,
 '0-3': 83,
 '4-0': 65,
 '3-2': 61,
 '4-1': 54,
 '0-4': 53,
 '2-3': 50,
 '5-0': 39,
 '5-1': 34,
 '1-4': 33,
 '3-3': 33,
 '4-2': 26,
 '0-5': 24,
 '2-4': 20,
 '6-0': 17,
 '6-1': 16,
 '1-5': 14,
 '5-2': 13,
 '0-6': 8,
 '3-4': 7,
 '4-3': 7,
 '6-2': 6,
 '1-6': 4,
 '7-0': 4,
 '0-8': 3,
 '2-5': 3,
 '7-1': 3,
 '8-0': 3,
 '0-7': 3,
 '4-4': 2,
 '3-5': 2,
 '4-5': 2,
 '1-7': 2,
 '6-4': 1,
 '2-6': 1,
 '10-2': 1,
 '8-2': 1,
 '5-3': 1,
 '13-0': 1,
 '3-6': 1,
 '9-0': 1,
 '8-1': 1,
 '1-9': 1,
 '11-1': 1}

In [23]:
# Tally match outcomes: level scores, home side ahead, or away side ahead
draws = home_wins = away_wins = 0

for match in matches:
    home_goals, away_goals = match['home_score'], match['away_score']
    if home_goals == away_goals:
        draws += 1
    elif home_goals > away_goals:
        home_wins += 1
    else:
        away_wins += 1

draws, home_wins, away_wins

(632, 1321, 933)

In [26]:
# Number of matches per competition id.
# Counter replaces the manual tally; it is a dict subclass, so
# `competition_matches` can still be used like a plain dict.
competition_matches = Counter(
    match['competition']['competition_id'] for match in matches
)

# Display name for each competition id, taken from
# match['competition']['competition_name']; if an id appears with several
# names, the last one seen is kept.
competition_names = {
    match['competition']['competition_id']: match['competition']['competition_name']
    for match in matches
}

# Most-played competitions first, keyed by name instead of id.
# NOTE: two ids sharing one name would collapse into a single entry here.
sorted_competition_matches = {
    competition_names[competition_id]: count
    for competition_id, count in competition_matches.most_common()
}
sorted_competition_matches

{'La Liga': 868,
 'Premier League': 418,
 'Serie A': 381,
 "FA Women's Super League": 326,
 '1. Bundesliga': 306,
 'FIFA World Cup': 147,
 "Women's World Cup": 116,
 'Indian Super league': 115,
 'Ligue 1': 58,
 'UEFA Euro': 51,
 'NWSL': 36,
 "UEFA Women's Euro": 31,
 'Champions League': 17,
 'Major League Soccer': 6,
 'UEFA Europa League': 3,
 'Copa del Rey': 3,
 'Liga Profesional': 2,
 'North American League': 1,
 'FIFA U20 World Cup': 1}

In [27]:
# Per-team totals (appearances, results, goals for/against), keyed by team id
team_stats = {}

for match in matches:
    home, away = match['home_team'], match['away_team']
    home_goals, away_goals = match['home_score'], match['away_score']

    # Decide the result once per match, then credit each side accordingly
    if home_goals == away_goals:
        home_result, away_result = 'draws', 'draws'
    elif home_goals > away_goals:
        home_result, away_result = 'wins', 'losses'
    else:
        home_result, away_result = 'losses', 'wins'

    sides = (
        (home['home_team_id'], home['home_team_name'], home_goals, away_goals, home_result),
        (away['away_team_id'], away['away_team_name'], away_goals, home_goals, away_result),
    )
    for team_id, team_name, goals_for, goals_against, result in sides:
        # A team's first appearance creates its zeroed record
        record = team_stats.setdefault(team_id, {
            'team_name': team_name, 'matches': 0, 'wins': 0, 'draws': 0,
            'losses': 0, 'goals_scored': 0, 'goals_conceded': 0,
        })
        record['matches'] += 1
        record['goals_scored'] += goals_for
        record['goals_conceded'] += goals_against
        record[result] += 1

# Teams with the most matches first; ties keep first-seen order
sorted_team_stats = dict(
    sorted(team_stats.items(), key=lambda entry: entry[1]['matches'], reverse=True)
)
sorted_team_stats

{217: {'team_name': 'Barcelona',
  'matches': 532,
  'wins': 393,
  'draws': 86,
  'losses': 53,
  'goals_scored': 1409,
  'goals_conceded': 444},
 1: {'team_name': 'Arsenal',
  'matches': 76,
  'wins': 46,
  'draws': 23,
  'losses': 7,
  'goals_scored': 138,
  'goals_conceded': 62},
 220: {'team_name': 'Real Madrid',
  'matches': 71,
  'wins': 39,
  'draws': 13,
  'losses': 19,
  'goals_scored': 162,
  'goals_conceded': 97},
 212: {'team_name': 'Atlético Madrid',
  'matches': 68,
  'wins': 33,
  'draws': 12,
  'losses': 23,
  'goals_scored': 94,
  'goals_conceded': 84},
 213: {'team_name': 'Sevilla',
  'matches': 66,
  'wins': 15,
  'draws': 17,
  'losses': 34,
  'goals_scored': 78,
  'goals_conceded': 120},
 207: {'team_name': 'Valencia',
  'matches': 65,
  'wins': 14,
  'draws': 20,
  'losses': 31,
  'goals_scored': 76,
  'goals_conceded': 106},
 215: {'team_name': 'Athletic Club',
  'matches': 64,
  'wins': 19,
  'draws': 14,
  'losses': 31,
  'goals_scored': 78,
  'goals_conceded'