# FUMBBL data report

This report presents a few analyses of FUMBBL match data, such as the types of matches played and star player usage.



## Read the FUMBBL data

In [None]:
import random

import numpy as np
import pandas as pd
import plotnine as p9

from mizani.formatters import date_format

# Folder holding the exported FUMBBL datasets (CSV files); adjust to your local checkout
path_to_datasets = '../datasets/v0.8/'

# FUMBBL matches
df_matches = pd.read_csv(path_to_datasets + 'df_matches.csv')

# FUMBBL matches by team
# NOTE: the first column of this CSV is a leftover row index (known issue, still to be fixed)
df_mbt = pd.read_csv(path_to_datasets + 'df_mbt.csv')

# FUMBBL inducements
inducements = pd.read_csv(path_to_datasets + 'inducements.csv')

# Raise the display limits so result tables of up to 500 rows/columns are not truncated
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)


# Fix and enrich data

In [None]:
# FUMBBL quarterly volumes: a ';'-separated file read from the notebook's
# working directory (not from path_to_datasets)
target = 'fumbbl_match_counts.csv'
fumbbl_volumes = pd.read_csv(target, sep = ';')

# Build labels such as "2020-Q3" from the year/quarter columns and let pandas
# parse them into timestamps, so the frame can be merged on `quarter_date`
quarter_labels = fumbbl_volumes['year'].astype(str) + '-Q' + fumbbl_volumes['quarter'].astype(str)
fumbbl_volumes['quarter_date'] = pd.to_datetime(quarter_labels)

In [None]:
def add_period_columns(df):
    """Parse `match_date` and add quarter/month helper columns to `df` in place.

    Adds `quarter` and `month` (pandas Periods) plus `quarter_date` and
    `month_date` (the timestamp at the start of each period, which is what the
    plots and the date-string filters further down work with).

    Returns the same (mutated) DataFrame.
    """
    df['match_date'] = pd.to_datetime(df['match_date'])
    df['quarter'] = df['match_date'].dt.to_period('Q')
    df['month'] = df['match_date'].dt.to_period('M')
    df['quarter_date'] = pd.PeriodIndex(df['quarter'], freq='Q').to_timestamp()
    df['month_date'] = pd.PeriodIndex(df['month'], freq='M').to_timestamp()
    return df


# --- df_matches ---------------------------------------------------------
add_period_columns(df_matches)
df_matches['week_date'] = pd.to_datetime(df_matches['week_date'])

# Every match whose scheduler mentions "Blackbox" is reported under a single
# 'Blackbox' division label, whatever division it was originally filed under
df_matches.loc[df_matches['scheduler'].str.contains("Blackbox", na=False), 'division_name'] = 'Blackbox'

# Coach-rating difference bins. pd.cut uses right-closed intervals, so e.g. a
# difference of exactly -30 falls in the first bin.
# The '[-5,5]' label is referenced verbatim by a query further down.
cr_diff_edges = [-float("inf"), -30, -20, -10, -5, 5, 10, 20, 30, float("inf")]
cr_diff_labels = ['[-Inf,-30]', '[-30,-20]', '[-20,-10]', '[-10,-5]', '[-5,5]',
                  '[5,10]', '[10,20]', '[20,30]', '[30,Inf]']
df_matches['cr_diff2_bin'] = pd.cut(df_matches['cr_diff2'], bins = cr_diff_edges, labels = cr_diff_labels)

# --- df_mbt -------------------------------------------------------------
add_period_columns(df_mbt)
# week_date comes out of the CSV as text; parse it like in df_matches so the
# weekly per-race plot can put it on a datetime axis
df_mbt['week_date'] = pd.to_datetime(df_mbt['week_date'])

In [None]:
# Draw one random match from Blackbox/Competitive in July 2024 and show the
# inducements recorded for it
divisions = ['Blackbox', 'Competitive']

match_ids = df_matches.query('month_date == "2024-07-01" & division_name in @divisions')['match_id'].tolist()

# Fixed seed so the same match is drawn on every run
random.seed(12345)

# random.sample returns a list (here with a single match id); inside query(),
# `==` against a list is evaluated as a membership test
test_id = random.sample(match_ids, 1)

inducements.query('match_id == @test_id')


In [None]:
# First df_matches row whose match_id is among the sampled id(s) in `test_id`
df_matches.loc[df_matches['match_id'].isin(test_id)].iloc[0]

# What data do we have? Weekly game volumes


Let's see what we've got! The pandas DataFrame `df_matches` contains records for all matches played on FUMBBL from August 2020 onwards.

Since we have a proper `datetime` type variable for each week (`week_date`), we can use `pandas` and `plotnine` to plot the weekly game volume as a time series.

The introduction of the new **Competitive division** with BB2020 rules is marked by a vertical red line. The World Cup in Alicante is marked as well.

In [None]:
# Quarterly match counts computed from the API data in df_matches
res = (df_matches
    .loc[(df_matches['week_date'] >= '2020-08-01' ) & (df_matches['week_date'] < '2025-03-25')]
    .groupby(['quarter_date'])
    .agg(
        n_games_api = ('match_id', "count")
    )
    .reset_index()) # turn the group-by key back into a regular column

# Right join: keep every quarter present in the API counts and attach the
# columns of fumbbl_volumes (presumably including `n_games`, which the plot
# below uses) where a matching quarter exists
res = fumbbl_volumes.merge(res, how = 'right', on = 'quarter_date')

# Large points + line: `n_games` from fumbbl_volumes; small red points: API counts
(p9.ggplot(data = res, mapping = p9.aes(x = 'quarter_date', y = 'n_games'))
+ p9.geom_point(size = 3)
+ p9.geom_point(mapping = p9.aes(y = 'n_games_api'), color = "red")
+ p9.geom_line()
+ p9.expand_limits(y=[0,2000])
+ p9.geom_vline(xintercept = '2021-09-01', color = "red") # BB2020 ruleset on FUMBBL
+ p9.geom_vline(xintercept = '2023-09-07', color = "red") # world cup
+ p9.theme(figure_size = (10, 5))
+ p9.ggtitle("Quarterly game volume on FUMBBL since August 2020"))

Next, we break the volumes down by division at weekly resolution: Competitive, Blackbox and "Other".
The Ranked division was changed to Competitive.


The blackbox trophy (BBT) is a lengthy competitive division meta event. Coaches select a squad of 4 unique teams with a budget of 7 points.

Blackbox (BBT 1-5) was off during July 2021/July 2022.
Season 6 started in August 2022.

In [None]:
# Weekly match counts per division, computed from the API data in df_matches
res = (df_matches
    .loc[(df_matches['week_date'] >= '2020-08-01' ) & (df_matches['week_date'] < '2025-03-25')]
    .groupby(['week_date', 'division_name', 'division_id'])
    .agg(
        n_games_api = ('match_id', "count")
    )
    .reset_index()) # turn the group-by keys back into regular columns

# One coloured series per division; the title now matches the weekly
# resolution and the date window actually selected above
(p9.ggplot(data = res, mapping = p9.aes(x = 'week_date', y = 'n_games_api', color = 'division_name'))
+ p9.geom_point(size = 3)
+ p9.geom_line()
+ p9.expand_limits(y=[0,2000])
+ p9.geom_vline(xintercept = '2021-09-01', color = "red")
+ p9.geom_vline(xintercept = '2023-09-07', color = "black") # world cup
+ p9.theme(figure_size = (10, 5))
+ p9.ggtitle("Weekly game volume on FUMBBL by division, August 2020 - March 2025"))

In [None]:
# Display the aggregated frame currently bound to `res`
res

# Star player usage over time 



In [None]:
divisions = ['Blackbox', 'Competitive', 'Ranked', 'League']

# Per week: number of matches and the share of matches flagged `has_sp`
res = (
    df_matches
    .query("division_name in @divisions")
    .groupby(['week_date'])
    .agg(n_games = ('match_id', 'count'), perc_sp = ('has_sp', 'mean'))
    .reset_index()
    .sort_values("week_date", ascending=False)
)

# Black reference lines; the first one is BBT6 and the last three are
# BBT11, BBT12 and BBT13
bbt_marker_dates = [
    '2022-08-01',
    '2023-01-01',
    '2023-05-01',
    '2023-09-01',
    '2024-01-01',
    '2024-05-01',
    '2024-09-01',
    '2025-01-01',
]
bbt_markers = [p9.geom_vline(xintercept = marker_date, color = "black") for marker_date in bbt_marker_dates]

# Only weeks with more than 30 matches are plotted; point area scales with the
# number of matches in that week
(p9.ggplot(data = res.query("n_games > 30"), mapping = p9.aes(x = 'week_date', y = 'perc_sp*100'))
    + p9.geom_point(p9.aes(size = 'n_games'))
    + p9.expand_limits(y=[0,1])
    + p9.scale_size_area()
    + p9.geom_vline(xintercept = '2021-09-01', color = "red")
    + bbt_markers
    + p9.ggtitle("Star player usage over time")
    + p9.theme(figure_size = (10, 6))
    + p9.ylab("% matches with at least one Star Player"))

# Star Players

So on a weekly basis, 20-25% of matches are played with one or more star players.
With a weekly volume of 1000-2000 this means roughly 200 star players a week. 
We also have the teams: divide by 30. Around 10 per team per week.

In [None]:
# The 15 most frequent `inducements` values among rows flagged star_player == 1
star_rows = inducements[inducements['star_player'] == 1]

(star_rows
 .groupby('inducements')
 .agg(n_teams = ('star_player', 'count'))
 .reset_index()
 .sort_values("n_teams", ascending=False)
 .head(15))

# Are coach ratings predictive of match outcomes?

For the main divisions on FUMBBL, ELO style coach ratings are available that are updated after each game.
The coach rankings are explained on [this help page](https://fumbbl.com/help:Ranking).

According to the ELO ranking system, a coach rating difference of 40 should result in 85% wins for the higher ranked coach.
Coaches of equal rating should have a win rate of 0.5 (with draws weighted at half point).

The range of coach rankings observed for a particular game tells us something about the relationship between skill and luck.
If a game is pure luck, we will never observe large differences in coach rating, since the outcome will be determined by a coin flip, independent of coach skill. 

On FUMBBL, coach ratings vary roughly between 125 and 175. What do we expect if a coach with a rating of 175 plays a coach of rating 145? Well, the rating difference is 30. According to the formula (assuming equal team strength and equal races), the expected win probability is $1/(1 + 10^{-0.75}) \approx 85\%$, and the probability of loss is $1/(1 + 10^{0.75}) \approx 15\%$.

Since our CR we obtained from the FUMBBL match result page is an overall coach rating (i.e. it ignores division), we can simply pool all matches from divisions where coach rating is tracked.

The match data contains a **Coach Ranking Difference** bin (category) that we can use to calculate the win/draw/loss percentages for each category.

Let's see what the actual percentages are:

In [None]:
main_divisions = ['Blackbox', 'Ranked', 'Competitive']

# Count matches per (coach-rating-difference bin, team1 outcome).
# `cr_diff2_bin` is categorical; observed=False is passed explicitly so the
# result does not change (and no FutureWarning is raised) when the pandas
# default for categorical groupers changes.
res = (df_matches[df_matches['division_name'].isin(main_divisions)]
    .query('match_date < "2024-03-28"') # has a clear effect, new CR less predictive?
    .groupby(['cr_diff2_bin', 'team1_win'], observed=False)
    .agg(
        n_games = ('cr_diff2_bin', "count"),
    )
    .sort_values(by=['cr_diff2_bin'])
    .reset_index()) # turn the group-by keys back into regular columns

# add total games played within each bin
res['n_games_bin'] = res.groupby('cr_diff2_bin', observed=False).n_games.transform('sum')

# share of each outcome within its bin
res['perc'] = res['n_games']/res['n_games_bin']

(p9.ggplot(res, p9.aes(x = 'factor(cr_diff2_bin)', y = 'perc', fill = 'factor(team1_win)'))
    + p9.geom_bar(position = "fill", stat = "identity")
    + p9.theme(axis_text_x= p9.element_text(rotation=90, hjust=1))
    + p9.ggtitle('probability of win/draw/loss as a function of Coach Rating difference')
)

In [None]:
main_divisions = ['Blackbox',  'Competitive']

# String version of month_date, used as the facet variable below
df_matches["month_date_Cat"] = df_matches["month_date"].astype("category").astype("string") # work around 2 bugs in plotnine

# Number of matches per division, month and coach-rating-difference bin.
# observed=False is passed explicitly because `cr_diff2_bin` is categorical and
# the implicit pandas default for categorical groupers is deprecated.
res = (df_matches[df_matches['division_name'].isin(main_divisions)]
    .query('match_date > "2021-12-31" & match_date < "2022-09-01"') # January through August 2022
    .groupby(['division_name', 'month_date_Cat', 'cr_diff2_bin'], observed=False)
    .agg(
        n_games = ('cr_diff2_bin', "count"),
    )
    .sort_values(by=['month_date_Cat'])
    .reset_index()) # turn the group-by keys back into regular columns


# One facet per month, divisions side by side within each bin
(p9.ggplot(res, p9.aes(x = 'factor(cr_diff2_bin)', y = 'n_games', group = 'factor(division_name)', fill = 'factor(division_name)'))
    + p9.geom_bar(stat = 'identity', position = 'dodge')
    + p9.facet_wrap('month_date_Cat', ncol = 4)
    + p9.theme(axis_text_x= p9.element_text(rotation=90, hjust=1))
    + p9.ggtitle('dist of matches by Coach Rating difference')
)

In [None]:
# May 2022 matches in the "[-5,5]" coach-rating-difference bin;
# expect these CRs to have been changed now
shown_columns = ['match_id', 'team1_coach_id', 'coach1_ranking', 'coach2_ranking', 'cr_diff2_bin']

(df_matches
    .query('month_date == "2022-05-01" & cr_diff2_bin == "[-5,5]"')
    .filter(items = shown_columns))

In [None]:
# May/June 2022 matches where team 1 was coached by coach id 34520 (Volkajo);
# expect these CRs to have been changed now
shown_columns = ['match_id', 'match_date', 'coach1_ranking', 'coach2_ranking', 'division_name', 'cr_diff2_bin']

(df_matches
    .query('month_date in ["2022-05-01", "2022-06-01"]   & team1_coach_id == 34520')
    .filter(items = shown_columns))

In [None]:
# Look up the df_mbt rows recorded for match 4380230
df_mbt.query('match_id == 4380230')

start checking change in CR. plot CR for indiv coaches. add coach name CR to df_mbt. also add legend etc categories.

# Which races score the most touchdowns, and which have the most scored against them?

In [None]:
divisions = ['Competitive']

tv_bins = ['1.1M', '1.4M', '1.7M']

# Average touchdowns scored / conceded per race and TV bin.
# A single query() carries every filter: no mirror matches, no star players,
# only the selected TV bins and divisions.
res = (df_mbt
    .query("mirror_match == 0 & has_sp == 0 & tv_bin in @tv_bins & division_name in @divisions")
    .groupby(['race_name', 'tv_bin'])
    .agg(
        avg_td = ('team_score', "mean"),
        avg_away_td = ('away_team_score', "mean"),
        n_games = ('race_name', "count")
    )
    .sort_values( 'n_games', ascending = False)
    .reset_index() # turn the group-by keys back into regular columns
    .dropna())

# Races ordered by average TD conceded; only race/TV-bin combinations with more
# than 100 games are shown, point area scales with the number of games
(p9.ggplot(data = res.query('n_games > 100'),
            mapping = p9.aes(y = 'reorder(race_name, avg_away_td)', x = 'avg_away_td',
                            size = 'n_games', group = 'factor(tv_bin)',
                            color = 'factor(tv_bin)'))
    + p9.geom_point()
    + p9.scale_size_area()
    + p9.ggtitle("Competitive Division: average TD against per game"))

In [None]:
# Same layout as the TD-against plot, but ordered and positioned by the
# average number of touchdowns the race itself scores; reads `res` as left
# by the cell that computes avg_td / avg_away_td
well_sampled = res.query('n_games > 100')
td_scored_aes = p9.aes(
    y = 'reorder(race_name, avg_td)',
    x = 'avg_td',
    size = 'n_games',
    group = 'factor(tv_bin)',
    color = 'factor(tv_bin)',
)

(p9.ggplot(data = well_sampled, mapping = td_scored_aes)
    + p9.geom_point()
    + p9.scale_size_area()
    + p9.ggtitle("Competitive Division: average TD per game"))

# Top star players ranking

In [None]:
# Top 10 star players in BB2020 (Blackbox + Competitive) for one quarter
divisions = ['Blackbox', 'Competitive']
q3_2024 = pd.to_datetime(["2024-07-01", "2024-08-01", "2024-09-01"])
q4_2024 = pd.to_datetime(["2024-10-01", "2024-11-01", "2024-12-01"])
# Quarter being reported: Q4 2024. Switch to q3_2024 for the third quarter.
period_range = q4_2024

# Team-level columns attached to each inducement row (joined on match + team)
team_columns = ['match_id', 'division_name', 'race_name', 'month_date', 'tv_diff', 'team']

sp_count = (inducements
.merge(df_mbt[team_columns], how='left', on=['match_id', 'team']) # add race name etc. to inducements
.query("star_player == 1 and division_name in @divisions & month_date in @period_range") # star players in the selected divisions and months
.assign(Name = lambda x:x['inducements'].str.replace(r'Star player ', '', regex=True))
# step 1: games per (star player, inducing race)
.groupby(['Name', 'race_name'])
.agg(
    n_games = ('match_id', 'count'),
    med_tv_diff = ('tv_diff', 'median')
)
.reset_index() # remove grouping structure
# sort so that, within each star player, the race with the most games comes first
.sort_values('n_games',ascending = False)
# step 2: roll up to one row per star player
.groupby(['Name'])
.agg(
    n_games_tot = ('n_games', 'sum'),
    n_races = ('race_name', 'nunique'),
    med_tv_diff = ('med_tv_diff', 'median'),
    # first row of the group = race with the most games (see sort above);
    # return a scalar rather than a length-1 Series
    highest_race = ('race_name', lambda s: s.iloc[0]),
    highest_games = ('n_games', 'max')
)
.reset_index() # remove grouping structure
.assign(rank = lambda x:x['n_games_tot'].rank(method = 'first', ascending = False))
.sort_values('rank',ascending = True)
.query('rank < 11')
)

sp_count[['rank', 'Name', 'n_games_tot', 'n_races']]

In [None]:
divisions = ['Blackbox', 'Competitive']
period_range = pd.to_datetime(["2024-09-01", "2024-08-01", "2024-07-01"]) #Q3 2024

# All Blackbox/Competitive matches in the selected months
q3_matches = df_matches.query("division_name in @divisions & month_date in @period_range")

# Collapse to a single 'Q3' row: number of matches, share of matches flagged
# `has_sp`, and the sum of that flag
res = (
    q3_matches
    .assign(period = 'Q3')
    .groupby(['period'])
    .agg(
        n_games = ('match_id', 'count'),
        perc_sp = ('has_sp', 'mean'),
        n_sp = ('has_sp', 'sum'),
    )
    .reset_index()
)
res

Update on which Star Players are popular choices to induce.

In the third quarter of 2024, 18108 games of Blood Bowl were played online on FUMBBL (https://fumbbl.com). 

Of these, 10098 games were played in the Competitive and Black Box divisions.
In 15% of these games, at least one star player was induced. These were the top 10 star players induced.

    Name              n_games n_diff_races
1.	Fungus the Loon	      375	       9
2.	Akhorne the Squirrel	291	      28
3.	Helmut Wulf	          140	      27
4.	Skitter Stab-Stab	     94	      10
5.	Rodney Roachbait	     91	       2
6.	Varag Ghoul-Chewer	   90	       8
7.	Puggy Baconbreath	     78	       8
8.	Nobbla Blackwart	     77	       8
9.	Rumbelow Sheepskin	   52	       8
10.	Mighty Zug	           46	       9

The third column counts how many different team races induced the Star Player.
Rodney Roachbait for example was only induced by two different team races (Gnomes and Halflings).

# BB2020 game volume by race

In [None]:
# The most-played races in the Competitive division (FUMBBL BB2020); the
# per-race series below are restricted to these
n_top_races = 30
top_races = (df_mbt
.query("division_name == 'Competitive'") # filter
.groupby(['race_name'])
.agg(
    n_games = ('match_id', 'count')
)
.reset_index()
.sort_values('n_games',ascending = False)
.head(n_top_races)['race_name'])



divisions = [ 'Competitive']

# Weekly number of df_mbt rows per race (top races only)
res = (df_mbt
.query("division_name in @divisions and race_name in @top_races")
.groupby(['division_name', 'race_name', 'week_date'])
.agg(
    n_games = ('match_id', 'count')
)
.reset_index()
.sort_values("n_games", ascending=False)
)

# Weekly totals over all races, labelled 'total' so they get a facet of their own
res2 = (df_mbt
.assign(race_name = 'total')
.query("division_name in @divisions")
.groupby(['division_name', 'race_name', 'week_date'])
.agg(
    n_games = ('match_id', 'count')
)
.reset_index()
.sort_values("n_games", ascending=False)
)

resx = pd.concat([res, res2], axis = 0)

# One facet per race (plus 'total'), each with its own y scale; weeks after 2024-01-01 only
(p9.ggplot(data = resx.query("week_date > '2024-01-01'"), mapping = p9.aes(x = 'week_date', y = 'n_games',
group = 'factor(race_name)', color = 'factor(race_name)'))
    + p9.geom_point()
    + p9.geom_line()
    + p9.expand_limits(y=[0,1])
    + p9.facet_wrap('race_name', scales = 'free_y', ncol = 3)
    + p9.scale_x_datetime(labels = date_format('%b'))
    + p9.ggtitle("BB2020 Matches by race")
    + p9.theme(figure_size = (10, 12))
    + p9.ylab("Number of matches")
    + p9.guides(color = False)
    + p9.theme(subplots_adjust={'wspace': 0.25}))
    

# inducements for a given race, as a function of CTV difference

In [None]:
divisions = ['Blackbox', 'Competitive', 'League']

# Matches played after 2024-05-28 in the selected divisions.
# The two conditions must be joined with `&`; without an operator between
# them the query expression is not valid syntax.
res = df_mbt.query("match_date > '2024-05-28' & division_name in @divisions")

# Match ids used to filter the inducements table
match_ids = res['match_id'].values

In [None]:
# Star player inducement counts, restricted to the matches listed in `match_ids`
res = (inducements.query("match_id in @match_ids")
.query('inducements != "1 bribe" & star_player == 1')
.groupby(['inducements'])
.agg(
    n_games = ('inducements', 'count')
)
.reset_index()
.sort_values("n_games", ascending=False)
)
# Show the top 20, starting with the most frequent entry
# (a positional slice starting at 1 would skip it)
res.head(20)

# check out the group structures

There is the NAF online Tournaments group, a French league and a US league.

In [None]:
# Match counts and first/last week per scheduler / division / tournament / group.
# dropna=False keeps combinations where some of the keys are missing.
res = (df_matches
    .loc[(df_matches['week_date'] >= '2023-08-01' ) & (df_matches['week_date'] < '2025-03-25')]
    .groupby(['scheduler', 'division_id', 'division_name', 'tournament_id', 'group_id', 'group_name',
'tournament_type', 'tournament_progression',
'tournament_name', 'tournament_start'], dropna=False)
    .agg(
        n_games = ('match_id', "count") ,
        # string names instead of the builtins min/max: passing the builtin
        # callables to agg is deprecated in pandas
        start_week = ('week_date', 'min'),
        end_week = ('week_date', 'max')
    )
    .reset_index() # turn the group-by keys back into regular columns
    .sort_values("start_week", ascending=True)
    )

res.query('n_games > 50 & division_id == 2')

In [None]:
# Match counts and first/last week per scheduler and division.
# dropna=False keeps combinations where some of the keys are missing.
res = (df_matches
    .loc[(df_matches['week_date'] >= '2023-08-01' ) & (df_matches['week_date'] < '2025-03-25')]
    .groupby(['scheduler', 'division_id', 'division_name'], dropna=False)
    .agg(
        n_games = ('match_id', "count") ,
        # string names instead of the builtins min/max: passing the builtin
        # callables to agg is deprecated in pandas
        start_week = ('week_date', 'min'),
        end_week = ('week_date', 'max')
    )
    .reset_index() # turn the group-by keys back into regular columns
    .sort_values("start_week", ascending=True)
    )

res

In [None]:
# select BB11 Trophy rosters: Blackbox-scheduled rows of team 1192881, oldest match first
is_selected_team = df_mbt['team_id'] == 1192881
is_blackbox = df_mbt['scheduler'] == "Blackbox"

df_mbt[is_selected_team & is_blackbox].sort_values('match_date')