The research question is what experience profile maximizes productivity?
For example, consider consider a course with three sections (one quiz per section) and a final. 
Quiz A is easy (everyone gets 100), Quiz B is medium (everyone gets 80), and Quiz C is hard (everyone gets 60). 
The order A-B-C reflects a declining success sequence (“I lost it”). 
In contrast, the order C-B-A reflects an improving sequence (“I got better”). 
Therefore, I hypothesize that the order C-B-A would lead to better performance in the final than A-B-C. 
Such findings could extend to management (assigning tasks to new employees) and experimental design (ordering blocks).

# Imported Data (jfjelstul)

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Load data
matches = pd.read_csv(r'C:\Documents\Pitt\Research\Memory and Ability Expectation\WC Data\jfjelstul-worldcup-github\matches.csv')
group_standings = pd.read_csv(r'C:\Documents\Pitt\Research\Memory and Ability Expectation\WC Data\jfjelstul-worldcup-github\group_standings.csv')

# Limit to 1990 and above (no group stage before)
matches['year'] = matches['tournament_id'].str[-4:].astype(int)
group_matches = matches.loc[matches['year']>=1990].copy()
group_matches = group_matches.loc[group_matches['stage_name']=='group stage']
group_matches['match_date'] =  pd.to_datetime(group_matches['match_date'])
group_matches = group_matches.sort_values(by=['match_date','match_time']).reset_index(drop=True)
print('Number of group stage matches after 1990:', len(group_matches))

# Limit to 1990 and above
group_standings['year'] = group_standings['tournament_id'].str[-4:].astype(int)
group_standings = group_standings.loc[group_standings['year']>=1990].copy()
# Limit to first group stage (in 1974-1982 there was a second one -- not random)
group_standings = group_standings.loc[group_standings['stage_name'].isin(['second group stage','final round'])==False].copy()
print('Number of teams after 1990:', len(group_standings))
teams = group_standings.drop_duplicates(subset=['tournament_id','team_name']).copy()
teams

Number of group stage matches after 1990: 408
Number of teams after 1990: 272


Unnamed: 0,key_id,tournament_id,tournament_name,stage_number,stage_name,group_name,position,team_id,team_name,team_code,played,wins,draws,losses,goals_for,goals_against,goal_difference,points,advanced,year
218,219,WC-1990,1990 FIFA World Cup,1,group stage,Group A,1,T-39,Italy,ITA,3,3,0,0,4,0,4,6,1,1990
219,220,WC-1990,1990 FIFA World Cup,1,group stage,Group A,2,T-20,Czechoslovakia,CSK,3,2,0,1,6,3,3,4,1,1990
220,221,WC-1990,1990 FIFA World Cup,1,group stage,Group A,3,T-05,Austria,AUT,3,1,0,2,2,3,-1,2,0,1990
221,222,WC-1990,1990 FIFA World Cup,1,group stage,Group A,4,T-80,United States,USA,3,0,0,3,2,8,-6,0,0,1990
222,223,WC-1990,1990 FIFA World Cup,1,group stage,Group B,1,T-11,Cameroon,CMR,3,2,0,1,3,5,-2,4,1,1990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,486,WC-2022,2022 FIFA World Cup,1,group stage,Group G,4,T-64,Serbia,SRB,3,0,1,2,5,8,-3,1,0,2022
486,487,WC-2022,2022 FIFA World Cup,1,group stage,Group H,1,T-56,Portugal,PRT,3,2,0,1,6,4,2,6,1,2022
487,488,WC-2022,2022 FIFA World Cup,1,group stage,Group H,2,T-69,South Korea,KOR,3,1,1,1,4,4,0,4,1,2022
488,489,WC-2022,2022 FIFA World Cup,1,group stage,Group H,3,T-81,Uruguay,URY,3,1,1,1,2,2,0,4,0,2022


# Scraped Data

In [2]:
df = pd.read_csv(r'C:\Documents\Pitt\Research\Memory and Ability Expectation\WC Data\TeamRankings.csv')
df

Unnamed: 0,tournament_id,team_name,fifa_rank,betting_odds,fifa_retro_score,Grp,GP,W,D,L,GF,GA,GD,Pts,Result
0,WC-1950,Brazil,,,,,,,,,,,,,
1,WC-1950,Yugoslavia,,,,,,,,,,,,,
2,WC-1950,Switzerland,,,,,,,,,,,,,
3,WC-1950,Mexico,,,,,,,,,,,,,
4,WC-1950,Spain,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,WC-2022,Serbia,21.0,100.0,29.0,G,3.0,0.0,1.0,2.0,5.0,8.0,-3,1.0,Eliminated in the group stage
441,WC-2022,Portugal,9.0,14.0,8.0,H,5.0,3.0,0.0,2.0,12.0,6.0,6,9.0,Eliminated in the quarter-finals
442,WC-2022,South Korea,28.0,250.0,16.0,H,4.0,1.0,1.0,2.0,5.0,8.0,-3,4.0,Eliminated in the round of 16
443,WC-2022,Uruguay,14.0,50.0,20.0,H,3.0,1.0,1.0,1.0,2.0,2.0,0,4.0,Eliminated in the group stage


In [3]:
# FIFA rank, betting odds, within group ranking

df = pd.read_csv(r'C:\Documents\Pitt\Research\Memory and Ability Expectation\WC Data\TeamRankings.csv')

# Keep only tournaments with betting odds or fifa rank (i.e., after 1990)
df['year'] = df['tournament_id'].str[-4:].astype(int)
df = df.loc[df['year']>=1990].copy()
df.reset_index(drop=True,inplace=True)

# Rank teams by betting odds

# to break ties in betting odds, use fifa rank  
df = df.sort_values(by=['year','Grp','fifa_rank']).reset_index(drop=True)
df['within_group_rank'] = df.groupby(['year','Grp'],as_index=False)['betting_odds'].rank('first')

# In the group stage, each team has one hard match (i.e., against the team with the lowest betting odds in their group)
# one medium match, and one easy match.
df[['hard_team_stregnth','med_team_stregnth','easy_team_stregnth']] = np.NaN, np.NaN, np.NaN
with pd.option_context('mode.chained_assignment', None):
    for i in range(len(df)):
        # Create a list of betting ods rank
        temp = df.loc[(df['year']==df['year'][i]) & (df['Grp']==df['Grp'][i])]
        temp = temp['betting_odds'].nlargest(4).reset_index(drop=True)
        # Assign appropriate values for each team's within group rank
        if df['betting_odds'][i] == temp[3]:
            df['hard_team_stregnth'][i], df['med_team_stregnth'][i], df['easy_team_stregnth'][i] = temp[2], temp[1], temp[0]
        elif df['betting_odds'][i] == temp[2]:
            df['hard_team_stregnth'][i], df['med_team_stregnth'][i], df['easy_team_stregnth'][i] = temp[3], temp[1], temp[0]
        elif df['betting_odds'][i] == temp[1]:
            df['hard_team_stregnth'][i], df['med_team_stregnth'][i], df['easy_team_stregnth'][i] = temp[3], temp[2], temp[0]
        else:
            df['hard_team_stregnth'][i], df['med_team_stregnth'][i], df['easy_team_stregnth'][i] = temp[3], temp[2], temp[1]

#### Merge home and away rank for every group match (based on betting odds and fifa rank)

In [4]:
# merge home team rank to matches
#print(len(group_matches))
group_matches = group_matches.merge(df[['tournament_id','team_name','within_group_rank']], 
                                    left_on=['tournament_id','home_team_name'], right_on=['tournament_id','team_name'])
#print(len(group_matches))
group_matches.drop(columns='team_name',inplace=True)
group_matches.rename(columns={'within_group_rank':'home_rank'},inplace=True)

# merge away team rank to matches
#print(len(group_matches))
group_matches = group_matches.merge(df[['tournament_id','team_name','within_group_rank']], 
                                    left_on=['tournament_id','away_team_name'], right_on=['tournament_id','team_name'])
#print(len(group_matches))
group_matches.drop(columns='team_name',inplace=True)
group_matches.rename(columns={'within_group_rank':'away_rank'},inplace=True)

group_matches

Unnamed: 0,key_id,tournament_id,tournament_name,match_id,match_name,stage_name,group_name,group_stage,knockout_stage,replayed,...,score_penalties,home_team_score_penalties,away_team_score_penalties,result,home_team_win,away_team_win,draw,year,home_rank,away_rank
0,413,WC-1990,1990 FIFA World Cup,M-1990-01,Argentina v Cameroon,group stage,Group B,1,0,0,...,0-0,0,0,away team win,0,1,0,1990,1.0,4.0
1,425,WC-1990,1990 FIFA World Cup,M-1990-13,Argentina v Soviet Union,group stage,Group B,1,0,0,...,0-0,0,0,home team win,1,0,0,1990,1.0,2.0
2,438,WC-1990,1990 FIFA World Cup,M-1990-26,Cameroon v Soviet Union,group stage,Group B,1,0,0,...,0-0,0,0,away team win,0,1,0,1990,4.0,2.0
3,437,WC-1990,1990 FIFA World Cup,M-1990-25,Argentina v Romania,group stage,Group B,1,0,0,...,0-0,0,0,draw,0,0,1,1990,1.0,3.0
4,414,WC-1990,1990 FIFA World Cup,M-1990-02,Soviet Union v Romania,group stage,Group B,1,0,0,...,0-0,0,0,away team win,0,1,0,1990,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,939,WC-2022,2022 FIFA World Cup,M-2022-39,Poland v Argentina,group stage,Group C,1,0,0,...,0-0,0,0,away team win,0,1,0,2022,3.0,1.0
404,944,WC-2022,2022 FIFA World Cup,M-2022-44,Japan v Spain,group stage,Group E,1,0,0,...,0-0,0,0,home team win,1,0,0,2022,3.0,1.0
405,942,WC-2022,2022 FIFA World Cup,M-2022-42,Croatia v Belgium,group stage,Group F,1,0,0,...,0-0,0,0,draw,0,0,1,2022,2.0,1.0
406,947,WC-2022,2022 FIFA World Cup,M-2022-47,Cameroon v Brazil,group stage,Group G,1,0,0,...,0-0,0,0,home team win,1,0,0,2022,4.0,1.0


#### Compute difficulty order and results for each team based on group matches

In [5]:
df['match_difficulty'] = "" # e.g., 123 is similar to hard-medium-easy
df['match_results'] = "" # e.g., LDW is lose-draw-win

for j in range(len(df)):
    # create a dataframe with the three group stage matches played by each team in a given WC
    year = df['year'][j]
    team = df['team_name'][j]
    temp = group_matches.loc[(group_matches['year']==year) & 
                             (group_matches['match_name'].str.contains(team, case=False))].reset_index(drop=True)
    difficulty_str = ""
    results_str = ""
    
    # Create a string with the results and difficulty order
    for i in range(3):
        if (temp['home_team_name'][i]==team): # The team of interest is the home team
            difficulty_str = difficulty_str + str(int(temp['away_rank'][i])) # within-group rank of opposition
            if (temp['home_team_win'][i]==1):
                results_str = results_str + 'W'
            elif (temp['away_team_win'][i]==1):
                results_str = results_str + 'L'
            else:
                results_str = results_str + 'D'

        else: # The team of interest is the away team
            difficulty_str = difficulty_str + str(int(temp['home_rank'][i])) # within-group rank of opposition
            if (temp['away_team_win'][i]==1):
                results_str = results_str + 'W'
            elif (temp['home_team_win'][i]==1):
                results_str = results_str + 'L'
            else:
                results_str = results_str + 'D'
            
    df.loc[j, 'match_difficulty'] = difficulty_str
    df.loc[j, 'match_results'] = results_str
    
# Convert match difficulty numbers (i.e., 1 is highest rank, and 4 is lowest) to letters
# For example, 412 is Easy-Hard-Medium (EHM)
def digits_to_letters(string_of_numbers):
    numbers = [int(num) for num in string_of_numbers]
    unique_numbers = sorted(set(numbers))
    letter_map = {unique_numbers[0]: 'H', unique_numbers[-1]: 'E'}
    medium_numbers = [num for num in numbers if num not in (unique_numbers[0], unique_numbers[-1])]
    letter_map[medium_numbers[0]] = 'M'
    letters = [letter_map[num] for num in numbers]
    return ''.join(letters)
df['match_difficulty_letters'] = df['match_difficulty'].apply(digits_to_letters) 

# Convert match result letters (e.g., WWW is win-win-win) to sequence type 
# Possibly sequences: stable, increasing, weakly increasing, decreasing, weakly decreasing, varying
def convert_sequence_category(s):
    s = pd.Series([l for l in s])
    s = s.replace({'W': 3, 'D': 2, 'L': 1})
    diffs = s.diff().iloc[1:]
    if all(diffs == 0):
        return 'stable'
    elif all(diffs > 0):
        return 'increasing'
    elif all(diffs >= 0):
        return 'weakly increasing'
    elif all(diffs < 0):
        return 'decreasing'
    elif all(diffs <= 0):
        return 'weakly decreasing'
    else:
        return 'vary'

df['result_sequence_type'] = df['match_results'].apply(convert_sequence_category)

# standardize betting odds within a world cup
df[['hard_team_stregnth_sc', 'med_team_stregnth_sc', 'easy_team_stregnth_sc', 'betting_odds_sc']] = df[['year', 'hard_team_stregnth', 'med_team_stregnth', 'easy_team_stregnth', 'betting_odds']].groupby('year').transform(lambda x: StandardScaler().fit_transform(x.values[:,np.newaxis]).ravel())

# sequence of difficulty
df['hard_before_easy'] = 1*(df['match_difficulty_letters'].isin(['HEM','HME','MHE']))

df

Unnamed: 0,tournament_id,team_name,fifa_rank,betting_odds,fifa_retro_score,Grp,GP,W,D,L,...,easy_team_stregnth,match_difficulty,match_results,match_difficulty_letters,result_sequence_type,hard_team_stregnth_sc,med_team_stregnth_sc,easy_team_stregnth_sc,betting_odds_sc,hard_before_easy
0,WC-1990,Italy,,3.0,3.0,A,7.0,6.0,1.0,0.0,...,2000.0,342,WWW,MEH,stable,2.101416,1.633531,1.825084,-0.540332,0
1,WC-1990,Czechoslovakia,,40.0,6.0,A,5.0,3.0,0.0,2.0,...,2000.0,143,LWW,HEM,weakly increasing,-0.789455,1.633531,1.825084,-0.469994,1
2,WC-1990,Austria,,80.0,18.0,A,3.0,1.0,0.0,2.0,...,2000.0,142,LWL,HEM,vary,-0.789455,-0.278896,1.825084,-0.393952,1
3,WC-1990,United States,,2000.0,23.0,A,3.0,0.0,0.0,3.0,...,80.0,132,LLL,HEM,stable,-0.789455,-0.278896,-1.101758,3.256054,1
4,WC-1990,Cameroon,,500.0,7.0,B,5.0,3.0,0.0,2.0,...,50.0,123,WLW,HME,vary,-0.164402,-1.330730,-1.147490,0.404487,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,WC-2022,Cameroon,43.0,250.0,19.0,G,3.0,1.0,1.0,1.0,...,100.0,231,LDW,MEH,increasing,-0.653344,-0.528714,-1.712237,0.572870,0
268,WC-2022,Portugal,9.0,14.0,8.0,H,5.0,3.0,0.0,2.0,...,250.0,423,WWL,EHM,weakly decreasing,1.007663,1.147395,-0.562763,-0.924643,0
269,WC-2022,Uruguay,14.0,50.0,20.0,H,3.0,1.0,1.0,1.0,...,250.0,314,DLW,MHE,vary,-0.292256,1.147395,-0.562763,-0.696209,1
270,WC-2022,South Korea,28.0,250.0,16.0,H,4.0,1.0,1.0,2.0,...,250.0,241,DLW,MEH,vary,-0.292256,-0.824498,-0.562763,0.572870,0


In [6]:
# descriptives of match difficulty order and results sequence
print(df.groupby('result_sequence_type')['fifa_retro_score'].mean())
print()
print(df.groupby('match_difficulty_letters')['fifa_retro_score'].mean())
print()

# probability of results order given difficulty order
prob_matrix = pd.crosstab(df['result_sequence_type'], df['match_difficulty_letters'], normalize='index')
print(df['match_difficulty_letters'].value_counts())
prob_matrix[['EMH','HME']] # First is increasing difficulty, second is decreasing
# prob_matrix[['EMH','MEH','EHM','HME','MHE','HEM']] # First three are Hard before Easy, last three are opposite

result_sequence_type
decreasing           14.545455
increasing           15.500000
stable               17.595745
vary                 16.195652
weakly decreasing    14.333333
weakly increasing    15.257576
Name: fifa_retro_score, dtype: float64

match_difficulty_letters
EHM    13.864865
EMH    15.585366
HEM    18.126984
HME    15.960784
MEH    14.365854
MHE    15.358974
Name: fifa_retro_score, dtype: float64

match_difficulty_letters
HEM    63
HME    51
MEH    41
EMH    41
MHE    39
EHM    37
Name: count, dtype: int64


match_difficulty_letters,EMH,HME
result_sequence_type,Unnamed: 1_level_1,Unnamed: 2_level_1
decreasing,0.090909,0.0
increasing,0.0,0.375
stable,0.12766,0.234043
vary,0.152174,0.152174
weakly decreasing,0.333333,0.104167
weakly increasing,0.060606,0.272727


In [7]:
# Round of 16 odds

# Create a dataframe with the round of 16 matches with betting odds for each team
matches_16 = matches.loc[matches['stage_name']=='round of 16'].copy()
matches_16 = matches_16.loc[matches['year']>=1990].copy()
matches_16['match_date'] =  pd.to_datetime(matches_16['match_date'])
matches_16 = matches_16.sort_values(by=['match_date','match_time']).reset_index(drop=True)
# merge home team rank to matches
#print(len(group_matches))
matches_16 = matches_16.merge(df[['tournament_id','team_name','betting_odds_sc']], 
                                    left_on=['tournament_id','home_team_name'], right_on=['tournament_id','team_name'])
#print(len(group_matches))
matches_16.drop(columns='team_name',inplace=True)
matches_16.rename(columns={'betting_odds_sc':'home_odds'},inplace=True)
# merge away team rank to matches
#print(len(group_matches))
matches_16 = matches_16.merge(df[['tournament_id','team_name','betting_odds_sc']], 
                                    left_on=['tournament_id','away_team_name'], right_on=['tournament_id','team_name'])
#print(len(group_matches))
matches_16.drop(columns='team_name',inplace=True)
matches_16.rename(columns={'betting_odds_sc':'away_odds'},inplace=True)


# Create a variable of round of 16 opponnent strength per team
df['opponent_16_odds'] = np.NaN
for j in range(len(df)):
    # create a dataframe with the three group stage matches played by each team in a given WC
    year = df['year'][j]
    team = df['team_name'][j]
    temp = matches_16.loc[(matches_16['year']==year) & 
                             (matches_16['match_name'].str.contains(team, case=False))].reset_index(drop=True)
    if (len(temp)==0): # he team of interest didn't make it to round of 16
        opponent_odds = np.NaN
    elif (temp['home_team_name'][0]==team): # The team of interest is the home team
        opponent_odds = temp['away_odds'][0]
    elif (temp['away_team_name'][0]==team): # The team of interest is the away team
        opponent_odds = temp['home_odds'][0]
    df.loc[j, 'opponent_16_odds'] = opponent_odds

In [8]:
df['betting_odds'].describe()
#df['hard_team_stregnth_sc'].describe()

count     272.000000
mean      193.108761
std       360.590878
min         2.500000
25%        20.000000
50%        80.000000
75%       200.000000
max      2500.000000
Name: betting_odds, dtype: float64

In [9]:
# Regression on dummy whether eliminated in group stage
df['pass_grp_stg'] = 1*(df['Result']!='Eliminated in the group stage')
mod = smf.logit(formula='pass_grp_stg ~ hard_before_easy + betting_odds_sc + hard_team_stregnth_sc + med_team_stregnth_sc + easy_team_stregnth_sc ', data=df)
res = mod.fit(cov_type='HC3')
res.get_margeff().summary()
# res.summary()

Optimization terminated successfully.
         Current function value: 0.539864
         Iterations 6


0,1
Dep. Variable:,pass_grp_stg
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
hard_before_easy,-0.1474,0.049,-2.978,0.003,-0.244,-0.05
betting_odds_sc,-0.163,0.044,-3.709,0.0,-0.249,-0.077
hard_team_stregnth_sc,0.0752,0.032,2.382,0.017,0.013,0.137
med_team_stregnth_sc,0.0822,0.033,2.497,0.013,0.018,0.147
easy_team_stregnth_sc,0.0165,0.028,0.592,0.554,-0.038,0.071


Facing the hard team before the easy team in the group (decreasing difficulty) makes it 16% less likely for the team to pass the group stage.

In [10]:
# Regression on passing round of 16 conditional on passing group stage
df['pass_16'] = 1*(df['Result'].isin(['Eliminated in the group stage','Eliminated in the round of 16'])==False)
mod = smf.logit(formula='pass_16 ~ hard_before_easy + betting_odds_sc + opponent_16_odds', data=df.loc[df['pass_grp_stg']==1])
res = mod.fit(cov_type='HC3')
res.get_margeff().summary()
#res.summary()

Optimization terminated successfully.
         Current function value: 0.595218
         Iterations 6


0,1
Dep. Variable:,pass_16
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
hard_before_easy,0.0526,0.075,0.697,0.486,-0.095,0.2
betting_odds_sc,-0.3217,0.136,-2.372,0.018,-0.587,-0.056
opponent_16_odds,0.3157,0.137,2.312,0.021,0.048,0.583


But conditional on passign the group stage, increasing difficulty in the group stage (remembered as an improving sequence) makes it 5% more likely to pass.

In [11]:
# Regression on FIFA retro score (lower is better)
mod = smf.ols(formula='fifa_retro_score ~ betting_odds_sc + hard_team_stregnth_sc + med_team_stregnth_sc + easy_team_stregnth_sc + hard_before_easy', data=df)
# res = mod.fit(cov_type='cluster', cov_kwds={'groups': df['year']})
res = mod.fit(cov_type='HC3')
res.summary()

0,1,2,3
Dep. Variable:,fifa_retro_score,R-squared:,0.362
Model:,OLS,Adj. R-squared:,0.35
Method:,Least Squares,F-statistic:,34.26
Date:,"Fri, 10 May 2024",Prob (F-statistic):,5.61e-27
Time:,10:06:52,Log-Likelihood:,-922.35
No. Observations:,272,AIC:,1857.0
Df Residuals:,266,BIC:,1878.0
Df Model:,5,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,14.9067,0.687,21.694,0.000,13.560,16.253
betting_odds_sc,3.1871,0.516,6.176,0.000,2.176,4.199
hard_team_stregnth_sc,-2.1624,0.451,-4.791,0.000,-3.047,-1.278
med_team_stregnth_sc,-1.3937,0.516,-2.702,0.007,-2.405,-0.383
easy_team_stregnth_sc,-0.6372,0.485,-1.315,0.189,-1.587,0.313
hard_before_easy,1.5777,0.916,1.723,0.085,-0.217,3.372

0,1,2,3
Omnibus:,2.366,Durbin-Watson:,2.257
Prob(Omnibus):,0.306,Jarque-Bera (JB):,2.42
Skew:,0.197,Prob(JB):,0.298
Kurtosis:,2.757,Cond. No.,3.29


Overall, increasing difficulty leads to worse outcome by 1.5 places.

In [12]:
# Regression on FIFA retro score (lower is better) for teams passing group stage
mod = smf.ols(formula='fifa_retro_score ~ betting_odds_sc + opponent_16_odds + hard_before_easy', 
              data=df.loc[df['pass_grp_stg']==1])
# res = mod.fit(cov_type='cluster', cov_kwds={'grounps': df['year']})
res = mod.fit(cov_type='HC3')
res.summary()

0,1,2,3
Dep. Variable:,fifa_retro_score,R-squared:,0.162
Model:,OLS,Adj. R-squared:,0.144
Method:,Least Squares,F-statistic:,6.65
Date:,"Fri, 10 May 2024",Prob (F-statistic):,0.000313
Time:,10:06:52,Log-Likelihood:,-411.69
No. Observations:,144,AIC:,831.4
Df Residuals:,140,BIC:,843.3
Df Model:,3,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.9181,0.784,11.374,0.000,7.381,10.455
betting_odds_sc,2.1923,0.954,2.298,0.022,0.322,4.062
opponent_16_odds,-1.9643,0.586,-3.353,0.001,-3.113,-0.816
hard_before_easy,-0.6740,0.730,-0.923,0.356,-2.105,0.757

0,1,2,3
Omnibus:,31.888,Durbin-Watson:,2.186
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7.013
Skew:,0.046,Prob(JB):,0.03
Kurtosis:,1.923,Cond. No.,3.15


But, increasing difficulty improve outcomes for teams who managed to pass the group stage by 0.7 places.

In [13]:
# Regression on Group Stage Points (higher is better)
mod = smf.ols(formula='Pts ~ within_group_rank + hard_team_stregnth_sc + med_team_stregnth_sc + easy_team_stregnth_sc + hard_before_easy', data=df)
res = mod.fit(cov_type='HC3')
res.summary()

0,1,2,3
Dep. Variable:,Pts,R-squared:,0.419
Model:,OLS,Adj. R-squared:,0.408
Method:,Least Squares,F-statistic:,29.26
Date:,"Fri, 10 May 2024",Prob (F-statistic):,1.2199999999999999e-23
Time:,10:06:52,Log-Likelihood:,-725.12
No. Observations:,272,AIC:,1462.0
Df Residuals:,266,BIC:,1484.0
Df Model:,5,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,9.9075,0.868,11.408,0.000,8.205,11.610
within_group_rank,-1.5772,0.301,-5.246,0.000,-2.166,-0.988
hard_team_stregnth_sc,1.1164,0.319,3.494,0.000,0.490,1.743
med_team_stregnth_sc,0.5612,0.328,1.709,0.087,-0.082,1.205
easy_team_stregnth_sc,0.0230,0.234,0.098,0.922,-0.436,0.481
hard_before_easy,-0.8913,0.437,-2.041,0.041,-1.747,-0.035

0,1,2,3
Omnibus:,20.644,Durbin-Watson:,2.109
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.554
Skew:,0.583,Prob(JB):,2.83e-06
Kurtosis:,3.945,Cond. No.,11.5
