# Votes Vs Odds

In [1]:
import pandas as pd
import numpy as np

df = pd.read_excel('sofascore_data.xlsx')

# Re-calculate implied probabilities for odds
df['impl prob home'] = df['odd away'] / (df['odd away'] + df['odd home'])
df['impl prob away'] = df['odd home'] / (df['odd away'] + df['odd home'])

### Run betting simulation where we bet against voters when discrepancy between votes and odds is large. Calculate profit per bet.

In [205]:
discrepancies = np.arange(0, 1, 0.1)
votes_ = np.arange(1000, 10000, 1000)

bets = []

for discrepancy in discrepancies:
    for votes in votes_:
        high_votes = df[df['total votes'] > votes]
        high_votes = high_votes[(abs(high_votes['votes prob away'] - high_votes['impl prob away'])/high_votes['votes prob away'] > discrepancy) |
                                (abs(high_votes['votes prob home'] - high_votes['impl prob home'])/high_votes['votes prob home'] > discrepancy)]

        if len(high_votes) < 100:
            break
        
        balance = 1000
        unit_bet = 1

        for id, event in high_votes.iterrows():
            if event['votes prob away'] > event['impl prob away']:
                
                bet = {'Event': event['players'],
                      'Sign': 1,
                      'Outcome': event['winner'],
                      'Profit': unit_bet-unit_bet / event['odd home'] if event['winner'] == 1 else -unit_bet / event['odd home'],
                      'Odds': event['odd home'],
                      'votes threshold': votes,
                      'discrepancy': discrepancy
                      }
                bets.append(bet)
                balance += bet['Profit']
                
            if event['votes prob away'] < event['impl prob away']:
                
                bet = {'Event': event['players'],
                      'Sign': 2,
                      'Outcome': event['winner'],
                      'Profit': unit_bet-unit_bet / event['odd away'] if event['winner'] == 2 else -unit_bet / event['odd away'],
                      'Odds': event['odd away'],
                      'votes threshold': votes,
                      'discrepancy': discrepancy
                      }
                bets.append(bet)
                balance += bet['Profit']

bets = pd.DataFrame(bets)

### Create graph for profit per bet vs 'votes threshold' and 'discrepancy'

In [198]:
profit_per_bet = bets.groupby(['votes threshold', 'discrepancy']).apply(lambda x: x['Profit'].sum() / len(x)).reset_index(name='profit_per_bet')

# Pivot table to get grid
pivot = profit_per_bet.pivot(index='discrepancy', columns='votes threshold', values='profit_per_bet')

X = pivot.columns.values
Y = pivot.index.values
X, Y = np.meshgrid(X, Y)
Z = pivot.values

import plotly.graph_objects as go

fig = go.Figure(data=[go.Surface(
    z=Z, x=X, y=Y, colorscale='Viridis', 
    contours={"z": {"show": True, "usecolormap": True, "highlightcolor": "limegreen", "project_z": True}}
)])

fig.update_layout(
    title='Profit per Bet by Votes Threshold and Discrepancy',
    scene=dict(
        xaxis_title='Votes Threshold',
        yaxis_title='Discrepancy',
        zaxis_title='Profit per Bet'
    ),
    autosize=True,
    width=900,
    height=700,
    margin=dict(l=50, r=50, b=50, t=50)
)

fig.show()






### Print summary of results

In [192]:
bets1 = bets[(bets['votes threshold'] == 5000) & (bets['discrepancy'] == 0.4)]

print(bets1['Profit'].sum())
# print(balance)
print((bets1['Profit']>0).sum())
print(len(bets1))
print(bets1['Profit'].sum() / len(bets1))

2.137676575266029
141
321
0.006659428583383268


In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=options)

# Navigate directly to the API endpoint
driver.get('https://www.sofascore.com/api/v1/sport/football/scheduled-events/2025-10-20')
time.sleep(2)

# The API returns JSON that gets displayed as text
page_source = driver.page_source
driver.quit()

# Extract JSON from the page
# It's usually wrapped in <pre> tags
start = page_source.find('{')
end = page_source.rfind('}') + 1
json_data = json.loads(page_source[start:end])

print(json_data)

{'error': {'code': 403, 'reason': 'challenge'}}


## Scrape all football matches in 2025

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json

dfs = []

dates = pd.date_range('2025-01-01','2025-10-20').date

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=options)

for date in dates:
    
    # Make the request
    # Navigate directly to the API endpoint
    driver.get(f'https://www.sofascore.com/api/v1/sport/football/scheduled-events/{date}')
    time.sleep(2)

    # The API returns JSON that gets displayed as text
    page_source = driver.page_source

    # Extract JSON from the page
    # It's usually wrapped in <pre> tags
    start = page_source.find('{')
    end = page_source.rfind('}') + 1
    json_data = json.loads(page_source[start:end])

    for event in json_data['events']:
        if 'winnerCode' in event.keys():
            dfs.append({
                'winner':event['winnerCode'],
                'match_id':event['id'],
                'players':event['slug'],
                'start timestamp':event['startTimestamp']
                })
            
driver.quit()
        
df = pd.DataFrame(dfs)        
df = df.set_index('match_id')
df = df.drop_duplicates()

In [5]:
df.to_excel('football_matches.xlsx')

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json
import pandas as pd
import numpy as np

df = pd.read_excel('football_matches.xlsx')

# Function that converts fractional odds into decimal odds
def odd_converter(odd):
    
    try:
    
        split_odd = odd.split('/')
        return (int(split_odd[0])+int(split_odd[1]))/int(split_odd[1])
    
    except:
        
        return np.nan

odd_home = []
odd_away = []
votes1=[]
votes2=[]

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=options)

# For each match in the dataset, store odds for home victory and away victory
for matchid in df.index:
    
    try:
        # Make the request
        # Navigate directly to the API endpoint
        driver.get(f'https://www.sofascore.com/api/v1/event/{matchid}/provider/833/winning-odds')
        # time.sleep(2)

        # The API returns JSON that gets displayed as text
        page_source = driver.page_source

        # Extract JSON from the page
        # It's usually wrapped in <pre> tags
        start = page_source.find('{')
        end = page_source.rfind('}') + 1
        json_data = json.loads(page_source[start:end])
        
        odd1 = json_data['featured']['fullTime']['choices'][0]['fractionalValue']
        odd2 = json_data['featured']['fullTime']['choices'][1]['fractionalValue']
        
        odd_home.append(odd_converter(odd1))
        odd_away.append(odd_converter(odd2))

        # Make the request
        # Navigate directly to the API endpoint
        driver.get(f'https://www.sofascore.com/api/v1/event/{matchid}/votes')
        # time.sleep(2)

        # The API returns JSON that gets displayed as text
        page_source = driver.page_source

        # Extract JSON from the page
        # It's usually wrapped in <pre> tags
        start = page_source.find('{')
        end = page_source.rfind('}') + 1
        json_data = json.loads(page_source[start:end])
        
        votes1.append(json_data['vote']['vote1'])
        votes2.append(json_data['vote']['vote2'])
    
    except:
        odd_home.append(np.nan)
        odd_away.append(np.nan)
    
driver.quit()
    
df['odd home'] = odd_home
df['odd away'] = odd_away
df['votes home'] = votes1
df['votes away'] = votes2

ValueError: Length of values (0) does not match length of index (41143)

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json
import pandas as pd
import numpy as np

df = pd.read_excel('football_matches.xlsx')

# Function that converts fractional odds into decimal odds
def odd_converter(odd):
    
    try:
    
        split_odd = odd.split('/')
        return (int(split_odd[0])+int(split_odd[1]))/int(split_odd[1])
    
    except:
        
        return np.nan

odd_home = []
odd_away = []
votes1=[]
votes2=[]

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=options)

driver.get(f'https://www.sofascore.com/api/v1/event/14056467/provider/833/winning-odds')
time.sleep(2)

# The API returns JSON that gets displayed as text
page_source = driver.page_source

# Extract JSON from the page
# It's usually wrapped in <pre> tags
start = page_source.find('{')
end = page_source.rfind('}') + 1
json_data = json.loads(page_source[start:end])

driver.quit()

In [None]:
# Function that converts fractional odds into decimal odds
def odd_converter(odd):
    
    try:
    
        split_odd = odd.split('/')
        return (int(split_odd[0])+int(split_odd[1]))/int(split_odd[1])
    
    except:
        
        return np.nan

odd_home = []
odd_away = []

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=options)

# For each match in the dataset, store odds for home victory and away victory
for matchid in df.index:
    
    try:
        # Make the request
        # Navigate directly to the API endpoint
        driver.get(f'https://www.sofascore.com/api/v1/event/{matchid}/provider/833/winning-odds')
        time.sleep(2)

        # The API returns JSON that gets displayed as text
        page_source = driver.page_source

        # Extract JSON from the page
        # It's usually wrapped in <pre> tags
        start = page_source.find('{')
        end = page_source.rfind('}') + 1
        json_data = json.loads(page_source[start:end])
        
        odd1 = json_data['featured']['fullTime']['choices'][0]['fractionalValue']
        odd2 = json_data['featured']['fullTime']['choices'][1]['fractionalValue']
        
        odd_home.append(odd_converter(odd1))
        odd_away.append(odd_converter(odd2))
    
    except:
        odd_home.append(np.nan)
        odd_away.append(np.nan)
    
driver.quit()
    
df['odd home'] = odd_home
df['odd away'] = odd_away

########################################

"""Get votes for all matches in df"""

votes1=[]
votes2=[]

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=options)

# For all matches in the dataset, store the user votes for home victory and away victory
for matchid in df.index:

    # Make the request
    # Navigate directly to the API endpoint
    driver.get(f'https://www.sofascore.com/api/v1/event/{matchid}/votes')
    time.sleep(2)

    # The API returns JSON that gets displayed as text
    page_source = driver.page_source

    # Extract JSON from the page
    # It's usually wrapped in <pre> tags
    start = page_source.find('{')
    end = page_source.rfind('}') + 1
    json_data = json.loads(page_source[start:end])
    
    votes1.append(json_data['vote']['vote1'])
    votes2.append(json_data['vote']['vote2'])
    
driver.quit()

df['votes home'] = votes1
df['votes away'] = votes2