# Get betting data

In [19]:
import pandas as pd
import numpy as np
import requests
import lxml
import cchardet
import random
import time

from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [2]:
# bout_urls = pd.read_csv("data/bout_urls.csv")
# temp = bout_urls.groupby("event").first().reset_index(drop=False)
# temp = temp.sort_values(by=["date", "event"]).reset_index(drop=True)
# temp

In [3]:
# temp2 = temp[["event", "date"]]
# temp2.to_csv("data/bestfightodds_urls.csv", index=False, header=True)

The bestfightodds.com link for each event has to be collected manually and added to the `url` column of `data/bestfightodds_urls.csv`.

In [4]:
# Event list (event, date, url); the url column is empty for events without a recorded link
odds_links = pd.read_csv(filepath_or_buffer="data/bestfightodds_urls.csv")
odds_links

Unnamed: 0,event,date,url
0,UFC 1: The Beginning,1993-11-12,
1,UFC 2: No Way Out,1994-03-11,
2,UFC 3: The American Dream,1994-09-09,
3,UFC 4: Revenge of the Warriors,1994-12-16,
4,UFC 5: The Return of the Beast,1995-04-07,
...,...,...,...
1067,UFC 284: Makhachev vs. Volkanovski,2023-02-11,https://www.bestfightodds.com/events/ufc-284-m...
1068,UFC Fight Night: Andrade vs. Blanchfield,2023-02-18,https://www.bestfightodds.com/events/ufc-fight...
1069,UFC Fight Night: Muniz vs. Allen,2023-02-25,https://www.bestfightodds.com/events/ufc-fight...
1070,UFC 285: Jones vs. Gane,2023-03-04,https://www.bestfightodds.com/events/ufc-285-2738


### Scrape money line odds from links

**Note (very important):** try not to re-run the following cells. They download several hundred event pages one at a time, which is slow, and it is not worth repeating.

In [5]:
# Keep only the events for which a bestfightodds URL has been recorded
odds_links2 = odds_links.loc[odds_links["url"].notna()]
odds_links2

Unnamed: 0,event,date,url
304,UFC 73: Stacked,2007-07-07,https://www.bestfightodds.com/events/ufc-73-st...
309,UFC 74: Respect,2007-08-25,https://www.bestfightodds.com/events/ufc-74-re...
311,UFC 75: Champion vs Champion,2007-09-08,https://www.bestfightodds.com/events/ufc-75-ch...
312,EliteXC - Uprising,2007-09-15,https://www.bestfightodds.com/events/elitexc-u...
314,UFC Fight Night: Thomas vs Florian,2007-09-19,https://www.bestfightodds.com/events/ufc-fight...
...,...,...,...
1067,UFC 284: Makhachev vs. Volkanovski,2023-02-11,https://www.bestfightodds.com/events/ufc-284-m...
1068,UFC Fight Night: Andrade vs. Blanchfield,2023-02-18,https://www.bestfightodds.com/events/ufc-fight...
1069,UFC Fight Night: Muniz vs. Allen,2023-02-25,https://www.bestfightodds.com/events/ufc-fight...
1070,UFC 285: Jones vs. Gane,2023-03-04,https://www.bestfightodds.com/events/ufc-285-2738


In [20]:
def scrape_odds(event, date, url, timeout=30):
    """Scrape the per-fighter odds rows from one bestfightodds event page.

    Parameters
    ----------
    event : str
        Event name; copied unchanged into every returned row.
    date : str
        Event date; copied unchanged into every returned row.
    url : str
        Address of the event page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    list of list
        One entry per fighter row: ``[event, date, fighter_name, odds...]``.
        Cells without a ``<span>`` are recorded as ``np.nan``.

    Raises
    ------
    IndexError
        If the page contains fewer than two ``odds-table`` tables. The
        message carries the URL and HTTP status so the failing page can be
        identified.
    requests.exceptions.RequestException
        If the download fails or exceeds ``timeout``.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Characters outside ASCII are replaced with "?" before parsing.
    plain_text = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(plain_text, "lxml")
    tables = soup.find_all("table", {"class": "odds-table"})

    # The odds are read from the SECOND odds-table, so one table is not enough.
    if len(tables) < 2:
        raise IndexError(
            f"Expected at least 2 'odds-table' tables but found {len(tables)} "
            f"at {url} (HTTP status {response.status_code})"
        )

    table = tables[1].find("tbody")
    rows = table.find_all("tr", {"class": ""})

    result = []
    for row in rows:
        name = row.find("span", {"class": "t-b-fcc"})
        if not name:
            # Stop at the first row without a fighter-name span.
            break
        res = [event, date, name.text.strip()]
        for cell in row.find_all("td"):
            odds = cell.find("span")
            res.append(odds.text.strip() if odds else np.nan)
        # The last two cells are discarded; presumably they are not
        # bookmaker columns -- TODO confirm against the page layout.
        result.append(res[:-2])

    return result

In [21]:
# Consecutive 100-row slices of the linked events; batch 8 takes everything from row 700 onward
batch1, batch2, batch3, batch4, batch5, batch6, batch7 = (
    odds_links2.iloc[start:start + 100] for start in range(0, 700, 100)
)
batch8 = odds_links2.iloc[700:]

In [22]:
# Batch 1: scrape each event page in the slice and pool the returned rows
moneyline1 = []
for label, event_row in tqdm(batch1.iterrows(), desc="Scraping odds (Batch #1)", total=len(batch1)):
    moneyline1.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #1): 100%|██████████████████████████████████████████████████████| 100/100 [00:27<00:00,  3.68it/s]


In [23]:
# Batch 2: scrape each event page in the slice and pool the returned rows
moneyline2 = []
for label, event_row in tqdm(batch2.iterrows(), desc="Scraping odds (Batch #2)", total=len(batch2)):
    moneyline2.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #2): 100%|██████████████████████████████████████████████████████| 100/100 [00:23<00:00,  4.30it/s]


In [25]:
# Batch 3: scrape each event page in the slice and pool the returned rows
moneyline3 = []
for label, event_row in tqdm(batch3.iterrows(), desc="Scraping odds (Batch #3)", total=len(batch3)):
    moneyline3.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #3): 100%|██████████████████████████████████████████████████████| 100/100 [00:42<00:00,  2.35it/s]


In [26]:
# Batch 4: scrape each event page in the slice and pool the returned rows
moneyline4 = []
for label, event_row in tqdm(batch4.iterrows(), desc="Scraping odds (Batch #4)", total=len(batch4)):
    moneyline4.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #4): 100%|██████████████████████████████████████████████████████| 100/100 [01:38<00:00,  1.01it/s]


In [27]:
# Batch 5: scrape each event page in the slice and pool the returned rows
moneyline5 = []
for label, event_row in tqdm(batch5.iterrows(), desc="Scraping odds (Batch #5)", total=len(batch5)):
    moneyline5.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #5): 100%|██████████████████████████████████████████████████████| 100/100 [01:26<00:00,  1.15it/s]


In [28]:
# Batch 6: scrape each event page in the slice and pool the returned rows
moneyline6 = []
for label, event_row in tqdm(batch6.iterrows(), desc="Scraping odds (Batch #6)", total=len(batch6)):
    moneyline6.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #6): 100%|██████████████████████████████████████████████████████| 100/100 [01:36<00:00,  1.04it/s]


In [36]:
# Batch 7, part 1: the first 25 rows of the batch
batch7_1 = batch7.iloc[:25]
moneyline7_1 = []
for label, event_row in tqdm(batch7_1.iterrows(), desc="Scraping odds (Batch #7_1)", total=len(batch7_1)):
    moneyline7_1.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #7_1): 100%|██████████████████████████████████████████████████████| 25/25 [00:54<00:00,  2.17s/it]


In [37]:
# Batch 7, part 2: rows 25-49 of the batch
batch7_2 = batch7.iloc[25:50]
moneyline7_2 = []
for label, event_row in tqdm(batch7_2.iterrows(), desc="Scraping odds (Batch #7_2)", total=len(batch7_2)):
    moneyline7_2.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #7_2): 100%|██████████████████████████████████████████████████████| 25/25 [01:25<00:00,  3.41s/it]


In [38]:
# Batch 7, part 3: rows 50-74 of the batch
batch7_3 = batch7.iloc[50:75]
moneyline7_3 = []
for label, event_row in tqdm(batch7_3.iterrows(), desc="Scraping odds (Batch #7_3)", total=len(batch7_3)):
    moneyline7_3.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #7_3): 100%|██████████████████████████████████████████████████████| 25/25 [02:13<00:00,  5.34s/it]


In [40]:
# Batch 7, part 4: everything from row 75 of the batch onward
batch7_4 = batch7.iloc[75:]
moneyline7_4 = []
for label, event_row in tqdm(batch7_4.iterrows(), desc="Scraping odds (Batch #7_4)", total=len(batch7_4)):
    # Wait three seconds before every request (presumably to ease the load on the site)
    time.sleep(3)
    moneyline7_4.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #7_4): 100%|██████████████████████████████████████████████████████| 25/25 [02:29<00:00,  5.97s/it]


In [41]:
# Stitch the four parts of batch 7 back together, in order, and show the row count
moneyline7 = [*moneyline7_1, *moneyline7_2, *moneyline7_3, *moneyline7_4]
len(moneyline7)

2294

In [42]:
# Batch 8: the remaining events, scraped with a pause before each request
moneyline8 = []
for label, event_row in tqdm(batch8.iterrows(), desc="Scraping odds (Batch #8)", total=len(batch8)):
    # Wait three seconds before every request (presumably to ease the load on the site)
    time.sleep(3)
    moneyline8.extend(scrape_odds(event_row["event"], event_row["date"], event_row["url"]))

Scraping odds (Batch #8): 100%|████████████████████████████████████████████████████████| 31/31 [03:51<00:00,  7.45s/it]


In [43]:
# Pool the scraped rows from all eight batches, preserving batch order
moneyline = [
    fighter_row
    for batch_rows in (
        moneyline1, moneyline2, moneyline3, moneyline4,
        moneyline5, moneyline6, moneyline7, moneyline8,
    )
    for fighter_row in batch_rows
]
# Three identifying fields followed by the odds columns, in the order scrape_odds emits them
cols = ["event", "date", "name"] + [
    "DraftKings", "BetMGM", "Caesars", "BetRivers", "FanDuel", "PointsBet",
    "Unibet", "Bet365", "BetWay", "5D", "Ref",
]
df = pd.DataFrame(data=moneyline, columns=cols)
df

Unnamed: 0,event,date,name,DraftKings,BetMGM,Caesars,BetRivers,FanDuel,PointsBet,Unibet,Bet365,BetWay,5D,Ref
0,UFC 73: Stacked,2007-07-07,Anderson Silva,,,,,,,,,-145,,
1,UFC 73: Stacked,2007-07-07,Nate Marquardt,,,,,,,,,+125,,
2,UFC 73: Stacked,2007-07-07,Rashad Evans,,,,,,,,,-110,,
3,UFC 73: Stacked,2007-07-07,Tito Ortiz,,,,,,,,,-110,,
4,UFC 73: Stacked,2007-07-07,Hermes Franca,,,,,,,,,+225,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15245,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Jj Aldrich,,-350,-440,-400,-375,,-400,-400,-400,-380,-440
15246,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Bruno Gustavo da Silva,,-190,-175,-195,-180,,-195,-188,-188,-190,-170
15247,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Tyson Nam,,+155,+150,+155,+152,,+155,+163,+150,+165,+145
15248,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Carlston Harris,,-300,-310,-315,-300,,-315,-400,-350,,-300


In [45]:
# Persist the raw scraped odds, with a header row and without the index column
df.to_csv(path_or_buf="data/bestfightodds_raw.csv", header=True, index=False)