### 1. Libraries Import
- Request library, which allows to send HTTP requests
- BeautifulSoup library for pulling data out of HTML
- Pandas library to store data as dataframe
- Selenium package is used to automate web browser interaction from Python

In [14]:
# Initial imports
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import re
%matplotlib inline
pd.options.display.max_columns = 999
pd.options.display.max_rows=999

import random
import urllib.request
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

from operator import itemgetter
from selenium import webdriver

### 2. Robots.txt

Before doing any web-scraping, remember to check if it's allowed or not. 
We go to http://www.oddsportal.com/robots.txt and we see that we are good to go! 

Sample of what you see on the robots.txt page:

Sitemap: http://www.oddsportal.com/sitemap.xml

User-agent: *
<br>Disallow: /redirect/
<br>Disallow: /feed/
<br>Disallow: /ajax-widget/
<br>Disallow: /terms/
<br>Disallow: /contact/
<br>Disallow: /privacy-policy/
<br>Disallow: /search/

User-agent: SmartViper
<br>Disallow: /

User-agent: Blekkobot
<br>Disallow: /

User-agent: Baiduspider
<br>Disallow: /

### 3. Reading html from the OddsPortal Page
We want to look through the html code of the website and find relevant links that we want to pull the data from. 

In this case, we want all the historical odds for the World cup (available for 2006, 2010, 2014, 2018) in the 'RESULTS' tab (http://www.oddsportal.com/soccer/world/world-cup-2018/results/) and also the odds for the upcoming world cup 2018 matches in the 'NEXT MATCHES' tab (http://www.oddsportal.com/soccer/world/world-cup-2018/)

In [17]:
# store the url 
url = "http://www.oddsportal.com/soccer/world/world-cup/results/"
browser = webdriver.Chrome()

browser.get(url)
#use beautifulsoup to get the page source of the url
page_content = BeautifulSoup(browser.page_source)

#checked the col-content that consists all the data that we need
data_table = page_content.find(attrs={'id': 'col-content'})


In [19]:
#store all year links into a list for iteration when pulling data
link_list = []
#using regular expression compiler to find the results tab 
for links in data_table.find_all('a', href=re.compile('/results/')):
    year_link = 'http://www.oddsportal.com' + links.get('href')
    link_list.append(year_link)

#remove 2002 and 1998 as there's no data
link_list = list(set(link_list [:-2]))
link_list


['http://www.oddsportal.com/soccer/world/world-cup-2006/results/',
 'http://www.oddsportal.com/soccer/world/world-cup-2014/results/',
 'http://www.oddsportal.com/soccer/world/world-cup-2010/results/',
 'http://www.oddsportal.com/soccer/world/world-cup-2018/results/']

In [25]:
# we see that for each year, there are multiple pages of data ending with'#/page/' + some_page_number
# which we want to append to the end of the links obtained above
nums = list(range(2,19))
pages = ['#/']
for i,j in enumerate(nums):
    pages.insert(i+1, str('#/page/'+ str(j) +'/'))
pages

final_pages = [x+y for x in link_list for y in pages]
print('final links look like: \n', final_pages[:3])

final links look like: 
 ['http://www.oddsportal.com/soccer/world/world-cup-2006/results/#/', 'http://www.oddsportal.com/soccer/world/world-cup-2006/results/#/page/2/', 'http://www.oddsportal.com/soccer/world/world-cup-2006/results/#/page/3/']


### 4. Obtaining group statistics data from the individual World Cup pages

For each link in groups statistics, we want the page_title to extract the year of the world cup (appended to our data), and also the tournamentTable id tag which contains all the odds per game, stored as odds_table. We use a while loop to do this.

In [37]:
append_data = []
counter = 0

while counter < len(final_pages) and len(final_pages) > 0:
    url = final_pages[counter]
    browser.get(url)

    page_content = BeautifulSoup(browser.page_source)
#     print(counter)
    page_title = page_content.title.text
#     print(page_title)

    world_cup = page_title.split(' ')[2] #3rd elem in page_title contains year

    #find odds table
    odds_table = page_content.find('div', attrs={'id':'tournamentTable'})
    
    #participating countries
    all_participants = [participant.get_text().split('-') for participant in odds_table.select('.table-participant')]
    home = []
    away = []
    for i in range(len(all_participants)):
        home.append(all_participants[i][0].strip())
        away.append(all_participants[i][1].strip())

    #get scores of matches if you want this data point
#     score = [score.get_text() for score in odds_table.select('.table-score')]


    # 1, X and 2 odds
    #"B's" column indicates number of bookmakers offering World Cup 2018 betting odds on a specific soccer match. 
    #Columns 1, X and 2 serve for average/biggest World Cup 2018 betting odds offered on home team to win, draw and 
    #away team to win the World Cup 2018 match. 

    all_bets = [bets.get_text() for bets in odds_table.select('.odds-nowrp')]
    ones = []
    cross = []
    twos = []
    for i in range(0,len(all_bets),3):
        ones.append(all_bets[i])
        cross.append(all_bets[i+1])
        twos.append(all_bets[i+2])

    #store all data points in a dataframe for each row   
    year_table = pd.DataFrame({'year':world_cup,
                              'home': home,
                              'away':away,
#                               'score':score,
                              'ones':ones,
                               'cross':cross,
                              'twos':twos})
    #store each tournament table as a list
    append_data.append(year_table)
    counter += 1
    
#convert into dataframe
worldcup_data = pd.concat(append_data, axis=0, ignore_index=True)

0
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
1
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
2
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
3
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
4
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
5
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
6
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
7
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
8
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
9
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
10
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
11
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
12
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
13
World Cup 2006 Results & Historical Odds, Soccer World Archive
2006
14
World Cup 200

In [8]:
#take a look at the dataframe
worldcup_data.head()

Unnamed: 0,away,cross,home,ones,twos,year
0,France,2.81,Italy,2.54,3.24,2006
1,Portugal,3.41,Germany,1.88,4.30,2006
2,France,2.86,Portugal,3.92,2.20,2006
3,Italy,2.89,Germany,2.46,3.27,2006
4,France,3.41,Brazil,1.75,4.99,2006
5,Portugal,2.99,England,2.10,4.00,2006
6,Ukraine,3.48,Italy,1.57,6.94,2006
7,Argentina,3.07,Germany,2.50,2.99,2006
8,France,3.01,Spain,2.37,3.21,2006
9,Ghana,5.41,Brazil,1.27,10.34,2006


In [9]:
# check distribution of data by year
worldcup_data['year'].value_counts()

2010    900
2018    900
2014    900
2006    450
Name: year, dtype: int64

### Some background info about betting odds
A few more examples of decimals to probability:

1.25 = 100/1.25 = 80% <br>
1.90 = 100/1.90 = 52.63%<br>
Not only is it easier to see what chance the bet has as a percentage, but it also gives us the ability to see just how much of a profit the bookmaker is actually making. To do this we need to convert all the odds on an event into percentages and then add them together. Any amount over 100% is the profit that the bookmaker is making on that event.

Taking match odds on a football match as an example, there are three possible results home win, away win or draw. Assuming the game is priced as follows:
 
Home win = 1.75 (57.14%)<br>
Away win = 3.9 (25.64%)<br>
Draw = 3.5 (28.57%)<br>
We want lower odds for 'ones': implies higher probability for home to win 

### 5. Repeat for the 'Next Matches' Table

In [10]:
#for current world cup bets
url = 'http://www.oddsportal.com/soccer/world/world-cup-2018/'
browser.get(url)


page_content = BeautifulSoup(browser.page_source)

page_title = page_content.title.text
# print(page_title)

world_cup = page_title.split(' ')[2] #3rd elem contains year
# print(world_cup)

#find odds table
odds_table = page_content.find(attrs={'id':'tournamentTable'})

odds_table

# #participating countries
all_participants = [participant.get_text().split('-') for participant in odds_table.select('.table-participant')]
home = []
away = []
for i in range(len(all_participants)):
    home.append(all_participants[i][0].strip())
    away.append(all_participants[i][1].strip())

#get scores of matches
# score = [score.get_text() for score in odds_table.select('.table-score')]



# 1, X and 2 odds
#"B's" column indicates number of bookmakers offering World Cup 2018 betting odds on a specific soccer match. 
#Columns 1, X and 2 serve for average/biggest World Cup 2018 betting odds offered on home team to win, draw and 
#away team to win the World Cup 2018 match. 

all_bets = [bets.get_text() for bets in odds_table.select('.odds-nowrp')]
all_bets
ones = []
cross = []
twos = []
for i in range(0,len(all_bets),3):
    ones.append(all_bets[i])
    cross.append(all_bets[i+1])
    twos.append(all_bets[i+2])

year_table = pd.DataFrame({'year':world_cup,
                          'home': home,
                          'away':away,
                          'ones':ones,
                           'cross':cross,
                          'twos':twos})

year_table.head()

World Cup 2018 Betting Odds, Soccer World
2018


Unnamed: 0,away,cross,home,ones,twos,year
0,Tunisia,4.99,Belgium,1.36,11.11,2018
1,Mexico,3.61,South Korea,5.76,1.75,2018
2,Sweden,4.32,Germany,1.57,6.43,2018
3,Panama,6.21,England,1.23,17.41,2018
4,Senegal,3.16,Japan,3.52,2.32,2018
5,Colombia,3.41,Poland,3.34,2.27,2018
6,Egypt,3.59,Saudi Arabia,4.6,1.88,2018
7,Russia,3.08,Uruguay,2.66,3.01,2018
8,Portugal,3.69,Iran,6.01,1.68,2018
9,Morocco,5.02,Spain,1.34,11.37,2018


In [11]:
#some more info about home countries
year_table['home'].unique()

array(['Belgium', 'South Korea', 'Germany', 'England', 'Japan', 'Poland',
       'Saudi Arabia', 'Uruguay', 'Iran', 'Spain', 'Australia', 'Denmark',
       'Iceland', 'Nigeria', 'Mexico', 'Serbia', 'Switzerland', 'Senegal',
       'Panama'], dtype=object)

In [13]:
#save as csv 
worldcup_odds = worldcup_data.append(year_table)
worldcup_odds.to_csv('historical_odds_230618.csv', index=False)

Last updated: 24 June 2018, by Geowynn