#!/usr/bin/env python
from bs4 import BeautifulSoup
from dateutil import parser, rrule
from urllib2 import urlopen
import sqlite3
import re
import datetime
# SQLite database that receives the scraped fixtures.
con = sqlite3.connect('fixtures.db')

# League names to scrape; each one is passed to scrape_fixtures() as its
# `league` argument and stored in the `league` column.
leagues = [
    'premier-league',
    'spanish-liga',
    'german-bundesliga',
    'italian-serie-a',
]
# TODO implement using beautifulsoup instead of regex
# Patterns are applied to the HTML text of a single fixture row, in the
# order: kick off, home team, away team, venue.
_FIXTURE_FIELD_PATTERNS = (
    re.compile(r'<abbr.+>(.+)</abbr>'),
    re.compile(r'<span class="home">(.+)</span>'),
    re.compile(r'<span class="away">(.+)</span>'),
    re.compile(r'<td class="venue.+">(.+)</td>'),
)


def _extract_fixture(row_html):
    """Return (kick_off_text, home, away, venue) for one row, or None if any field is missing."""
    fields = []
    for pattern in _FIXTURE_FIELD_PATTERNS:
        found = pattern.search(row_html)
        if found is None:
            return None
        fields.append(found.group(1))
    return tuple(fields)


def scrape_fixtures(month, year, league, url_template=''):
    """Scrape one month of football fixtures from the Eurosport website.

    Args:
        month: Month to retrieve the fixtures for, as an integer of at
            most two digits (e.g. 1 for January).
        year: Year to retrieve the fixtures for, as a four-digit integer
            (e.g. 2012).
        league: League to scrape fixtures from, as a string.
        url_template: %-style format string for the fixtures page. It is
            filled with ``(league, year, month)`` in that order. The
            default is empty, so a template has to be supplied before
            anything can be fetched.

    Returns:
        A dictionary of lists with the keys 'kick off', 'home team',
        'away team' and 'venue'. The same index in every list describes
        the same match. All lists are empty if the page cannot be opened.

    Raises:
        ValueError: If no URL template is available.
    """
    # NOTE(review): the hard-coded URL pattern is blank in this file, and
    # formatting three values into '' fails with an opaque TypeError.
    # Fail with a clear message until the real pattern is restored here or
    # passed in by the caller.
    if not url_template:
        raise ValueError('No fixtures URL template available for %s (%s-%s)'
                         % (league, year, month))

    matches_for_month = {'kick off': [],
                         'home team': [],
                         'away team': [],
                         'venue': []}

    # This is kind of janky, and doesn't work for competitions with group stages
    url = url_template % (league, year, month)
    try:
        page = urlopen(url)
    except IOError:
        # urllib2's URLError and HTTPError are IOError subclasses. Hand
        # back the empty result so callers can still index it.
        print('Error no url %s' % url)
        return matches_for_month
    try:
        soup = BeautifulSoup(page)
    finally:
        page.close()

    # One 'fixtures-results' table per match day, one 'fixture' row per match.
    for table in soup.findAll('table', 'fixtures-results'):
        for row in table.findAll('tr', {'class': 'fixture'}):
            fixture = _extract_fixture(str(row))
            if fixture is None:
                # Pull every field before appending anything so the four
                # lists can never drift out of step.
                print('Skipping fixture row with missing fields at %s' % url)
                continue
            kick_off_text, home_team, away_team, venue = fixture
            matches_for_month['kick off'].append(
                parser.parse(str(year) + ' ' + kick_off_text))
            matches_for_month['home team'].append(home_team)
            matches_for_month['away team'].append(away_team)
            matches_for_month['venue'].append(venue)
    return matches_for_month
# Months covered by the import: November 2012 through June 2013.
# Plain decimal literals are used because a leading zero (01, 06) makes
# an octal literal in Python 2.
season_months = list(rrule.rrule(rrule.MONTHLY,
                                 dtstart=datetime.datetime(2012, 11, 1),
                                 until=datetime.datetime(2013, 6, 30)))

# Rebuild fixtures_tbl from scratch and load every scraped fixture into it.
# Using the connection as a context manager commits when the block succeeds
# and rolls back if an exception escapes.
with con:
    cursor = con.cursor()
    cursor.execute('DROP TABLE IF EXISTS fixtures_tbl;')
    cursor.execute("""CREATE TABLE fixtures_tbl (
                      kick_off TEXT NOT NULL,
                      home_team VARCHAR(30) NOT NULL,
                      away_team VARCHAR(30) NOT NULL,
                      venue VARCHAR(100) NOT NULL,
                      league VARCHAR(30) NOT NULL,
                      played INTEGER NOT NULL);""")
    for league in leagues:
        league_name = unicode(league, 'UTF_8')
        for dt in season_months:
            matches = scrape_fixtures(dt.month, dt.year, league)
            if not matches:
                # Nothing came back for this month (e.g. the page could
                # not be fetched); move on instead of indexing into None.
                continue
            # Scraped values are byte strings; decode them before storing.
            # Every row is inserted with played = 0.
            fixture_rows = [
                (unicode(str(kick_off), 'UTF_8'),
                 unicode(home_team, 'UTF_8'),
                 unicode(away_team, 'UTF_8'),
                 unicode(venue, 'UTF_8'),
                 league_name)
                for kick_off, home_team, away_team, venue in zip(
                    matches['kick off'],
                    matches['home team'],
                    matches['away team'],
                    matches['venue'])
            ]
            cursor.executemany("""INSERT INTO fixtures_tbl (kick_off, home_team, away_team, venue, league, played)
                               VALUES(?, ?, ?, ?, ?, 0);""", fixture_rows)
# Printed once, after the whole import has finished.
print('Janky as fuck')
