Permalink
Browse files

Updated scrapers for 2010 and refactored them

  • Loading branch information...
1 parent fe74d81 commit e82fa96398585914f50a3052abcfa6ab76f9ad37 @dwillis committed Aug 11, 2010
Showing with 81 additions and 55 deletions.
  1. +51 −30 scrapers/games.py
  2. +13 −1 scrapers/main.py
  3. +17 −24 scrapers/teams.py
View
81 scrapers/games.py
@@ -22,12 +22,12 @@ def game_updater(year, teams, week, nostats=False):
>>> game_updater(2010, teams, 12)
"""
if not teams:
- teams = College.objects.filter(updated=True).order_by('id')
+ teams = CollegeYear.objects.filter(season=year, college__updated=True).order_by('id')
games = []
for team in teams:
- url = "http://web1.ncaa.org/football/exec/rankingSummary?org=%s&year=%s&week=%s" % (team.id, year, week)
+ url = "http://web1.ncaa.org/football/exec/rankingSummary?org=%s&year=%s&week=%s" % (team.college.id, year, week)
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
try:
@@ -57,22 +57,24 @@ def game_updater(year, teams, week, nostats=False):
t2 = int(row.findAll('td')[1].find('a')['href'].split('=')[1].split('&')[0])
try:
if t2 == 115: # hack job to cover for ncaa change
- team2 = College.objects.get(id=30416)
+ team2 = CollegeYear.objects.get(college__id=30416, season=year)
elif t2 == 357: # another one like the above - Lincoln Univ. PA
- team2 = College.objects.get(id=30417)
+ team2 = CollegeYear.objects.get(college__id=30417, season=year)
else:
- team2 = College.objects.get(id=t2)
+ team2 = CollegeYear.objects.get(college__id=t2, season=year)
except:
name = row.findAll('td')[1].contents[0].replace("*","").strip().title()
slug = slugify(name)
- team2, created = College.objects.get_or_create(name=name, slug=slug)
+ new_college, created = College.objects.get_or_create(name=name, slug=slug)
+ team2 = CollegeYear.objects.get_or_create(college=new_college, season=year)
except:
- # handle blank rows
- if row.findAll('td')[1].contents == []:
+ if len(row.findAll('td')[1].contents) > 0 and row.findAll('td')[1].contents[0] != '':
+ name = row.findAll('td')[1].contents[0].replace("*","").strip().title()
+ slug = slugify(name)
+ new_college, created = College.objects.get_or_create(name=name, slug=slug)
+ team2, created = CollegeYear.objects.get_or_create(college=new_college, season=year)
+ else:
continue
- name = row.findAll('td')[1].contents[0].replace("*","").strip().title()
- slug = slugify(name)
- team2, created = College.objects.get_or_create(name=name, slug=slug)
print team, team2, date, team1_score, team2_score, t1_result
g, new_game = Game.objects.get_or_create(season=year, team1=team, team2=team2, date=date)
g.team1_score = team1_score
@@ -143,44 +145,63 @@ def load_ncaa_game_xml(game):
try:
print "trying game # %s: %s-%s" % (game.id, soup.teams.home.orgid.contents[0], soup.teams.visitor.orgid.contents[0])
try:
- t1 = College.objects.get(id = int(soup.teams.home.orgid.contents[0]))
+ c1 = College.objects.get(id = int(soup.teams.home.orgid.contents[0]))
+ t1, created = CollegeYear.objects.get_or_create(college=c1, year=game.season)
except College.DoesNotExist:
if soup.teams.home.orgid.contents[0] == '505632':
- t1 = College.objects.get(id=30647)
+ c1 = College.objects.get(id=30647)
+ t1, created = CollegeYear.objects.get_or_create(college=c1, season=game.season)
if soup.teams.visitor.orgid.contents[0] == '506027':
- t2 = College.objects.get(id=30504) # special case for ncaa error on southern oregon
+ c2 = College.objects.get(college__id=30504) # special case for ncaa error on southern oregon
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '505632':
- t2 = College.objects.get(id=30505)
+ c2 = College.objects.get(id=30505)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '506123':
- t2 = College.objects.get(id=30506)
+ c2 = College.objects.get(id=30506)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '500405':
- t2 = College.objects.get(id=30513)
+ c2 = College.objects.get(id=30513)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '30077':
- t2 = College.objects.get(id=1083)
+ c2 = College.objects.get(id=1083)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '506112':
- t2 = College.objects.get(id=30514)
+ c2 = College.objects.get(id=30514)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '501982':
- t2 = College.objects.get(id=30510)
+ c2 = College.objects.get(id=30510)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '505632':
- t2 = College.objects.get(id=30647)
+ c2 = College.objects.get(id=30647)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '506116':
- t2 = College.objects.get(id=30509)
+ c2 = College.objects.get(id=30509)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '506037':
- t2 = College.objects.get(id=30636)
+ c2 = College.objects.get(id=30636)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '506083':
- t2 = College.objects.get(id=30488)
+ c2 = College.objects.get(id=30488)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '506105':
- t2 = College.objects.get(id=30635)
+ c2 = College.objects.get(id=30635)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '505260':
- t2 = College.objects.get(id=30515)
+ c2 = College.objects.get(id=30515)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '504135':
- t2 = College.objects.get(id=30561)
+ c2 = College.objects.get(id=30561)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '501555':
- t2 = College.objects.get(id=30432)
+ c2 = College.objects.get(id=30432)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
elif soup.teams.visitor.orgid.contents[0] == '115':
- t2 = College.objects.get(id=30416)
+ c2 = College.objects.get(id=30416)
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
else:
- t2 = College.objects.get(id = int(soup.teams.visitor.orgid.contents[0]))
+ c2 = College.objects.get(id = int(soup.teams.visitor.orgid.contents[0]))
+ t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
d = strptime(soup.gamedate.contents[0], "%m/%d/%y")
gd = datetime.date(d[0], d[1], d[2])
except:
View
14 scrapers/main.py
@@ -1,5 +1,5 @@
from fumblerooski.college.models import College
-from fumblerooski.utils import update_college_year
+from fumblerooski.utils import *
from fumblerooski.scrapers.games import game_updater
def full_load(year, week):
@@ -25,4 +25,16 @@ def partial_loader(year, id, week):
teams = College.objects.filter(updated=True, id__gte=id).order_by('id')
game_updater(year, teams, week)
+def prepare_new_season(year):
+ add_college_years(year)
+ update_conference_membership(year)
+ game_updater(year, None, 15)
+ create_weeks(year)
+ game_weeks(year)
+ update_conf_games(year)
+ games = Game.objects.filter(season=year, coach1__isnull=True, coach2__isnull=True)
+ for game in games:
+ populate_head_coaches(game)
+
+
View
41 scrapers/teams.py
@@ -7,25 +7,17 @@
import time
from BeautifulSoup import BeautifulSoup
from fumblerooski.college.models import College, Game, CollegeYear, Player, Position
-
-def create_teams(year):
- """
- Scrapes basic team information and creates College instances. Used most often to populate an empty db.
- >>> create_teams(2010)
- """
-
-
def load_skeds(year, teams):
"""
Loads the game schedules for teams for a given year. Defaults to all teams where updated = True,
but can be passed in a selection of teams.
- >>> teams = College.objects.filter(id__IN=(123,345,435))
+ >>> teams = CollegeYear.objects.filter(college__id__IN=(123,345,435))
>>> load_skeds(2009, teams)
"""
if not teams:
- teams = College.objects.filter(updated=True).order_by('id')
+ teams = CollegeYear.objects.filter(college__updated=True).order_by('id')
for team in teams:
url = "http://web1.ncaa.org/football/exec/rankingSummary?year=%s&org=%s" % (year, team.id)
@@ -39,15 +31,17 @@ def load_skeds(year, teams):
try:
t2 = int(row.findAll('td')[2].find('a')['href'].split('=')[1].split('&')[0])
try:
- team2 = College.objects.get(id=t2)
+ team2 = CollegeYear.objects.get(college__id=t2, year=year)
except:
name = row.findAll('td')[2].find('a').contents[0].strip()
slug = row.findAll('td')[2].find('a').contents[0].replace(' ','-').replace(',','').replace('.','').replace(')','').replace('(','').replace("'","").lower().strip()
- team2, created = College.objects.get_or_create(name=name, slug=slug)
+ c2, created = College.objects.get_or_create(name=name, slug=slug)
+ team2, created = CollegeYear.objects.get_or_create(college=c2, year=year)
except:
name = row.findAll('td')[2].contents[0].strip()
slug = row.findAll('td')[2].contents[0].replace(' ','-').replace(',','').replace('.','').replace(')','').replace('(','').lower().strip()
- team2, created = College.objects.get_or_create(name=name, slug=slug)
+ c2, created = College.objects.get_or_create(name=name, slug=slug)
+ team2, created = CollegeYear.objects.get_or_create(college=c2, year=year)
g, new_game = Game.objects.get_or_create(season=year, team1=team, team2=team2, date=date)
if "@" in row.findAll('td')[1].find('a').contents[0]:
g.t1_game_type = 'A'
@@ -74,19 +68,18 @@ def load_team(team_id, year):
and also gets/creates individual Player objects and updates with the number of games played.
>>> load_team(235, 2009)
"""
- team = College.objects.get(id=team_id)
- url = "http://web1.ncaa.org/football/exec/roster?year=%s&org=%s" % (year, team.id)
+ team = CollegeYear.objects.get(college__id=team_id, season=year)
+ url = "http://web1.ncaa.org/football/exec/roster?year=%s&org=%s" % (year, team_id)
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
try:
classes = soup.find("th").contents[0].split(":")[1].split(',') # retrieve class numbers for team
fr, so, jr, sr = [int(c.strip()[0:2]) for c in classes] # assign class numbers
- t, created = CollegeYear.objects.get_or_create(college=team, year=year)
- t.freshmen = fr
- t.sophomores = so
- t.juniors = jr
- t.seniors = sr
- t.save()
+ team.freshmen = fr
+ team.sophomores = so
+ team.juniors = jr
+ team.seniors = sr
+ team.save()
rows = soup.findAll("tr")[5:]
for row in rows:
cells = row.findAll("td")
@@ -98,9 +91,9 @@ def load_team(team_id, year):
pos, created = Position.objects.get_or_create(abbrev=cells[2].contents[0].strip())
cl = cells[3].contents[0].strip()
gp = int(cells[4].contents[0].strip())
- py, created = Player.objects.get_or_create(name=name, slug=name.lower().replace(' ','-').replace('.','').replace("'","-"), team=team, year=year, position=pos, number=unif, status=cl)
+ py, created = Player.objects.get_or_create(name=name, slug=name.lower().replace(' ','-').replace('.','').replace("'","-"), team=team, season=year, position=pos, number=unif, status=cl)
py.games_played=gp
py.save()
except:
- team.updated = False
- team.save()
+ team.college.updated = False
+ team.college.save()

0 comments on commit e82fa96

Please sign in to comment.