Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

updated scrapers for 2010 and refactoring

  • Loading branch information...
commit e82fa96398585914f50a3052abcfa6ab76f9ad37 1 parent fe74d81
Derek Willis authored August 10, 2010
81  scrapers/games.py
@@ -22,12 +22,12 @@ def game_updater(year, teams, week, nostats=False):
22 22
     >>> game_updater(2010, teams, 12)
23 23
     """
24 24
     if not teams:
25  
-        teams = College.objects.filter(updated=True).order_by('id')
  25
+        teams = CollegeYear.objects.filter(season=year, college__updated=True).order_by('id')
26 26
     
27 27
     games = []
28 28
     
29 29
     for team in teams:
30  
-        url = "http://web1.ncaa.org/football/exec/rankingSummary?org=%s&year=%s&week=%s" % (team.id, year, week)
  30
+        url = "http://web1.ncaa.org/football/exec/rankingSummary?org=%s&year=%s&week=%s" % (team.college.id, year, week)
31 31
         html = urllib.urlopen(url).read()
32 32
         soup = BeautifulSoup(html)
33 33
         try:
@@ -57,22 +57,24 @@ def game_updater(year, teams, week, nostats=False):
57 57
                     t2 = int(row.findAll('td')[1].find('a')['href'].split('=')[1].split('&')[0])
58 58
                     try:
59 59
                         if t2 == 115:   # hack job to cover for ncaa change
60  
-                            team2 = College.objects.get(id=30416)
  60
+                            team2 = CollegeYear.objects.get(college__id=30416, season=year)
61 61
                         elif t2 == 357: # another one like the above - Lincoln Univ. PA
62  
-                            team2 = College.objects.get(id=30417)
  62
+                            team2 = CollegeYear.objects.get(college__id=30417, season=year)
63 63
                         else:
64  
-                            team2 = College.objects.get(id=t2)
  64
+                            team2 = CollegeYear.objects.get(college__id=t2, season=year)
65 65
                     except:
66 66
                         name = row.findAll('td')[1].contents[0].replace("*","").strip().title()
67 67
                         slug = slugify(name)
68  
-                        team2, created = College.objects.get_or_create(name=name, slug=slug)
  68
+                        new_college, created = College.objects.get_or_create(name=name, slug=slug)
  69
+                        team2 = CollegeYear.objects.get_or_create(college=new_college, season=year)
69 70
                 except:
70  
-                    # handle blank rows
71  
-                    if row.findAll('td')[1].contents == []:
  71
+                    if len(row.findAll('td')[1].contents) > 0 and row.findAll('td')[1].contents[0] != '':
  72
+                        name = row.findAll('td')[1].contents[0].replace("*","").strip().title()
  73
+                        slug = slugify(name)
  74
+                        new_college, created = College.objects.get_or_create(name=name, slug=slug)
  75
+                        team2, created = CollegeYear.objects.get_or_create(college=new_college, season=year)
  76
+                    else:
72 77
                         continue
73  
-                    name = row.findAll('td')[1].contents[0].replace("*","").strip().title()
74  
-                    slug = slugify(name)
75  
-                    team2, created = College.objects.get_or_create(name=name, slug=slug)
76 78
                 print team, team2, date, team1_score, team2_score, t1_result
77 79
                 g, new_game = Game.objects.get_or_create(season=year, team1=team, team2=team2, date=date)
78 80
                 g.team1_score = team1_score
@@ -143,44 +145,63 @@ def load_ncaa_game_xml(game):
143 145
     try:
144 146
         print "trying game # %s: %s-%s" % (game.id, soup.teams.home.orgid.contents[0], soup.teams.visitor.orgid.contents[0])
145 147
         try:
146  
-            t1 = College.objects.get(id = int(soup.teams.home.orgid.contents[0]))
  148
+            c1 = College.objects.get(id = int(soup.teams.home.orgid.contents[0]))
  149
+            t1, created = CollegeYear.objects.get_or_create(college=c1, year=game.season)
147 150
         except College.DoesNotExist:
148 151
             if soup.teams.home.orgid.contents[0] == '505632':
149  
-                t1 = College.objects.get(id=30647)
  152
+                c1 = College.objects.get(id=30647)
  153
+                t1, created = CollegeYear.objects.get_or_create(college=c1, season=game.season)
150 154
         if soup.teams.visitor.orgid.contents[0] == '506027':
151  
-            t2 = College.objects.get(id=30504) # special case for ncaa error on southern oregon
  155
+            c2 = College.objects.get(college__id=30504) # special case for ncaa error on southern oregon
  156
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
152 157
         elif soup.teams.visitor.orgid.contents[0] == '505632':
153  
-            t2 = College.objects.get(id=30505)
  158
+            c2 = College.objects.get(id=30505)
  159
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
154 160
         elif soup.teams.visitor.orgid.contents[0] == '506123':
155  
-            t2 = College.objects.get(id=30506)
  161
+            c2 = College.objects.get(id=30506)
  162
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
156 163
         elif soup.teams.visitor.orgid.contents[0] == '500405':
157  
-            t2 = College.objects.get(id=30513)
  164
+            c2 = College.objects.get(id=30513)
  165
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
158 166
         elif soup.teams.visitor.orgid.contents[0] == '30077':
159  
-            t2 = College.objects.get(id=1083)
  167
+            c2 = College.objects.get(id=1083)
  168
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
160 169
         elif soup.teams.visitor.orgid.contents[0] == '506112':
161  
-            t2 = College.objects.get(id=30514)
  170
+            c2 = College.objects.get(id=30514)
  171
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
162 172
         elif soup.teams.visitor.orgid.contents[0] == '501982':
163  
-            t2 = College.objects.get(id=30510)
  173
+            c2 = College.objects.get(id=30510)
  174
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
164 175
         elif soup.teams.visitor.orgid.contents[0] == '505632':
165  
-            t2 = College.objects.get(id=30647)
  176
+            c2 = College.objects.get(id=30647)
  177
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
166 178
         elif soup.teams.visitor.orgid.contents[0] == '506116':
167  
-            t2 = College.objects.get(id=30509)
  179
+            c2 = College.objects.get(id=30509)
  180
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
168 181
         elif soup.teams.visitor.orgid.contents[0] == '506037':
169  
-            t2 = College.objects.get(id=30636)
  182
+            c2 = College.objects.get(id=30636)
  183
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
170 184
         elif soup.teams.visitor.orgid.contents[0] == '506083':
171  
-            t2 = College.objects.get(id=30488)
  185
+            c2 = College.objects.get(id=30488)
  186
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
172 187
         elif soup.teams.visitor.orgid.contents[0] == '506105':
173  
-            t2 = College.objects.get(id=30635)
  188
+            c2 = College.objects.get(id=30635)
  189
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
174 190
         elif soup.teams.visitor.orgid.contents[0] == '505260':
175  
-            t2 = College.objects.get(id=30515)
  191
+            c2 = College.objects.get(id=30515)
  192
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
176 193
         elif soup.teams.visitor.orgid.contents[0] == '504135':
177  
-            t2 = College.objects.get(id=30561)
  194
+            c2 = College.objects.get(id=30561)
  195
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
178 196
         elif soup.teams.visitor.orgid.contents[0] == '501555':
179  
-            t2 = College.objects.get(id=30432)
  197
+            c2 = College.objects.get(id=30432)
  198
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
180 199
         elif soup.teams.visitor.orgid.contents[0] == '115':
181  
-            t2 = College.objects.get(id=30416)
  200
+            c2 = College.objects.get(id=30416)
  201
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
182 202
         else:
183  
-            t2 = College.objects.get(id = int(soup.teams.visitor.orgid.contents[0]))
  203
+            c2 = College.objects.get(id = int(soup.teams.visitor.orgid.contents[0]))
  204
+            t2, created = CollegeYear.objects.get_or_create(college=c2, season=game.season)
184 205
         d = strptime(soup.gamedate.contents[0], "%m/%d/%y")
185 206
         gd = datetime.date(d[0], d[1], d[2])
186 207
     except:
14  scrapers/main.py
... ...
@@ -1,5 +1,5 @@
1 1
 from fumblerooski.college.models import College
2  
-from fumblerooski.utils import update_college_year
  2
+from fumblerooski.utils import *
3 3
 from fumblerooski.scrapers.games import game_updater
4 4
 
5 5
 def full_load(year, week):
@@ -25,4 +25,16 @@ def partial_loader(year, id, week):
25 25
     teams = College.objects.filter(updated=True, id__gte=id).order_by('id')
26 26
     game_updater(year, teams, week)
27 27
 
  28
+def prepare_new_season(year):
  29
+    add_college_years(year)
  30
+    update_conference_membership(year)
  31
+    game_updater(year, None, 15)
  32
+    create_weeks(year)
  33
+    game_weeks(year)
  34
+    update_conf_games(year)
  35
+    games = Game.objects.filter(season=year, coach1__isnull=True, coach2__isnull=True)
  36
+    for game in games:
  37
+        populate_head_coaches(game)
  38
+
  39
+
28 40
 
41  scrapers/teams.py
@@ -7,25 +7,17 @@
7 7
 import time
8 8
 from BeautifulSoup import BeautifulSoup
9 9
 from fumblerooski.college.models import College, Game, CollegeYear, Player, Position
10  
-
11  
-def create_teams(year):
12  
-    """
13  
-    Scrapes basic team information and creates College instances. Used most often to populate an empty db.
14  
-    >>> create_teams(2010)
15  
-    """
16  
-    
17  
-    
18 10
     
19 11
 
20 12
 def load_skeds(year, teams):
21 13
     """
22 14
     Loads the game schedules for teams for a given year. Defaults to all teams where updated = True,
23 15
     but can be passed in a selection of teams.
24  
-    >>> teams = College.objects.filter(id__IN=(123,345,435))
  16
+    >>> teams = CollegeYear.objects.filter(college__id__in=(123,345,435))
25 17
     >>> load_skeds(2009, teams)
26 18
     """
27 19
     if not teams:
28  
-        teams = College.objects.filter(updated=True).order_by('id')
  20
+        teams = CollegeYear.objects.filter(college__updated=True).order_by('id')
29 21
     
30 22
     for team in teams:
31 23
         url = "http://web1.ncaa.org/football/exec/rankingSummary?year=%s&org=%s" % (year, team.id)
@@ -39,15 +31,17 @@ def load_skeds(year, teams):
39 31
             try:
40 32
                 t2 = int(row.findAll('td')[2].find('a')['href'].split('=')[1].split('&')[0])
41 33
                 try:
42  
-                    team2 = College.objects.get(id=t2)
  34
+                    team2 = CollegeYear.objects.get(college__id=t2, year=year)
43 35
                 except:
44 36
                     name = row.findAll('td')[2].find('a').contents[0].strip()
45 37
                     slug = row.findAll('td')[2].find('a').contents[0].replace(' ','-').replace(',','').replace('.','').replace(')','').replace('(','').replace("'","").lower().strip()
46  
-                    team2, created = College.objects.get_or_create(name=name, slug=slug)
  38
+                    c2, created = College.objects.get_or_create(name=name, slug=slug)
  39
+                    team2, created = CollegeYear.objects.get_or_create(college=c2, year=year)
47 40
             except:
48 41
                 name = row.findAll('td')[2].contents[0].strip()
49 42
                 slug = row.findAll('td')[2].contents[0].replace(' ','-').replace(',','').replace('.','').replace(')','').replace('(','').lower().strip()
50  
-                team2, created = College.objects.get_or_create(name=name, slug=slug)
  43
+                c2, created = College.objects.get_or_create(name=name, slug=slug)
  44
+                team2, created = CollegeYear.objects.get_or_create(college=c2, year=year)
51 45
             g, new_game = Game.objects.get_or_create(season=year, team1=team, team2=team2, date=date)
52 46
             if "@" in row.findAll('td')[1].find('a').contents[0]:
53 47
                 g.t1_game_type = 'A'
@@ -74,19 +68,18 @@ def load_team(team_id, year):
74 68
     and also gets/creates individual Player objects and updates with the number of games played.
75 69
     >>> load_team(235, 2009)
76 70
     """
77  
-    team = College.objects.get(id=team_id)
78  
-    url = "http://web1.ncaa.org/football/exec/roster?year=%s&org=%s" % (year, team.id)
  71
+    team = CollegeYear.objects.get(college__id=team_id, season=year)
  72
+    url = "http://web1.ncaa.org/football/exec/roster?year=%s&org=%s" % (year, team_id)
79 73
     html = urllib.urlopen(url).read()
80 74
     soup = BeautifulSoup(html)
81 75
     try:
82 76
         classes = soup.find("th").contents[0].split(":")[1].split(',') # retrieve class numbers for team
83 77
         fr, so, jr, sr = [int(c.strip()[0:2]) for c in classes] # assign class numbers
84  
-        t, created = CollegeYear.objects.get_or_create(college=team, year=year)
85  
-        t.freshmen = fr
86  
-        t.sophomores = so
87  
-        t.juniors = jr
88  
-        t.seniors = sr
89  
-        t.save()
  78
+        team.freshmen = fr
  79
+        team.sophomores = so
  80
+        team.juniors = jr
  81
+        team.seniors = sr
  82
+        team.save()
90 83
         rows = soup.findAll("tr")[5:]
91 84
         for row in rows:
92 85
             cells = row.findAll("td")
@@ -98,9 +91,9 @@ def load_team(team_id, year):
98 91
                 pos, created = Position.objects.get_or_create(abbrev=cells[2].contents[0].strip())
99 92
             cl = cells[3].contents[0].strip()
100 93
             gp = int(cells[4].contents[0].strip())
101  
-            py, created = Player.objects.get_or_create(name=name, slug=name.lower().replace(' ','-').replace('.','').replace("'","-"), team=team, year=year, position=pos, number=unif, status=cl)
  94
+            py, created = Player.objects.get_or_create(name=name, slug=name.lower().replace(' ','-').replace('.','').replace("'","-"), team=team, season=year, position=pos, number=unif, status=cl)
102 95
             py.games_played=gp
103 96
             py.save()
104 97
     except:
105  
-        team.updated = False
106  
-        team.save()
  98
+        team.college.updated = False
  99
+        team.college.save()

0 notes on commit e82fa96

Please sign in to comment.
Something went wrong with that request. Please try again.