-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_extraction_part_1_version_1.1.py
171 lines (129 loc) · 5.03 KB
/
data_extraction_part_1_version_1.1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#-------------------------------------------------------------------------------
# Name: module1
# Purpose: Scrape Premier League match results from 11v11.com into CSV files.
#
# Author: GRSP
#
# Created: 30-05-2013
# Copyright: (c) GRSP 2013
# Licence: <your licence>
#-------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import csv
import collections
import sys
import os
import traceback
import unicodedata
# Extracts match details for all teams for all years.
def fetch_all_teams(soup):
    """Map every team listed on a league page to its matches-tab URL.

    Looks up the ``<ul class="team-links">`` element in ``soup`` and, for
    each ``<li>`` inside it, reads the ``href`` and text of the first
    ``<a>``.

    Returns an ``OrderedDict`` sorted by key, mapping the anchor text to
    ``'http://www.11v11.com' + href + '/tab/matches/'``.
    """
    print('"""""""""""""""""""""""Fetching All Teams""""""""""""""""""""""""""""""')
    links_by_name = {}
    team_menu = soup.find('ul', {"class": "team-links"})
    for item in team_menu.find_all('li'):
        anchor = item.find('a')
        path = anchor['href'].strip()
        # Avoid a doubled slash when the href already ends with one.
        suffix = 'tab/matches/' if path[-1] == "/" else '/tab/matches/'
        links_by_name[anchor.text] = 'http://www.11v11.com' + path + suffix
    return collections.OrderedDict(sorted(links_by_name.items()))
def fetch_all_years(teamsoup):
    """Map every season listed on a team page to that season's link.

    Looks up the ``<ul id="season">`` element in ``teamsoup`` and, for each
    ``<li>`` inside it, reads the ``href`` and text of the first ``<a>``.

    Returns an ``OrderedDict`` sorted by key, mapping the anchor text to
    the ``href`` value exactly as it appears in the page.
    """
    link_by_year = {}
    season_menu = teamsoup.find('ul', {"id": "season"})
    for item in season_menu.find_all('li'):
        anchor = item.find('a')
        href = anchor['href']
        link_by_year[anchor.text] = href
    return collections.OrderedDict(sorted(link_by_year.items()))
def fetch_all_matches(yearsoup):
    """Parse the match table of one season page into a mapping of rows.

    The table is selected with ``find("table", "width580 sortable")``.
    Each row after the first is read cell by cell:

    * cell 0 - date text
    * cell 1 - anchor whose text is the match title and whose ``href`` is
      appended to ``http://www.11v11.com`` to form the match link
    * cell 2 - result, taken from the first ``<span>``
    * cell 3 - score text, split on ``-`` and re-joined as ``"X and Y"``
    * cell 4 - league text

    Returns an ``OrderedDict`` (page order preserved) keyed by
    ``match title + date``; each value is the list
    ``[date, match, result, score, league, matchlink]``. An empty mapping
    is returned when the table is missing.

    A row that cannot be parsed is skipped on its own, so one bad row no
    longer discards every row after it.
    """
    matchlist = collections.OrderedDict()
    table = yearsoup.find("table", "width580 sortable")
    if table is None:
        print('NO MATCHES.fetching match list error.')
        return matchlist

    # Row 0 is skipped, as in the original loop (presumably the header row).
    for row in table.find_all('tr')[1:]:
        try:
            cols = row.find_all('td')
            dt = cols[0].text.strip()
            anchor = cols[1].find('a')
            match = anchor.text.strip()
            matchlink = 'http://www.11v11.com' + anchor['href']
            result = cols[2].find('span').text.strip()
            score = cols[3].text.strip().split("-")
            league = cols[4].text.strip()
            entry = [dt, match, result, score[0] + ' and ' + score[1],
                     league, matchlink]
        except (AttributeError, IndexError, KeyError, TypeError):
            # Missing cell, anchor, span, href or score separator.
            print('Skipping malformed match row.')
            continue
        matchlist[match + dt] = entry
    return matchlist
def _ascii_fold(value):
    """Return ``value`` NFKD-normalised with all non-ASCII characters dropped."""
    if isinstance(value, bytes):
        # Already a byte string; unicodedata.normalize only accepts text.
        return value
    return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')


def main():
    """Scrape every team and season from the league page into CSV files.

    Steps:

    1. Create the ``Results`` directory if it does not exist.
    2. Download ``http://www.11v11.com/premier-league/`` and collect the
       team links with ``fetch_all_teams``.
    3. For every team, download its page and collect the season links
       with ``fetch_all_years``.
    4. For every season (iterated in reverse key order), download the
       page, parse it with ``fetch_all_matches`` and write the rows to
       ``Results/<team>_<year>_All_Matches.csv``.
    5. Also write each match to ``All_Matches.csv`` the first time its
       key is seen.

    Every field is ASCII-folded before writing, so a non-ASCII character
    in any column can no longer abort the run. Files are managed with
    ``with`` so they are closed even if a request or parse step raises.
    """
    seen_matches = set()  # keys already written to All_Matches.csv
    newpath = r'Results'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    header = ['Date', 'Match', 'Result', "Score", 'League', 'MatchLink']

    url = 'http://www.11v11.com/premier-league/'
    html = requests.get(url).text
    soup = BeautifulSoup(html)
    # fetching all teams
    teamsList = fetch_all_teams(soup)

    # Binary mode is what the Python 2 csv module expects for its files.
    with open('All_Matches.csv', "wb") as fp:
        cot = csv.writer(fp)
        cot.writerow(header)
        # fetching all years for a team
        print('"""""""""""""""""""""""""""""Fetching all years""""""""""""""""""""""""""""""""""""')
        for team, teamlink in teamsList.items():
            print("Team:  %s" % team)
            teamsoup = BeautifulSoup(requests.get(teamlink).text)
            yearList = fetch_all_years(teamsoup)
            # fetching matches per year of team
            print('"""""""""""""""""""""""""""""Fetching all matches for year""""""""""""""""""""""""""""""""""""')
            for year in reversed(list(yearList)):
                print("%s  :  %s" % (team, year))
                # os.path.join keeps the file inside Results on every OS.
                year_path = os.path.join(
                    newpath, team + '_' + year + '_All_Matches.csv')
                with open(year_path, "wb") as fp2:
                    cot2 = csv.writer(fp2)
                    cot2.writerow(header)
                    yearsoup = BeautifulSoup(requests.get(yearList[year]).text)
                    matchList = fetch_all_matches(yearsoup)
                    if not matchList:
                        print('No Matches for this year.')
                        continue
                    for match, fields in matchList.items():
                        row = [_ascii_fold(field) for field in fields]
                        if match not in seen_matches:
                            cot.writerow(row)
                            seen_matches.add(match)
                        cot2.writerow(row)
    print('Code Execution Finished.')
# Start the scraper only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()