/
scraping_template.py
85 lines (65 loc) · 2.73 KB
/
scraping_template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from bs4 import BeautifulSoup
import requests
import csv
import copy
"""
***** Traversy Media Scraping Methods NOTES ******
# find_all() and findAll() are equivalent
el = soup.find_all('div')[1] #findAll() does same thing
el = soup.find(id='section-1')
el = soup.find(class_='items')
el = soup.find(attrs={"data-hello": "hi"})
# select - similar to JQuery
el = soup.select('#section-1')[0] # by id; select() always returns a list
el = soup.select('.item')[0] # by class
# get_text()
el = soup.find(class_='item').get_text()
for item in soup.select('.item'):
print(item.get_text())
# Navigation
el = soup.body.contents[1].contents[1].find_next_sibling() # line break is the 0th element
el = soup.find(id='section-2').find_previous_sibling()
el = soup.find(class_='item').find_parent()
el = soup.find('h3').find_next_sibling('p')
================================================================================================
***** SCRAPING ON A TEST FILE (see simple.html) ******
with open('simple.html') as html_file:
soup = BeautifulSoup(html_file, 'lxml')
print(soup.prettify()) # indents
match = soup.title.text
# find_all() returns a list of all tags that match the given element
for article in soup.find_all('div', class_='article'): # _ because class is special keyword
headline = article.h2.a.text
print(headline)
summary = article.p.text
print(summary)
print()
================================================================================================
================================================================================================
******************** OG SCRAPING TEMPLATE **************************
Takes a URL plus scraping parameters and creates a CSV file.
url, file_name (.csv), container_tag, and container_class are strings.
**kwargs are key=value pairs, where each key is a column name and each value
is a string holding a Python expression that is evaluated once per container
`c` to extract the targeted text from the HTML (it should begin with c.find(...)).
"""
def og_scrape(url, file_name, container_class, container_tag='div', **kwargs):
    """Scrape repeated containers from a web page into a CSV file.

    Downloads ``url``, finds every ``container_tag`` element (optionally
    filtered by ``container_class``), evaluates each expression in
    ``kwargs`` once per container, and writes one CSV row per container.
    Every successfully evaluated value is also printed, followed by a
    blank line after each container.

    Args:
        url: Address of the page to download.
        file_name: Path of the CSV file to create; opened in write mode,
            so an existing file is overwritten.
        container_class: CSS class of the container elements, or ``None``
            to match every ``container_tag`` element regardless of class.
        container_tag: Tag name of the container elements.
        **kwargs: Column name -> Python expression string. Each expression
            is evaluated with the current container bound to ``c``
            (e.g. ``"c.find('h2').get_text()"``). An expression that raises
            produces ``None`` for that column, which the csv writer emits
            as an empty cell.

    Returns:
        None. The output is the CSV file written to ``file_name``.
    """
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')

    if container_class is None:
        containers = soup.find_all(container_tag)
    else:
        containers = soup.find_all(container_tag, class_=container_class)

    # newline='' is required by the csv module so that it controls line
    # endings itself; without it, platforms that translate '\n' produce
    # blank rows. The with-block closes the file even if a write fails.
    with open(file_name, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(list(kwargs))  # header row: the column names

        # NOTE: the loop variable must stay named `c` (and the parsed page
        # `soup`) because the eval'd expressions look these names up in
        # this function's local scope.
        for c in containers:
            # kwargs values are expression strings, so a shallow copy is
            # enough to get a fresh per-row mapping in column order.
            targeted_values = dict(kwargs)
            for key, value in targeted_values.items():
                try:
                    # SECURITY: eval() executes arbitrary code. Only pass
                    # expressions from a trusted source to this function.
                    # Evaluate once and reuse the result for both the
                    # console echo and the CSV cell.
                    result = eval(value)
                    print(result)
                    targeted_values[key] = result
                except Exception:
                    # Best-effort scraping: a container missing the
                    # targeted element yields an empty cell, not a crash.
                    targeted_values[key] = None
            print()  # blank line between containers in the console output
            csv_writer.writerow(list(targeted_values.values()))