-
Notifications
You must be signed in to change notification settings - Fork 1
/
elBarcoTorScraper.py
56 lines (44 loc) · 1.67 KB
/
elBarcoTorScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import asyncio
import sys
import io
from bs4 import BeautifulSoup
from torpy.http.requests import TorRequests
# Import-time side effect: detach the binary buffer behind the current stdout
# and wrap it in a new text stream that encodes everything printed as Latin-1.
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='latin1')
# Prefix that identifies an AceStream link in an <a href="..."> attribute.
_ACESTREAM_PREFIX = "acestream://"

# Link texts that are skipped instead of being treated as channel names.
# "aquÃ" is the literal the original code compared against; "aquí" is what
# that label becomes once mojibake has been repaired.
# NOTE(review): presumably this is a "click here" helper link - confirm.
_SKIPPED_LABELS = ("aquÃ", "aquí")


def _repair_mojibake(text):
    """Undo UTF-8-decoded-as-Latin-1 mojibake in *text*; return it unchanged otherwise."""
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Either the text has characters outside Latin-1 or its bytes are not
        # valid UTF-8: in both cases it was not mojibake, so leave it alone.
        return text


def _fetch_page(url, max_attempts):
    """Fetch *url* through Tor, retrying on failure; return the response or None."""
    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1
        try:
            with TorRequests() as tor_requests:
                with tor_requests.get_session() as sess:
                    response = sess.get(url)
        except Exception as exc:
            # Building a Tor circuit fails often, so keep retrying, but do not
            # swallow KeyboardInterrupt/SystemExit and do report what happened.
            # "!a" keeps the message ASCII, so it is safe on any stdout encoding.
            print(
                f'elBarcoTorScraper : WARNING : attempt {attempt} failed: {exc!a}',
                flush=True,
            )
            continue
        print(response)
        if response is not None:
            return response
    return None


def _extract_channels(html):
    """Return the channel list found in *html* as alternating name / stream-id lines."""
    soup = BeautifulSoup(html, 'html.parser')
    entries = []
    for enlace in soup.find_all('a'):
        acelink = str(enlace.get('href'))
        if not acelink.startswith(_ACESTREAM_PREFIX):
            continue
        canal = _repair_mojibake(enlace.text)
        if canal in _SKIPPED_LABELS:
            continue
        link = acelink.replace(_ACESTREAM_PREFIX, "")
        entries.append(canal + "\n" + link + "\n")
    # It is unclear whether this replace is needed, since the list already
    # seems to be written with plain spaces.
    return "".join(entries).replace('\xa0', ' ').strip()


def scraper(*, max_attempts=None, output_path='PATH TO canales.txt',
            url="https://elcano.top"):
    """Scrape AceStream channels through Tor, save them to a file and return them.

    Every ``<a>`` element whose ``href`` starts with ``acestream://`` adds two
    lines to the result: the link text and the href with that prefix removed.
    The result is written to *output_path* (even when it is empty) and returned.

    Args:
        max_attempts: Maximum number of download attempts. ``None`` (the
            default) retries until a response is obtained.
        output_path: File the channel list is written to.
        url: Page to download.

    Returns:
        The channel list as a string, or ``""`` when no channel was found or
        every download attempt failed. When every attempt failed, the output
        file is left untouched.
    """
    print('elBarcoTorScraper : INFO : scraping with tor...', flush=True)
    grab = _fetch_page(url, max_attempts)
    if grab is None:
        print("elBarcoTorScraper : ERROR : channels could not be retrieved ")
        return ""

    contenido = _extract_channels(grab.text)
    if contenido:
        print("elBarcoTorScraper : INFO : channels retrieved from elBarco")
    else:
        print("elBarcoTorScraper : ERROR : channels could not be retrieved ")

    with open(output_path, "w") as f:
        f.write(contenido)
    return contenido
#scraper()