-
Notifications
You must be signed in to change notification settings - Fork 2
/
polish_sentence.py
21 lines (17 loc) · 1.04 KB
/
polish_sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# -*- coding: utf-8 -*-
import re

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3 moved the module to html.parser; keep the same name bound.
    import html.parser as HTMLParser
# Ordered (compiled pattern, replacement) clean-up steps used by
# polish_sentence.  Order matters: markup is removed before punctuation is
# blanked, and the space collapse runs last so no earlier step can leave
# runs of spaces behind.
_POLISH_STEPS = (
    # Join lines without inserting a separator.
    (re.compile(r'\n'), ''),
    # A tag immediately followed by the literal letters "nt".
    # NOTE(review): presumably residue of an escaped "\n\t" in the input
    # data -- confirm against the data source before changing.
    (re.compile(r'<[^>]*>nt'), ''),
    # Any remaining HTML/XML tag.
    (re.compile(r'<[^>]*>'), ''),
    # "[...embed:...]" and "[video:...]" shortcodes.  "[^\]]*" stops at the
    # first closing bracket; a greedy ".*" would swallow all text up to the
    # last "]" in the sentence.
    (re.compile(r'\[[a-z_]*embed:[^\]]*\]'), ''),
    (re.compile(r'\[video:[^\]]*\]'), ''),
    # Punctuation becomes a space: . [ ] ? , ( ) ! " ' / : -
    (re.compile(r'[.\[\]?,()!"\'/:\-]'), ' '),
    # "%" followed by a digit and one of [a-zA-Z0-9-] becomes a space.
    (re.compile(r'%[0-9][a-zA-Z0-9-]'), ' '),
    # Collapse runs of spaces into a single space.
    (re.compile(r' +'), ' '),
)


def polish_sentence(sentence):
    """Strip markup and punctuation from a sentence.

    HTML entities are unescaped first, so entity-escaped tags such as
    ``&lt;b&gt;`` are removed along with literal tags.  Leading and
    trailing spaces are not stripped.

    Args:
        sentence: The text to clean, either UTF-8 encoded bytes or an
            already decoded text string.

    Returns:
        The cleaned text as a text (unicode) string.

    Raises:
        UnicodeDecodeError: If ``sentence`` is bytes that are not valid
            UTF-8.
    """
    try:
        from html import unescape  # Python 3.4+
    except ImportError:
        # Python 2 has no html.unescape; use the HTMLParser method instead.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    # On Python 2 ``bytes`` is ``str``, so byte strings are decoded there
    # too, while text that is already decoded passes through unchanged.
    if isinstance(sentence, bytes):
        sentence = sentence.decode("utf-8")

    sentence = unescape(sentence)
    for pattern, replacement in _POLISH_STEPS:
        sentence = pattern.sub(replacement, sentence)
    return sentence
# Demo: clean a sample news snippet and print the result.  Guarded so that
# importing this module does not print anything, and the sample is bound to
# a name that does not shadow the builtin ``str``.
if __name__ == "__main__":
    sample = (
        '<p>ntEl DT de '
        '<a href="http://www.bocajuniors.com.ar/home/sitio">Boca</a>, '
        '<a href="http://www.tn.com.ar/tags/julio-falcioni">Julio César</a>'
        '<strong><a href="http://www.tn.com.ar/tags/julio-falcioni"> Falcioni</a>, '
        'aseguró que si '
        '<a href="http://www.tn.com.ar/tags/juan-roman-riquelme">Román Riquelme</a> '
        '"está bien físicamente, va a ser fundamental" </strong>'
        'para el equipo.</p>'
    )
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(polish_sentence(sample))