## Topics of hateful articles

En esta notebook veremos los tópicos de los artículos que generan cierto odio

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from mongoengine import connect
from hatespeech_models import Tweet, Article

client = connect("hatespeech-labelling")
db = client["hatespeech-labelling"]

In [2]:

initial_query = {
    "comments__19__exists": True,
}
articles = Article.objects(**initial_query).as_pymongo()
articles = list(articles)

print(len(articles))


6669


In [3]:
for article in articles:
    hateful_comments = [c for c in article["comments"] if c["hateful_value"] > 0.5]
    
    article["num_hateful_comments"] = len(hateful_comments)
    article["avg_hateful_comments"] = len(hateful_comments) / len(article["comments"])
    article["avg_hate_value"] = sum(c["hateful_value"] for c in article["comments"]) / len(article["comments"])


In [4]:
hateful_articles = [art for art in articles if art["avg_hateful_comments"] > 0.13]
len(hateful_articles)

3520

In [5]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('spanish')
" - ".join(stop_words)

'de - la - que - el - en - y - a - los - del - se - las - por - un - para - con - no - una - su - al - lo - como - más - pero - sus - le - ya - o - este - sí - porque - esta - entre - cuando - muy - sin - sobre - también - me - hasta - hay - donde - quien - desde - todo - nos - durante - todos - uno - les - ni - contra - otros - ese - eso - ante - ellos - e - esto - mí - antes - algunos - qué - unos - yo - otro - otras - otra - él - tanto - esa - estos - mucho - quienes - nada - muchos - cual - poco - ella - estar - estas - algunas - algo - nosotros - mi - mis - tú - te - ti - tu - tus - ellas - nosotras - vosotros - vosotras - os - mío - mía - míos - mías - tuyo - tuya - tuyos - tuyas - suyo - suya - suyos - suyas - nuestro - nuestra - nuestros - nuestras - vuestro - vuestra - vuestros - vuestras - esos - esas - estoy - estás - está - estamos - estáis - están - esté - estés - estemos - estéis - estén - estaré - estarás - estará - estaremos - estaréis - estarán - estaría - estarías - e

## Tokenizar

In [6]:
import spacy

nlp = spacy.load("es", disable=["parser", "ner"])

l = list(nlp("Hola viejas de re mil mierdas, las odio profundamente desde mi corazón"))

lemmatized = [t.lemma_ for t in l]

In [7]:
from tqdm.auto import tqdm

lemmatized_data = []

for art in tqdm(hateful_articles):
    lemmatized_data.append([t.lemma_.lower() for t in nlp(art["body"])])


HBox(children=(FloatProgress(value=0.0, max=3520.0), HTML(value='')))




In [8]:
import string

def filter_words(text):
    non_stop = [tok for tok in text if tok not in stop_words]
    non_punct = [tok for tok in text if tok not in string.punctuation and "\n" not in tok]
    return non_punct

filtered_data = []

for text in tqdm(lemmatized_data):
    filtered_data.append(filter_words(text))

HBox(children=(FloatProgress(value=0.0, max=3520.0), HTML(value='')))




In [9]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(filtered_data)

# Create Corpus
texts = filtered_data

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 2), (15, 2), (16, 2), (17, 1), (18, 3), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 10), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 4), (46, 4), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 1), (63, 5), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 4), (71, 1), (72, 12), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 14), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 2), (99, 1), (100, 1), (101, 4), (102, 48), (103, 1), (104, 5), (105, 1), (106, 1), (107, 1), (108, 8), (109, 1), (110

In [None]:
import gensim

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [61]:
from pprint import pprint

pprint(lda_model.print_topics())

[(6,
  '0.000*"teleconferencia-" + 0.000*"compulsar" + 0.000*"xiaowei" + '
  '0.000*"zhaoxu" + 0.000*"pelotero" + 0.000*"salta.-" + 0.000*"11:59" + '
  '0.000*"hiperdesarrollo" + 0.000*"poder-" + 0.000*"prodemocracia"'),
 (0,
  '0.000*"teleconferencia-" + 0.000*"compulsar" + 0.000*"xiaowei" + '
  '0.000*"zhaoxu" + 0.000*"pelotero" + 0.000*"salta.-" + 0.000*"11:59" + '
  '0.000*"hiperdesarrollo" + 0.000*"poder-" + 0.000*"prodemocracia"'),
 (29,
  '0.000*"teleconferencia-" + 0.000*"compulsar" + 0.000*"xiaowei" + '
  '0.000*"zhaoxu" + 0.000*"pelotero" + 0.000*"salta.-" + 0.000*"11:59" + '
  '0.000*"hiperdesarrollo" + 0.000*"poder-" + 0.000*"prodemocracia"'),
 (10,
  '0.000*"teleconferencia-" + 0.000*"compulsar" + 0.000*"xiaowei" + '
  '0.000*"zhaoxu" + 0.000*"pelotero" + 0.000*"salta.-" + 0.000*"11:59" + '
  '0.000*"hiperdesarrollo" + 0.000*"poder-" + 0.000*"prodemocracia"'),
 (13,
  '0.000*"teleconferencia-" + 0.000*"compulsar" + 0.000*"xiaowei" + '
  '0.000*"zhaoxu" + 0.000*"pelotero" +

In [56]:
# Print the Keyword in the 10 topics

doc_lda = lda_model[corpus]

[(0, '0.000*"teleconferencia-" + 0.000*"compulsar" + 0.000*"xiaowei" + 0.000*"zhaoxu" + 0.000*"pelotero" + 0.000*"salta.-" + 0.000*"11:59" + 0.000*"hiperdesarrollo" + 0.000*"poder-" + 0.000*"prodemocracia"'), (1, '0.076*"de" + 0.046*"lo" + 0.039*"el" + 0.039*"en" + 0.019*"que" + 0.016*"país" + 0.015*"haber" + 0.015*"uno" + 0.014*"estados" + 0.013*"unidos"'), (2, '0.051*"de" + 0.050*"su" + 0.034*"lo" + 0.033*"a" + 0.029*"en" + 0.029*"y" + 0.028*"el" + 0.019*"me" + 0.018*"uno" + 0.018*"con"'), (3, '0.134*"tinelli" + 0.088*"marcelo" + 0.031*"guillermina" + 0.031*"cande" + 0.023*"candelaria" + 0.022*"valdés" + 0.018*"esquel" + 0.017*"showmatch" + 0.009*"valdes" + 0.006*"cordillera"'), (4, '0.125*"de" + 0.075*"lo" + 0.043*"en" + 0.039*"el" + 0.028*"y" + 0.021*"del" + 0.018*"los" + 0.018*"a" + 0.018*"se" + 0.016*"la"'), (5, '0.077*"de" + 0.064*"lo" + 0.036*"el" + 0.030*"a" + 0.028*"en" + 0.024*"que" + 0.019*"por" + 0.019*"uno" + 0.015*"y" + 0.014*"se"'), (6, '0.000*"teleconferencia-" + 0.000

In [69]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
vis



TypeError: Object of type 'complex' is not JSON serializable

PreparedData(topic_coordinates=                        x                   y  topics  cluster       Freq
topic                                                                    
22    -0.395789+0.000000j  0.030037+0.000000j       1        1  18.877346
12    -0.398210+0.000000j  0.004294+0.000000j       2        1  15.963403
25    -0.381677+0.000000j  0.023587+0.000000j       3        1  14.850221
21    -0.395913+0.000000j  0.013633+0.000000j       4        1  13.810974
9     -0.400902+0.000000j  0.004682+0.000000j       5        1   9.255079
2     -0.388636+0.000000j  0.033003+0.000000j       6        1   7.913988
26    -0.385497+0.000000j  0.038253+0.000000j       7        1   7.448424
4     -0.351758+0.000000j -0.006329+0.000000j       8        1   3.944589
8     -0.349049+0.000000j  0.016838+0.000000j       9        1   3.796279
5     -0.336327+0.000000j -0.015059+0.000000j      10        1   2.945794
18     0.015992+0.000000j -0.433817+0.000000j      11        1   0.488607
1      

In [68]:
pyLDAvis.display(vis)

TypeError: Object of type 'complex' is not JSON serializable