# Final exercise

1. Load the data from r/AmItheAsshole
2. Create a dictionary with the id as key and the text as value of each submission (the first 500)
3. Get the entities of each text in a dictionary with the submission id as key and the entities in a list as value
4. Create a dictionary with the id as key and the sentiment analysis (using NaiveBayesAnalyzer) as value (first 50)
5. Apply gensim's LDA topic model
6. Check the TF-IDF value of a few words in a few texts

### Solution

In [None]:
import json
import spacy

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

from spacytextblob.spacytextblob import SpacyTextBlob 
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

from collections import Counter
import numpy as np

##### Loading the data

In [None]:
# Read the whole JSON dump of submissions into memory.
with open('data/amitheasshole_corpus.json', 'r', encoding='UTF-8') as corpus_file:
    data = json.loads(corpus_file.read())

In [None]:
# Inspect the first record to see which fields a submission carries.
print(data[0])

##### Creating the dictionary with the corpus

In [None]:
# Map each submission id to its text, keeping only the first 500 submissions.
# Newlines are replaced by spaces so every text is a single line.
corpus = {
    submission['id']: submission['selftext'].replace('\n', ' ')
    for submission in data[:500]
}

##### Extracting the entities

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; the same `nlp` object is reused by
# the entity extraction and tokenization steps below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# For every submission id, collect the named entities spaCy finds in its text
# as a list of (entity text, entity label) tuples.
entities = {
    sub_id: [(span.text, span.label_) for span in nlp(body).ents]
    for sub_id, body in corpus.items()
}

In [None]:
# Show the (entity text, entity label) pairs extracted for every submission.
print(entities)

##### Performing a topic model

In [None]:
# Turning the texts into tokens: each submission id is mapped to the list of
# lowercased lemmas of its text.
tokenized_corpus = {}

for ID, text in corpus.items():

    # create the spacy doc object
    doc = nlp(text)

    # Keep only content tokens. Stop words, punctuation and number-like tokens
    # are dropped, and so are whitespace tokens: the corpus texts had their
    # newlines replaced by spaces, so runs of spaces occur and spaCy emits them
    # as separate tokens that would otherwise end up in the vocabulary and in
    # the topics.
    tokenized_corpus[ID] = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
    ]

In [None]:
# Build the gensim vocabulary: every distinct token gets an integer id.
words_id = corpora.Dictionary(tokenized_corpus.values())

# Encode each tokenized text as a bag of words using that vocabulary.
corpus_lda = list(map(words_id.doc2bow, tokenized_corpus.values()))

In [None]:
# Model selection: train one LDA model for every candidate number of topics
# between k_init and k_final (inclusive) and print its scores for comparison.
k_init, k_final = 5, 15

for k in range(k_init, k_final + 1):
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus_lda,
        id2word=words_id,
        num_topics=k,
        random_state=50,
        passes=20,
        per_word_topics=True,
    )

    # Score 1: value returned by log_perplexity. NOTE(review): per the gensim
    # docs this is the per-word likelihood bound, not the perplexity itself;
    # lower perplexity corresponds to a bound closer to zero -- confirm before
    # comparing models on this column.
    per_lda = lda_model.log_perplexity(corpus_lda)

    # Score 2: 'c_v' topic coherence against the tokenized texts (higher is better).
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=tokenized_corpus.values(),
        dictionary=words_id,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()

    print(k, per_lda, coherence_lda)

In [None]:
# Train the final LDA model with the number of topics that obtained the
# highest coherence score (12), using the same settings as the search.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus_lda,
    id2word=words_id,
    num_topics=12,
    random_state=50,
    passes=20,
    per_word_topics=True,
)

In [None]:
# Topic composition: list 20 words for each of the 12 topics of the model.
lda_model.show_topics(num_words=20,num_topics=12)

##### Sentiment analysis

In [None]:
# Add the 'spacytextblob' component to the NLP pipeline.
# The membership check keeps this cell re-runnable: add_pipe raises an error
# when a component with the same name is already part of the pipeline.
if 'spacytextblob' not in nlp.pipe_names:
    nlp.add_pipe('spacytextblob')

In [None]:
# Check the pipeline: display the components currently registered in `nlp`,
# which should now include 'spacytextblob'.
nlp.pipeline

In [None]:
# Sentiment analysis with NaiveBayesAnalyzer for the first 50 submissions,
# stored as {submission id: sentiment result}.
#
# A single analyzer instance is shared by all TextBlob objects. Per the
# TextBlob documentation the analyzer trains its classifier when it is first
# used, so creating a new NaiveBayesAnalyzer() for every text would repeat
# that training on each iteration.
nb_analyzer = NaiveBayesAnalyzer()

sent_dict = {}
for ID in list(corpus)[:50]:
    sent_dict[ID] = TextBlob(corpus[ID], analyzer=nb_analyzer).sentiment

In [None]:
# Show the sentiment result stored for each analysed submission id.
print(sent_dict)

##### Calculating TF-IDF

In [None]:
# Flatten the tokenized texts into one list holding every token occurrence
# (duplicates included).
word_list = []
for tokens in tokenized_corpus.values():
    word_list.extend(tokens)

# The vocabulary of the corpus is the set of distinct tokens.
word_set = set(word_list)

In [None]:
# Number of distinct tokens and number of tokenized texts.
# n_texts is expected to be 500, the number of submissions kept in the corpus.
vocab_size, n_texts = len(word_set), len(tokenized_corpus)

In [None]:
# Show the vocabulary size and the number of texts.
print(vocab_size, n_texts)

In [None]:
# Counting frequency of words: how many times each token occurs across all
# the tokenized texts (word_list holds every token occurrence).
word_count = Counter(word_list)

In [None]:
# calculating Term Frequency (TF)
def termFreq(word,text):
    """Return the term frequency of ``word`` in ``text``.

    Args:
        word: The token to look for.
        text: A tokenized text, i.e. a sequence of tokens.

    Returns:
        The number of occurrences of ``word`` divided by the number of
        tokens in ``text``. An empty text yields 0.0 rather than raising
        ZeroDivisionError.
    """
    N = len(text)

    # A text with no tokens contains no occurrence of any word.
    if N == 0:
        return 0.0

    F = Counter(text)[word]

    return F/N

In [None]:
# calculating Inverse Document Frequency (IDF)

# Document frequency: in how many texts each word occurs. Updating the counter
# with set(text) makes a word count once per text, and needs a single pass
# over the corpus instead of scanning every text for every vocabulary word.
doc_freq = Counter()
for text in tokenized_corpus.values():
    doc_freq.update(set(text))

# IDF(word) = log(number of texts / number of texts containing the word)
IDF_dict = {word: np.log(n_texts/count) for word, count in doc_freq.items()}

In [None]:
# checking the TF-IDF of a word in a text
# For every text that contains 'car', print the original text followed by the
# TF-IDF of 'car' in it. TF-IDF is the product TF * IDF; dividing by the IDF
# would invert the weighting and break down when the IDF is 0 (a word that
# occurs in every text).
for ID,text in tokenized_corpus.items():
    if 'car' in text:
        print(corpus[ID])
        print(termFreq('car',text)*IDF_dict['car'])
        print('')