# NER, Sentiment Analysis, and Topic Models

### NER

In [None]:
# importing the spaCy library into Python
import spacy

# loading the small English pipeline ('en_core_web_sm'); the loaded pipeline is
# stored under the name 'nlp' and reused by the cells below
nlp = spacy.load('en_core_web_sm')

In [None]:
# read the whole example post into a single string ...
with open('data/am_i_the_asshole_example.txt', encoding='UTF-8') as f:
    text = f.read()

# ... and run it through the spaCy pipeline
doc = nlp(text)

In [None]:
# one line per named entity: its surface text followed by its label
for entity in doc.ents:
    print(entity.text, entity.label_)

In [None]:
# asking spaCy to explain what the 'NORP' label stands for
spacy.explain('NORP')

If you need to adapt the NER model, see the spaCy [EntityRecognizer documentation](https://spacy.io/api/entityrecognizer) for more information.

### Sentiment Analysis

Sentiment analysis with spaCy and TextBlob. TextBlob is a lexicon-based sentiment analyzer: it uses a dictionary of scored words to calculate a sentence's polarity.

In [None]:
# installing spacytextblob
!pip install spacytextblob

In [None]:
# downloading corpora
!python -m textblob.download_corpora

In [None]:
# checking which spaCy version is installed
spacy.__version__

In [None]:
# checking the components of the spaCy NLP pipeline
nlp.components

In [None]:
from spacytextblob.spacytextblob import SpacyTextBlob 
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [None]:
# adding the TextBlob component to the NLP pipeline; guarded so that re-running
# this cell does not fail because the component is already in the pipeline
if 'spacytextblob' not in nlp.pipe_names:
    nlp.add_pipe('spacytextblob')

In [None]:
# inspecting the processing pipeline currently attached to 'nlp'
nlp.pipeline

In [None]:
# re-parse the text with every line break turned into a single space
doc = nlp(' '.join(text.split('\n')))

In [None]:
# polarity of the whole document, read from the custom `._.polarity` extension
doc._.polarity

In [None]:
# subjectivity of the whole document, read from the custom `._.subjectivity` extension
doc._.subjectivity

In [None]:
# the assessments stored on the document via the custom `._.assessments` extension
# (presumably the individual scored words/phrases behind the totals; verify in the output)
doc._.assessments

In [None]:
# sentence by sentence: the sentence text next to its polarity score
for sentence in doc.sents:
    print(sentence.text, sentence._.polarity)

In [None]:
# build the analyzer once and share it across sentences: NaiveBayesAnalyzer trains
# its classifier the first time an instance is used, so creating a new instance
# per sentence would repeat that training on every iteration
nb_analyzer = NaiveBayesAnalyzer()

for span in doc.sents:
    blob = TextBlob(span.text, analyzer=nb_analyzer)
    print(span, blob.sentiment)

### Topic Models

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import spacy

import numpy as np
import pandas as pd
import os

In [None]:
# start again from a fresh English pipeline
nlp = spacy.load('en_core_web_sm')

# keep only the tokenizer special-case rules whose key contains no apostrophe
# (straight or curly), so words with an apostrophe are not split apart
nlp.tokenizer.rules = {
    pattern: special_case
    for pattern, special_case in nlp.tokenizer.rules.items()
    if not any(mark in pattern for mark in ("'", "’", "‘"))
}

In [None]:
# folder holding the documents used for topic modelling (relative to the notebook)
path_to_folder = 'data/topic_models/'

In [None]:
# collect the corpus files in a stable (sorted) order; sub-directories and hidden
# entries (e.g. .ipynb_checkpoints, .DS_Store) are skipped because every path in
# this list is later opened as UTF-8 text
path_to_files = sorted(
    os.path.join(path_to_folder, name)
    for name in os.listdir(path_to_folder)
    if not name.startswith('.') and os.path.isfile(os.path.join(path_to_folder, name))
)

#### Preprocessing the corpus

In [None]:
# extra stopwords on top of spaCy's own list; add as many as necessary (lowercase)
extra_stop = ['mr','ms','mrs','hon']

In [None]:
# turning the texts into tokens: one list of lowercased lemmas per file
tokenized_corpus = []

for my_file in path_to_files:

    with open(my_file, encoding='utf-8') as f:
        # join the lines with a space so the last word of one line is not
        # glued to the first word of the next one
        raw_text = f.read().replace('\n', ' ')

    # create the spaCy doc object (the file is already closed at this point)
    file_doc = nlp(raw_text)

    # filtering tokens and lemmatizing
    lemmas = []
    for word in file_doc:
        # drop stopwords, punctuation, number-like tokens and the whitespace
        # tokens spaCy emits for runs of spaces
        if word.is_stop or word.is_punct or word.is_space or word.like_num:
            continue

        # compare the lowercased lemma: extra_stop holds lowercase entries
        lemma = word.lemma_.lower()
        if lemma not in extra_stop:
            lemmas.append(lemma)

    tokenized_corpus.append(lemmas)

#### Turning the corpus into bags of words

In [None]:
# build the word -> id vocabulary from the tokenized documents
words_id = corpora.Dictionary(tokenized_corpus)

# encode every tokenized document as its bag-of-words representation
corpus = list(map(words_id.doc2bow, tokenized_corpus))

##### Checking coherence score

In [None]:
# checking "optimal" number of topics
k_init = 5
k_final = 15

# one row of scores per candidate k, kept so the runs can be compared afterwards
score_rows = []
for k in range(k_init, k_final + 1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words_id,
                                           num_topics=k,
                                           random_state=50,
                                           passes=20,
                                           per_word_topics=True)

    # log_perplexity returns the per-word likelihood bound (higher, i.e. closer
    # to 0, is better); perplexity itself is 2 ** (-bound) (lower is better)
    per_lda = lda_model.log_perplexity(corpus)

    # c_v coherence score (higher is better)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_corpus, dictionary=words_id, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # printed as each model finishes, since every iteration trains a full model
    print(k, per_lda, coherence_lda)
    score_rows.append({
        'num_topics': k,
        'log_perplexity_bound': per_lda,
        'perplexity': np.exp2(-per_lda),
        'coherence_c_v': coherence_lda,
    })

# all candidates in one table, best coherence first
topic_scores = pd.DataFrame(score_rows).set_index('num_topics')
topic_scores.sort_values('coherence_c_v', ascending=False)

##### Running the LDA model for the number of topics with the highest coherence score

In [None]:
# fit the final model with the number of topics chosen from the coherence check (9)
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=words_id,
    num_topics=9,
    passes=20,
    random_state=50,
    per_word_topics=True,
)

##### Topic composition

In [None]:
# the 20 highest-weighted words for each of the 9 topics
lda_model.show_topics(num_topics=9, num_words=20)