In [1]:
import spacy, nltk, gensim, sklearn
import pandas as pd
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases


In [2]:
def create_corpus(quotes):
    #Putting all the quotes in one corpus
    text = ""

    for quote in quotes:
        text = text + ' ' + quote

    #Removing the new lines
    text = " ".join(text.split())
    return text


In [3]:
year = 2017 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/'
QUOTES_FILE = PATH_DATA + f'quotes-{year}-filtered.json.bz2'
CHUNK_SIZE = 250

reader = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', chunksize=CHUNK_SIZE, typ='frame')

chunks = [] #utile pour plus loins quand on fait le feature extraction

for i, chunk in enumerate(reader):
    chunks.append(create_corpus(chunk.quotation))
    if i == 1000:
        break

print(len(chunks))

1001


In [4]:
nlp = spacy.load('en_core_web_sm')
#nlp.Defaults.stop_words = []

In [5]:
PATH = PATH_DATA + 'stopword_list.csv'
sw = pd.read_csv(PATH, header=None)
sw


Unnamed: 0,0,1,2,3,4
0,a's,able,about,above,according
1,accordingly,across,actually,after,afterwards
2,again,against,ain't,all,allow
3,allows,almost,alone,along,already
4,also,although,always,am,among
...,...,...,...,...,...
104,why,will,willing,wish,with
105,within,without,won't,wonder,would
106,wouldn't,yes,yet,you,you'd
107,you'll,you're,you've,your,yours


In [6]:
#loading default spacy stopword list
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

#creating new stopword list
sw_list = list(sw[0]) + list(sw[1]) + list(sw[2]) + list(sw[3]) + list(sw[4]) + list(spacy_stopwords)
nlp.Defaults.stop_words = sw_list


#creating a spacy object
sw_list


["a's",
 'accordingly',
 'again',
 'allows',
 'also',
 'amongst',
 'anybody',
 'anyways',
 'appropriate',
 'aside',
 'available',
 'because',
 'before',
 'below',
 'between',
 'by',
 "can't",
 'certain',
 'com',
 'consider',
 'corresponding',
 'definitely',
 'different',
 "don't",
 'each',
 'else',
 'et',
 'everybody',
 'exactly',
 'fifth',
 'follows',
 'four',
 'gets',
 'goes',
 'greetings',
 'has',
 'he',
 'her',
 'herein',
 'him',
 'how',
 "i'm",
 'immediate',
 'indicate',
 'instead',
 'it',
 'itself',
 'know',
 'later',
 'lest',
 'likely',
 'ltd',
 'me',
 'more',
 'must',
 'nd',
 'needs',
 'next',
 'none',
 'nothing',
 'of',
 'okay',
 'ones',
 'others',
 'ourselves',
 'own',
 'placed',
 'probably',
 'rather',
 'regarding',
 'right',
 'saying',
 'seeing',
 'seen',
 'serious',
 'she',
 'so',
 'something',
 'soon',
 'still',
 "t's",
 'th',
 'that',
 'theirs',
 'there',
 'therein',
 "they'd",
 'third',
 'though',
 'thus',
 'toward',
 'try',
 'under',
 'unto',
 'used',
 'value',
 'vs',


In [7]:
processed_docs = list()
for doc in nlp.pipe(chunks, batch_size=10):

    # Process document using Spacy NLP pipeline.
    ents = doc.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list and keep only words of length 3 or more.
    doc = [token for token in doc if token not in sw_list and len(token) > 2]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])

    processed_docs.append(doc)
docs = processed_docs
del processed_docs

doc 0 processed
doc 1 processed
doc 2 processed
doc 3 processed
doc 4 processed
doc 5 processed
doc 6 processed
doc 7 processed
doc 8 processed
doc 9 processed
doc 10 processed
doc 11 processed
doc 12 processed
doc 13 processed
doc 14 processed
doc 15 processed
doc 16 processed
doc 17 processed
doc 18 processed
doc 19 processed
doc 20 processed
doc 21 processed
doc 22 processed
doc 23 processed
doc 24 processed
doc 25 processed
doc 26 processed
doc 27 processed
doc 28 processed
doc 29 processed
doc 30 processed
doc 31 processed
doc 32 processed
doc 33 processed
doc 34 processed
doc 35 processed
doc 36 processed
doc 37 processed
doc 38 processed
doc 39 processed
doc 40 processed
doc 41 processed
doc 42 processed
doc 43 processed
doc 44 processed
doc 45 processed
doc 46 processed
doc 47 processed
doc 48 processed
doc 49 processed
doc 50 processed
doc 51 processed
doc 52 processed
doc 53 processed
doc 54 processed
doc 55 processed
doc 56 processed
doc 57 processed
doc 58 processed
doc 59 

In [None]:
docs = None
docs = []
for doc in processed_docs.copy():
    mini_doc = []
    for string in doc.copy():
        mini_doc.append(string)
    docs.append(mini_doc.copy())

In [16]:
save = copy.deepcopy(processed_docs)

In [31]:
# Add bigrams too

# Add bigrams to docs (only ones that appear 15 times or more).
bigram = Phrases(save, min_count=15)

for idx in range(len(save)):
    for token in bigram[save[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            save[idx].append(token)

In [32]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.


dictionary = Dictionary(save)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 1
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in save]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 96941
Number of chunks: 1001


In [33]:
# models
from gensim.models import LdaMulticore
from numpy.random import seed as random_seed

seed = 32
params = {'passes': 10, 'random_state': seed}
base_models = dict()
model = LdaMulticore(corpus=corpus, num_topics=10, id2word=dictionary,
                passes=params['passes'], random_state=params['random_state'])

In [34]:
#model.show_topics(num_words=5)

In [35]:
#model.show_topic(1,20)

In [36]:
sorted(model[corpus[0]],key=lambda x:x[1],reverse=True)

[(0, 0.7269185), (9, 0.15168291), (2, 0.111944765)]

In [37]:
# plot topics
import pyLDAvis.gensim_models
data =  pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)