This notebook explores the related topics to veganism in our dataset.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Supresses warnings
pd.options.mode.chained_assignment = None
sns.set_theme(style = "white")

# matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# nltk
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pyLDAvis.gensim_models
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary

# custom function
from functions import *

nltk.download('stopwords')
nltk.download('punct')
nltk.download('brown')

# dir paths
DATA_DIR = '../data'
IMG_DIR = '../img'
RAW_DATA_DIR = "../raw_data/"

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
[nltk_data] Downloading package stopwords to /home/romain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading punct: Package 'punct' not found in index
[nltk_data] Downloading package brown to /home/romain/nltk_data...
[nltk_data]   Package brown is already up-to-date!


# Load the data

We load our dataset and preprocess using our main pipeline.

In [2]:
df = pd.read_json(f'{DATA_DIR}/quotebank_attr_2015-2020.json.bz2', compression='bz2')
preprocess_dataframe(df)
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,speaker_qid,gender,nationality,date_of_birth,ethnic_group,occupation,party,academic_degree,domains
0,2015-10-05-010038,At Neath Food Festival with a vegetarian platt...,Bethan Jenkins,[Q4897688],2015-10-05 05:00:00,1,"[[Bethan Jenkins, 0.8565], [None, 0.1435]]",[http://www.southwales-eveningpost.co.uk/Town-...,E,Q4897688,,,,,,,,[southwales-eveningpost.co.uk]
1,2015-11-28-006688,"But if I had to choose my favourite recipe, it...",Trish Deseine,[Q7843986],2015-11-28 00:00:00,1,"[[Trish Deseine, 0.7664], [None, 0.1125], [Kei...",[http://www.irishexaminer.com/lifestyle/featur...,E,Q7843986,,,,,,,,[irishexaminer.com]
2,2015-11-10-015422,"Children are learning about responsibility, te...",David Price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2015-11-10 19:30:41,1,"[[David Price, 0.7849], [None, 0.2151]]",[http://news.ifas.ufl.edu/2015/11/ufifas-bok-t...,E,,,,,,,,,[ufl.edu]
3,2015-01-16-054273,Once we have added all the vegetables and stoc...,,[],2015-01-16 11:40:11,4,"[[None, 0.6778], [Catherine O'Neill, 0.3222]]",[http://www.bromsgrovestandard.co.uk/2015/01/1...,E,,,,,,,,,"[bromsgrovestandard.co.uk, bromsgrovestandard...."
4,2015-02-20-089622,vegans in a steakhouse.,Jim Gaffigan,[Q2093638],2015-02-20 00:05:44,1,"[[Jim Gaffigan, 0.7894], [None, 0.2106]]",[http://unfspinnaker.com/expect-jim-gaffigans-...,E,Q2093638,[male],[United States of America],[+1966-07-07T00:00:00Z],,"[screenwriter, film producer, television actor...",,,[unfspinnaker.com]


# Topic modelling

The goal is here to identify the main topics related to our filtered quotes.

In [3]:
# Load the trained pipeline
nlp = spacy.load('en_core_web_md')  # en_core_web_sm, en_core_web_lg

# Select the quotations
quotations = df['quotation']

Our dataset is to large (66k quotations) to be handled directly. That's why we'll use the quotations as chunks.

In [4]:
docs = []
for doc in nlp.pipe(quotations, n_process=12, batch_size=50):
    ents = doc.ents
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    doc = [token for token in doc if token not in STOP_WORDS and len(token) > 2]
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    docs.append(doc)

In [5]:
print(f"Input  : {quotations[1]}")
print(f"Output : {docs[1]}")

Input  : But if I had to choose my favourite recipe, it's the epic Beef Bourguignon with its `roughly chopped' vegetables, one large glass of brandy and `one small calf's foot. (Optional but preferable!)' .
Output : ['choose', 'favourite', 'recipe', 'epic', 'Beef', 'Bourguignon', 'roughly', 'chop', 'vegetable', 'large', 'glass', 'brandy', 'small', 'calf', 'foot', 'optional', 'preferable', 'Beef Bourguignon']


Now, let's add bigrams to the docs.

In [6]:
bigram = Phrases(docs, min_count=15)

all_bigrams = set()
for i in range(len(docs)):
    for token in bigram[docs[i]]:
        if '_' in token:
            docs[i].append(token)
            all_bigrams.add(token)

print(f"{len(all_bigrams)} different bigrams found.")

754 different bigrams found.


In [7]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.6
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 13219
Number of chunks: 66584


In [8]:
# models
from gensim.models import LdaMulticore
params = {'passes': 10, 'random_state': 1}
base_models = dict()
model = LdaMulticore(corpus=corpus, num_topics=6, id2word=dictionary, workers=12,
                passes=params['passes'], random_state=params['random_state'])

In [9]:
model.show_topics(num_words=5)

[(0,
  '0.048*"vegetation" + 0.008*"water" + 0.008*"fire" + 0.007*"area" + 0.007*"tree"'),
 (1,
  '0.050*"base" + 0.049*"plant" + 0.044*"plant_base" + 0.020*"food" + 0.019*"diet"'),
 (2,
  '0.061*"vegan" + 0.030*"vegetarian" + 0.022*"eat" + 0.015*"like" + 0.013*"food"'),
 (3,
  '0.035*"vegetable" + 0.016*"grow" + 0.010*"farmer" + 0.007*"garden" + 0.007*"work"'),
 (4,
  '0.050*"vegetable" + 0.035*"fruit" + 0.027*"fruit_vegetable" + 0.016*"food" + 0.008*"product"'),
 (5,
  '0.076*"vegetable" + 0.022*"fruit" + 0.015*"eat" + 0.012*"like" + 0.011*"fruit_vegetable"')]

In [10]:
model.show_topic(1,20)

[('base', 0.05017142),
 ('plant', 0.049485866),
 ('plant_base', 0.043673504),
 ('food', 0.020369995),
 ('diet', 0.018561907),
 ('product', 0.013337391),
 ('meat', 0.013290618),
 ('protein', 0.012773569),
 ('health', 0.0070091747),
 ('consumer', 0.0065474445),
 ('vegetable', 0.0065056942),
 ('vegan', 0.006362391),
 ('eat', 0.0063002612),
 ('year', 0.005349728),
 ('animal', 0.0051920707),
 ('dairy', 0.0049777376),
 ('healthy', 0.004250421),
 ('high', 0.004197711),
 ('source', 0.003969543),
 ('benefit', 0.0038815574)]

In [11]:
# plot topics
data =  pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


We can identify several topics, such as animal protection, diets, community related, environment...