In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pickle

In [2]:
%load_ext autoreload
%autoreload 2

import warnings; warnings.simplefilter('ignore')
import os, codecs, string, random
import numpy as np
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
import matplotlib.pyplot as plt
%matplotlib inline  

seed = 42
random.seed(seed)
np.random.seed(seed)

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

#Vader
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Load data

In [3]:
DATA_PATH = './data/data2016/'
FILE_2016 = DATA_PATH + 'M3_df_2016_complete.pkl'

In [4]:
with open(FILE_2016, 'rb') as input_file:
    df_2016 = pickle.load(input_file)

In [5]:
df_2016

Unnamed: 0,speaker,qids,quoteID,quotation,date,urls,sitenames,domain,tags,gender,citizenship,Description,date_of_birth,numOcurrences,age,age_range,Continent,quotation_length,media_country,media_country_qid
0,... Palmer,Q54597043,2016-09-13-027551,hands of the terrorists.,2016-09-13 07:53:40,[http://feeds.cbsnews.com/~r/CBSNewsMain/~3/tK...,[cbsnews],[com],[[]],Male,,,NaT,1,,,,24,[nan],[None]
1,... Palmer,Q54597043,2016-09-25-001824,a revolutionary manure dispenser.,2016-09-25 19:05:00,[http://feeds.latimes.com/~r/latimes/entertain...,[latimes],[com],[[culture]],Male,,,NaT,1,,,,33,[nan],[None]
2,... Palmer,Q54597043,2016-09-25-008261,"But most of all, we wanted to beat each other ...",2016-09-25 19:05:00,[http://feeds.latimes.com/~r/latimes/entertain...,[latimes],[com],[[culture]],Male,,,NaT,1,,,,56,[nan],[None]
3,... Palmer,Q54597043,2016-09-25-010400,Critics who have said a safer shot here or the...,2016-09-25 19:05:00,[http://feeds.latimes.com/~r/latimes/entertain...,[latimes],[com],[[culture]],Male,,,NaT,1,,,,123,[nan],[None]
4,... Palmer,Q54597043,2016-09-25-014983,going for the green in two was who I was as a ...,2016-09-25 19:05:00,[http://feeds.latimes.com/~r/latimes/entertain...,[latimes],[com],[[culture]],Male,,,NaT,1,,,,84,[nan],[None]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1958304,Özlem Cekic,Q3744442,2016-07-28-129301,What we should be doing instead is supporting ...,2016-07-28 13:25:07,[http://www.independent.co.uk/news/world/europ...,[independent],[co.uk],[[]],Female,Turkey,sovereign state straddling Southeastern Europe...,1976-05-07,1,40.0,"(30.0, 40.0]",Asia,203,[United Kingdom],[Q145]
1958305,Łukasz Grass,Q9394606,2016-05-11-136703,We focus on emerging industries and technologi...,2016-05-11 08:46:37,[http://www.politico.eu/blogs/spence-on-media/...,[politico],[eu],[[business]],Male,,,1976-02-20,1,40.0,"(30.0, 40.0]",,108,[United States of America],[Q30]
1958306,Łukasz Habaj,Q15966376,2016-08-04-020959,"Estonia opened my eyes a little wider,",2016-08-04 00:00:00,[http://au.eurosport.com/erc/polish-champion-h...,[eurosport],[com],[[]],Male,,,1975-04-29,1,41.0,"(40.0, 50.0]",,38,[nan],[None]
1958307,Łukasz Habaj,Q15966376,2016-08-05-054147,"It was at the second corner,",2016-08-05 00:00:00,[http://au.eurosport.com/erc/after-ss2-habaj-c...,[eurosport],[com],[[]],Male,,,1975-04-29,1,41.0,"(40.0, 50.0]",,28,[nan],[None]


# Choose subset of data for test

In [6]:
test_quotes = df_2016['quotation'][:100]
test_quotes

0                              hands of the terrorists.
1                     a revolutionary manure dispenser.
2     But most of all, we wanted to beat each other ...
3     Critics who have said a safer shot here or the...
4     going for the green in two was who I was as a ...
                            ...                        
95    My Auntie G money, just called me and said man...
96          What the f -- Courtney call my phone NOW!!!
97    I actually lived with my crazy roommate for th...
98    This Bitch was trying to steal something got k...
99      I don't know why people would want to cross me,
Name: quotation, Length: 100, dtype: object

# NLP Pipeline: topic detection

In [57]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [58]:
# Put in raw text, get a Spacy object
quotes = [nlp(i) for i in test_quotes]

In [59]:
quotes

[hands of the terrorists.,
 a revolutionary manure dispenser.,
 But most of all, we wanted to beat each other to a pulp.,
 Critics who have said a safer shot here or there would undoubtedly have won me a few more tournaments are probably correct,,
 going for the green in two was who I was as a boy -- and it's who I remain as a man.,
 I found out they were guys that had actually taken leave,,
 I grew up in poverty on the edge of a golf course,,
 I saw how people lived on the other side of the tracks, the upper crust and the WASPs at the country club. We had chickens and pigs in our yards. We butchered every year. I'll never forget those things.,
 It was a nice championship to place on a list of achievements but not an essential one,,
 Never has been, never will be. Golf is the most democratic game on Earth.... It punishes and exalts us all with splendid equal opportunity.,
 They weren't just given permission to come out there and do that... and it expanded from there.,
 Wake without Bud

In [60]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

processed_quotes = list()
for quote in nlp.pipe(df_2016['quotation'][:10000]):#, n_threads=-1): #batch_size=10000): 

    # Process document using Spacy NLP pipeline.
    ents = quote.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    quote = [token.lemma_ for token in quote if token.is_alpha and not token.is_stop]
    
    # Remove common words from a stopword list and keep only words of length 3 or more.
    quote = [token for token in quote if token not in STOPWORDS and len(token) > 2]
    
    # Add named entities, but only if they are a compound of more than word.
    quote.extend([str(entity) for entity in ents if len(entity) > 1])
    
    processed_quotes.append(quote)

quotes = processed_quotes
del processed_quotes

In [61]:
STOPWORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [62]:
# Add bigrams too
from gensim.models.phrases import Phrases

# Add bigrams to docs (only ones that appear 15 times or more).
bigram = Phrases(quotes, min_count=15)

for idx in range(len(quotes)):
    for token in bigram[quotes[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            quotes[idx].append(token)

In [63]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.
from gensim.corpora import Dictionary
dictionary = Dictionary(quotes)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.000001
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(quote) for quote in quotes]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 0
Number of chunks: 10000


In [64]:
# models
from gensim.models import LdaMulticore
params = {'passes': 10, 'random_state': seed}
base_models = dict()
model = LdaMulticore(corpus=corpus, num_topics=6, id2word=dictionary, workers=6,
                passes=params['passes'], random_state=params['random_state'])

ValueError: cannot compute LDA over an empty collection (no terms)

In [None]:
model.show_topics(num_words=5)

In [None]:
model.show_topic(0,20)

In [None]:
sorted(model[corpus[0]],key=lambda x:x[1],reverse=True)

In [None]:
# plot topics
data =  gensimvis.prepare(model, corpus, dictionary, R=10) #R : number of terms to display,recommended to be in (10;50)
pyLDAvis.display(data)

In [18]:
# assignment
sent_to_cluster = list()
for n, quote in enumerate(corpus):
    if quote:
        cluster = max(model[quote],key=lambda x:x[1])
        sent_to_cluster.append(cluster[0])

In [19]:
# accuracy
from collections import Counter
for book, cluster in book_id.items():
    assignments = list()
    for real,given in zip(chunk_class,sent_to_cluster):
        if real == cluster:
            assignments.append(given)
    most_common,num_most_common = Counter(assignments).most_common(1)[0] # 4, 6 times
    print(book,":",most_common,"-",num_most_common)
    print("Accuracy:",num_most_common/limit)
    print("------")

NameError: name 'book_id' is not defined

# Comparing male vs female

In [20]:
quotes = df_2016[:10000]
quotes_f = quotes[quotes['gender']=='Female']['quotation']
quotes_m = quotes[quotes['gender']=='Male']['quotation']

In [21]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

# MALE
processed_quotes_m = list()
for quote in nlp.pipe(quotes_m):#, n_threads=-1): #batch_size=10000): 

    # Process document using Spacy NLP pipeline.
    ents = quote.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    quote = [token.lemma_ for token in quote if token.is_alpha and not token.is_stop]
    
    # Remove common words from a stopword list and keep only words of length 3 or more.
    quote = [token for token in quote if token not in STOPWORDS and len(token) > 2]
    
    # Add named entities, but only if they are a compound of more than word.
    quote.extend([str(entity) for entity in ents if len(entity) > 1])
    
    processed_quotes_m.append(quote)

quotes_m = processed_quotes_m
del processed_quotes_m

In [22]:
len(quotes_m)

9152

In [23]:
# FEMALE
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

processed_quotes_f = list()
for quote in nlp.pipe(quotes_f):#, n_threads=-1): #batch_size=10000):

    # Process document using Spacy NLP pipeline.
    ents = quote.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    quote = [token.lemma_ for token in quote if token.is_alpha and not token.is_stop]
    
    # Remove common words from a stopword list and keep only words of length 3 or more.
    quote = [token for token in quote if token not in STOPWORDS and len(token) > 2]
    
    # Add named entities, but only if they are a compound of more than word.
    quote.extend([str(entity) for entity in ents if len(entity) > 1])
    
    processed_quotes_f.append(quote)

quotes_f = processed_quotes_f
del processed_quotes_f

In [24]:
len(quotes_f)

848

In [25]:
# FEMALE
bigram = Phrases(quotes, min_count=15)

for idx in range(len(quotes_f)):
    for token in bigram[quotes_f[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            quotes_f[idx].append(token)
            
# MALE
for idx in range(len(quotes_m)):
    for token in bigram[quotes_m[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            quotes_m[idx].append(token)

In [26]:
# FEMALE
# Create a dictionary representation of the documents, and filter out frequent and rare words.
from gensim.corpora import Dictionary
dictionary_f = Dictionary(quotes_f)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 5
dictionary_f.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus_f = [dictionary_f.doc2bow(quote) for quote in quotes_f]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary_f))
print('Number of chunks: %d' % len(corpus_f))


# MALE

dictionary_m = Dictionary(quotes_m)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 5
dictionary_m.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus_m = [dictionary_m.doc2bow(quote) for quote in quotes_m]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary_m))
print('Number of chunks: %d' % len(corpus_m))

Number of unique tokens: 333
Number of chunks: 848
Number of unique tokens: 3019
Number of chunks: 9152


In [27]:
# FEMALE models
from gensim.models import LdaMulticore
params = {'passes': 10, 'random_state': seed}
base_models = dict()
model_f = LdaMulticore(corpus=corpus_f, num_topics=5, id2word=dictionary_f, workers=6,
                passes=params['passes'], random_state=params['random_state'])

# MALE 
model_m = LdaMulticore(corpus=corpus_m, num_topics=5, id2word=dictionary_m, workers=6,
                passes=params['passes'], random_state=params['random_state'])

In [28]:
# plot topics
data_f =  gensimvis.prepare(model_f, corpus_f, dictionary_f, R=10)
pyLDAvis.display(data_f)

In [29]:
# plot topics
data_m =  gensimvis.prepare(model_m, corpus_m, dictionary_m, R=10)
pyLDAvis.display(data_m)