In [1]:
import os
import re
import numpy as np
import pandas as pd
from pprint import pprint

#NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
stop = stopwords.words('english')
sno = SnowballStemmer('english')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from sklearn.datasets import fetch_20newsgroups
import pickle

from gsdmm import MovieGroupProcess


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anidel93\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Wall time: 2.89 s


In [3]:
# Import Dataset
dataRaw = pd.read_csv('DashStories.csv', encoding = "ISO-8859-1")
dataRaw.head()

Unnamed: 0,story id,submitted,permission,country,story,dash title,handle,author department(s),comments,words
0,7616,11/28/2018 3:39,may_share,US,I'm interested in the Godel's take on the stru...,On Godel's Way In: The Influence of Rudolf Carnap,,,,19
1,7615,11/27/2018 23:32,may_share,PA,Good Morning\n\nI have written some articles o...,Staging Lesbian and Gay New York,,,,93
2,7613,11/27/2018 16:41,may_share,US,"well, I am a debater at Bingham High school in...","U.S. High-Skilled Immigration, Innovation, and...",,,,37
3,7612,11/27/2018 4:23,may_share,GB,I am a Safeguarding adviser in the Church of E...,Social Dominance Orientation: A Personality Va...,,,,98
4,7611,11/26/2018 22:35,may_share,DE,ood Morning\n\nI am writing to you on behalf o...,Staging Lesbian and Gay New York,,,,70


In [4]:
# Convert to list
data = dataRaw.story.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [6]:
data_words = list(sent_to_words(data))

In [7]:
%%time
# Remove Stop Words
stop_words = stopwords.words('english')
data_words_nostops = remove_stopwords(data_words)


# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

Wall time: 36.5 s


In [8]:
docs = data_lemmatized

In [70]:
%%time
# Train STTM model
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# K = number of potential topic (which we don't know a priori)
# alpha = 
# beta = 
# n_iters = number of iterations to 
mgp = MovieGroupProcess(K=20, alpha=0.001, beta=0.15, n_iters=15)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)
# Save model
with open('v1.model', 'wb') as f:
 pickle.dump(mgp, f)
 f.close()

In stage 0: transferred 3144 clusters with 20 clusters populated
In stage 1: transferred 2076 clusters with 20 clusters populated
In stage 2: transferred 1254 clusters with 18 clusters populated
In stage 3: transferred 874 clusters with 17 clusters populated
In stage 4: transferred 719 clusters with 14 clusters populated
In stage 5: transferred 630 clusters with 14 clusters populated
In stage 6: transferred 526 clusters with 12 clusters populated
In stage 7: transferred 534 clusters with 11 clusters populated
In stage 8: transferred 530 clusters with 11 clusters populated
In stage 9: transferred 510 clusters with 11 clusters populated
In stage 10: transferred 501 clusters with 11 clusters populated
In stage 11: transferred 473 clusters with 11 clusters populated
In stage 12: transferred 491 clusters with 11 clusters populated
In stage 13: transferred 469 clusters with 10 clusters populated
In stage 14: transferred 510 clusters with 10 clusters populated
Wall time: 59.2 s


In [71]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-20:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

Number of documents per topic : [   5    0    0   19    4    0    0  956    0    0   50    0    2    0
    0   36 1711    0    3  790]
********************
Most important clusters (by number of docs inside): [16  7 19 10 15  3  0  4 18 12 11 13  8 14  6  5 17  2  1  9]
********************


In [49]:
test = []
test.append(str(mgp.cluster_word_distribution[19]))
test = [test[0].replace("'",'')]
test = [test[0].replace("{",'')]
test = [test[0].replace("}",'')]
test2 = test[0].split(', ')
#test2

test3 = []
for i in range(len(test2)):
    test3.append(test2[i].split(': '))
#test3 

top_word = pd.DataFrame(test3)
top_word2 = top_word.sort_values(by=[1])
top_word2.tail(20)


Unnamed: 0,0,1
161,afford,9
391,in,9
1094,real,9
897,sequence,9
786,suggest,9
393,rise,9
969,reduce,9
158,peer,9
679,communication,9
693,figure,9


In [50]:
top_word2.to_csv('topic9.csv')

In [19]:
label = []
for sent in docs:
    temp = mgp.choose_best_label(sent)
    label.append(temp)
label

[(1, 0.9708372250932703),
 (17, 1.0),
 (1, 0.9920822314512151),
 (17, 0.9999999999943517),
 (17, 1.0),
 (10, 0.9999999999995439),
 (10, 0.9999004837561386),
 (10, 0.9999998596021767),
 (10, 0.9988900784338306),
 (10, 0.9999285401193723),
 (10, 0.9999997577901524),
 (17, 0.9999999200534797),
 (10, 0.9999607060168618),
 (10, 0.9999988976304464),
 (10, 0.9999999999999993),
 (1, 0.9999999988844572),
 (17, 0.9999999929259357),
 (10, 0.9999999999997242),
 (1, 0.5846630162124004),
 (1, 0.9999997440754622),
 (17, 0.9282471141044578),
 (19, 1.0),
 (10, 0.9998841565776853),
 (1, 0.9995058397328745),
 (10, 0.9987060332197816),
 (17, 0.9999297409417736),
 (10, 0.9788561515072407),
 (10, 0.9698560589069539),
 (10, 0.5223457898494056),
 (10, 0.46242324332582285),
 (19, 0.9820910393947663),
 (10, 0.9901883724989137),
 (10, 0.9375244199915038),
 (10, 0.9995813282325977),
 (10, 0.9994742435211468),
 (1, 0.9999990327234108),
 (10, 0.999999496830104),
 (10, 0.9964665153130455),
 (17, 0.9999999975964582),

In [40]:
label[1][1]

1.0