In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
stopwords = set(stopwords.words('english'))

In [2]:
vox = pd.read_json('./data/voxtest.jsonl', lines=True)

In [3]:
trans = str.maketrans('', '', string.punctuation + '0123456789')
stemmer = SnowballStemmer('english')


def filter_func(document):
    result = []
    wordlist = nltk.word_tokenize(document.lower().translate(trans))
    for word in wordlist:
        c1 = word not in stopwords 
        c2 = len(word) > 2
        c3 = not word.startswith('https')
        c4 = not re.match('document[a-z]+', word)
        if c1 and c2 and c3 and c4:
            result.append(stemmer.stem(word.encode('ascii', 'ignore').decode('UTF-8')))
    return result

vox['id'] = vox._id.apply(lambda x: x['$oid'])
vox.drop('_id', axis=1, inplace=True)
vox.text = vox.text.apply(lambda x: filter_func(x))

In [4]:
y = vox.date.apply(lambda x: x[:-1] + 'AM' if x.endswith('a') else x[:-1] + 'PM' if x.endswith('p') else x)
y = y.apply(lambda x: x[:12] if x.endswith('M') else x)

for row in y.iteritems():
    try:
        if row[1] == 'NULL':
            y[row[0]] = np.nan
        else:
            y[row[0]] = pd.to_datetime(row[1])
    except:
        pass

vox.date = y

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tf', CountVectorizer()),
    ('lda', LatentDirichletAllocation(learning_method='online', max_iter=50))
])

parameters = {
    'lda__n_topics': range(5, 10)
}

gs = GridSearchCV(pipeline, parameters)
gs.fit(vox.text.apply(lambda x: ' '.join(x)))


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

feature_names = gs.best_estimator_.steps[0][1].get_feature_names()
lda = gs.best_estimator_.steps[1][1]
print_top_words(lda, feature_names, 4)

Topic #0:
one like show make
Topic #1:
drug food health use
Topic #2:
coal vox newslett mine
Topic #3:
trump peopl would like
Topic #4:
tardigrad flint moor confeder



In [114]:
cv = gs.best_estimator_.steps[0][1]
temp = lda.transform(cv.transform(vox.text.apply(lambda x: ' '.join(x))))
temp = pd.DataFrame(temp)

In [115]:
temp.columns = ['topic' + str(t) for t in temp.columns]
temp = temp.head(10)

In [116]:
def func(x):
    return x.index[x == np.max(x)]
temp['maxcol']=temp.apply(lambda x: func(x)func(x), axis=1)

In [117]:
temp

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,maxcol
0,0.000175,0.000179,0.000173,0.999301,0.000172,topic3
1,0.998519,0.000371,0.000369,0.000375,0.000366,topic0
2,0.000427,0.000424,0.000422,0.826493,0.172235,topic3
3,0.018192,0.000685,0.000693,0.97975,0.000679,topic3
4,0.000615,0.000615,0.000622,0.92803,0.070118,topic3
5,0.000202,0.232004,0.0002,0.767397,0.000198,topic3
6,0.99896,0.000262,0.000259,0.000264,0.000256,topic0
7,0.58474,0.414646,0.000204,0.000207,0.000202,topic0
8,0.048396,0.000452,0.00046,0.950244,0.000448,topic3
9,0.017117,0.000392,0.000392,0.981709,0.000389,topic3
