In [11]:
import os
import time
import glob
import json
from scipy.sparse import save_npz, load_npz

import string
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

import pandas as pd
import numpy as np
from functools import reduce
from scipy.stats import entropy
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.mpl.rc("figure", figsize=(9,6))

import warnings; warnings.filterwarnings('ignore')

In [12]:
# Load one tab-separated file per year (2013-2018), keep only articles whose
# 'text' is present and longer than 30 characters, and stack them together.
# The yearly frames are collected in a list and concatenated once:
# DataFrame.append inside a loop copies the accumulated data on every
# iteration and no longer exists in pandas >= 2.0.
yearly_frames = []
for year in range(2013, 2019):
    infile = 'guardian_data/%d/allnews_%d.csv' % (year, year)
    df_year = pd.read_csv(infile, sep='\t')

    # Drop rows without article text, then rows whose text is too short
    df_year = df_year[pd.notnull(df_year['text'])]
    df_year = df_year[df_year['text'].apply(len) > 30]

    yearly_frames.append(df_year)

# Like the original append chain, this keeps each file's own row index
df_allnews = pd.concat(yearly_frames)

df_allnews.head(3)

Unnamed: 0,headline,id,pubDate,sectionId,sectionName,text,trailText,webUrl
0,Fears for hostages as Algeria attacks gas comp...,world/middle-east-live/2013/jan/17/algerian-is...,2013-01-18T09:11:02Z,world,World news,We're closing this live blog but coverage will...,<p>• Reports of deaths of hostages and kidnapp...,https://www.theguardian.com/world/middle-east-...
1,Lance Armstrong admits doping in Oprah intervi...,sport/2013/jan/17/lance-armstrong-oprah-winfre...,2013-01-18T10:10:08Z,sport,Sport,"So, what did we learn here? • That Lance Armst...",<p>Lance Armstrong talks to Oprah Winfrey in h...,https://www.theguardian.com/sport/2013/jan/17/...
2,Venus Williams v Maria Sharapova – as it happened,sport/2013/jan/18/venus-williams-maria-sharapo...,2013-01-18T10:15:22Z,sport,Sport,"A few words from the victor: ""Both of us were ...",<p>Maria Sharapova was in brutal form as she d...,https://www.theguardian.com/sport/2013/jan/18/...


In [13]:
# Any token containing one of these characters is discarded. The set is the
# hand-picked symbols below plus string.punctuation, where the ASCII hyphen
# is swapped for '–', '„' and '“' so that hyphenated words are kept.
# Built once here instead of on every tokenize() call.
_TOKENIZE_INVALID_CHARS = frozenset({
    '¡', '§', '©', '\xad', '°', '²', '³', 'µ', '¹', '¿', '×', '\u200b',
    '•', '‣', '…', '⁄', '₂', '€', '™', '▇', '■', '▶', '◆', '●', '★', '✽',
    '❏', '➝', '主', '原', '年', '後', '歸', '物', '舧', '舰',
}).union(string.punctuation.replace("-", "–„“"))

# Filled on first use, so defining this cell does not require the NLTK
# stopword corpus to be available yet.
_TOKENIZE_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _TOKENIZE_STOPWORDS
    if _TOKENIZE_STOPWORDS is None:
        _TOKENIZE_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _TOKENIZE_STOPWORDS


def tokenize(text):
    """Yield the lower-cased word tokens of ``text`` that pass the filters.

    The text is split with ``nltk.word_tokenize``. A token is skipped when:

    * its lower-cased form has fewer than 3 characters,
    * its lower-cased form is an English stopword,
    * it starts with a punctuation character (this is what rejects a leading
      hyphen, and it also covers tokens made up only of punctuation),
    * it contains any character from ``_TOKENIZE_INVALID_CHARS``.

    Parameters
    ----------
    text : str
        Raw document text.

    Yields
    ------
    str
        Each accepted token, lower-cased, in document order.
    """
    minlength = 3
    stopwords = _english_stopwords()

    for token in nltk.word_tokenize(text):
        t = token.lower()
        if len(t) < minlength or t in stopwords:
            continue
        if t[0] in string.punctuation:
            continue
        if any(char in _TOKENIZE_INVALID_CHARS for char in token):
            continue
        yield t
        
def normalise(vec):
    """Scale a 1-D vector to unit Euclidean (L2) length.

    The vector is divided by its norm ``sqrt(vec . vec)``. Dividing by
    ``vec . vec`` itself (the squared norm) would leave a vector of length
    ``1 / |vec|`` rather than 1.

    Parameters
    ----------
    vec : numpy.ndarray
        1-D numeric vector.

    Returns
    -------
    numpy.ndarray
        ``vec`` rescaled to length 1. An all-zero vector has no direction and
        is returned as zeros instead of NaNs.
    """
    norm = np.sqrt(np.dot(vec, vec))
    # Guard the zero vector: dividing by 1 keeps it at zero and avoids 0/0
    scale = norm if norm > 0 else 1.0
    return vec / scale

def combine_vectors(vectors):
    """Merge several vectors into one.

    The vectors are added element-wise along axis 0 and the sum is then
    rescaled by ``normalise``.
    """
    summed = np.sum(vectors, axis=0)
    return normalise(summed)

def important_words(vectorizer, vec, n):
    """Return the ``n`` highest-weighted vocabulary terms of ``vec``.

    Parameters
    ----------
    vectorizer : object
        A fitted vectorizer exposing ``get_feature_names_out()``
        (scikit-learn >= 1.0) or the older ``get_feature_names()``.
    vec : sequence of numbers
        One weight per vocabulary term, in the vectorizer's feature order.
    n : int
        Number of terms to return.

    Returns
    -------
    list of (term, weight) tuples
        Sorted by weight, largest first; ties keep vocabulary order.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when present and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    ranked = sorted(zip(feature_names, vec), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

### Now let's estimate how long the topic model will take to fit.
_________________________________

25 topics (time in minutes)
[
[100, 1.34],
[200, 3.04],
[500, 4.15],
[1000, 7.44]
]

A linear fit to these timings gives the run time in minutes:
T = 1.1726 + 0.00627 * n_documents

- 10K documents    ->  60 minutes
- 403317 documents ->  2500 minutes (1.8 days)

_________________________________

50 topics (time in minutes)
[
[100, 2.14],
[200, 5.46],
[500, 8.02],
[1000, 15.64],
[2000, 22.82],
]
A linear fit to these timings gives the run time in minutes:
T = 2.78 + 0.01057 * n_documents

- 10K documents    ->  100 minutes (2 hours)
- 403317 documents ->  4200 minutes (3 days)


In [14]:
# Work on the full corpus. For a quick trial run, use a small subset instead,
# e.g. df = df_allnews.head(200)
df = df_allnews
n_documents_total = len(df)
print(n_documents_total, "documents in total")

403317 documents in total


In [88]:
# Print every document whose text is shorter than 60 characters and record
# it as a [position, text] pair in i_t.
i_t = []

for i, t in enumerate(df.text):
    if len(t) >= 60:
        continue
    print(i, t)
    i_t.append([i, t])

142 See the painting here Read more here
2074 Where's Eddie? Can you spot the world's most wanted man?
3840 Todays picture from the past is the Grand Canyon.
5106 Share your own tributes in the comments section below
12950 Our rolling coverage of the budget is here
13125 Sents this year, should you be inspired ...
22982 Stylewatch – the Guardian’s favourite outfits on the planet
23969 Retail giant takes on Apple and Samsung
24382 They ought to ban him for not biting hard enough
25145 Barkers dog grooming chain is pet retailer's latest idea
26018 The pound rises against the dollar
26219 Watch the story of the Belarus Free Theatre in full here:
27352 Six-year wait for retailer's financial project
29381 Paper London Atom iPhone Cover, eBay, £29.95
30892 There are no Guardian Australia masterclasses scheduled
32572 From the no camp ... From the yes camp ...
52715 More from Fashion buy of the day
53473 Burger King $11bn Tim Hortons takeover.
58034 It’s Thursday – sorry I’m late, chatter fan

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
start = time.time()

# Article texts and their ids, in the same row order as df
text = df.text.values
doc_ids = df.id.values

# Bag-of-words model built on the custom tokenizer. With float values,
# min_df / max_df are document-frequency proportions: terms appearing in
# fewer than 0.5% or more than 80% of the documents are dropped.
vectorizer = CountVectorizer(tokenizer=tokenize, min_df=0.005, max_df=0.8)

# Learn the vocabulary and encode the document-term matrix in a single pass.
# Calling fit() and then transform() would tokenize the whole corpus twice.
dtm = vectorizer.fit_transform(text)

In [78]:
# summarize encoded vector
print('Shape of document-term matrix (documents, tokens):', dtm.shape)
print('Total number of tokens:', dtm.sum() )
#print(type(dtm))
#print(dtm.toarray())

end = time.time()
print(round((end - start)/60.0,2),'minutes')

save_npz('guardian_data/dtm_matrix.npz', dtm)

Shape of document-term matrix (documents, tokens): (403317, 8727)
Total number of tokens: 153055379
119.43 minutes


In [43]:
# Reload the document-term matrix previously written with save_npz
dtm = load_npz('guardian_data/dtm_matrix.npz')

In [44]:
# Display the sparse matrix summary (shape, dtype, stored elements)
dtm

<403317x8727 sparse matrix of type '<class 'numpy.int64'>'
	with 99571826 stored elements in Compressed Sparse Row format>

In [None]:
# Number of stored entries in each document row, then the positions of the
# rows that have at least one entry and of those that have none.
nnz_per_row = dtm.getnnz(axis=1)
row_has_entries = nnz_per_row > 0
non_null_rows = np.flatnonzero(row_has_entries)
null_rows = np.flatnonzero(~row_has_entries)

In [32]:
# Keep only the document rows that have at least one stored entry
rows_with_entries = dtm.getnnz(axis=1) > 0
dtm = dtm[rows_with_entries]

In [33]:
# Display the matrix summary again after removing the empty rows
dtm

<403301x8727 sparse matrix of type '<class 'numpy.int64'>'
	with 99571826 stored elements in Compressed Sparse Row format>

In [7]:
import lda

# Model configuration: number of topics, number of iterations, and a fixed
# random_state so that runs are repeatable.
n_topics = 50
lda_settings = dict(n_topics=n_topics, n_iter=1500, random_state=1)
topic_model = lda.LDA(**lda_settings)

In [8]:
# Fit the topic model on the document-term matrix and time the run
start = time.time()
document_topic_distributions = topic_model.fit_transform(dtm)
end = time.time()

elapsed_minutes = round((end - start) / 60.0, 2)
print(elapsed_minutes, 'minutes')

INFO:lda:n_documents: 403301
INFO:lda:vocab_size: 8727
INFO:lda:n_words: 153055379
INFO:lda:n_topics: 50
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1908050848
INFO:lda:<10> log likelihood: -1583325338
INFO:lda:<20> log likelihood: -1411131935
INFO:lda:<30> log likelihood: -1381605888
INFO:lda:<40> log likelihood: -1370863644
INFO:lda:<50> log likelihood: -1365572688
INFO:lda:<60> log likelihood: -1362869870
INFO:lda:<70> log likelihood: -1361116591
INFO:lda:<80> log likelihood: -1359896117
INFO:lda:<90> log likelihood: -1358975711
INFO:lda:<100> log likelihood: -1358101566
INFO:lda:<110> log likelihood: -1357567030
INFO:lda:<120> log likelihood: -1357108876
INFO:lda:<130> log likelihood: -1356701975
INFO:lda:<140> log likelihood: -1356284038
INFO:lda:<150> log likelihood: -1355947002
INFO:lda:<160> log likelihood: -1355649199
INFO:lda:<170> log likelihood: -1355393339
INFO:lda:<180> log likelihood: -1355132053
INFO:lda:<190> log likelihood: -1354900571
INFO:lda:<200> log likel

1637.52 minutes


In [51]:
# Vocabulary terms in feature order. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when it exists and fall
# back to the old method on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
topic_names = ['Topic %d'%k for k in range(1, n_topics + 1)]

# One row per topic, one column per vocabulary term
topic_word_distributions = pd.DataFrame(topic_model.components_, columns=vocab, index=topic_names)

# One row per document, one column per topic. The index picks the ids of the
# documents at positions non_null_rows.
# NOTE(review): this assumes non_null_rows was computed on the unfiltered
# matrix, so that it lines up with doc_ids - confirm the cell run order.
document_topic_distributions = pd.DataFrame(document_topic_distributions,
                                            columns=topic_names,
                                            index=doc_ids[non_null_rows])

In [52]:
# Write both distribution tables to CSV, with the topic count in the file name
doc_topics_path = 'guardian_data/document_topic_distributions_%dtopics.csv' % n_topics
topic_words_path = 'guardian_data/topic_word_distributions_%dtopics.csv' % n_topics

document_topic_distributions.to_csv(doc_topics_path)
topic_word_distributions.to_csv(topic_words_path)

In [56]:
# Show the 15 highest-weighted words of the first topic
topic_1_weights = topic_word_distributions.loc['Topic 1']
topic_1_weights.sort_values(ascending=False).head(15)

growth      0.012292
economy     0.009615
bank        0.009512
year        0.008162
market      0.007252
markets     0.006865
said        0.006764
rate        0.006681
greece      0.006339
also        0.006335
economic    0.006279
last        0.006176
rates       0.005592
since       0.005583
prices      0.005391
Name: Topic 1, dtype: float64