# Reading data

In [84]:
import pandas as pd
from utils import *

In [85]:
data = pd.read_csv(osp.join(csvdata, 'articles_bbc_2018_01_30.csv'))

In [86]:
data.shape

(309, 2)

In [87]:
data = data.dropna().reset_index(drop=True)

In [88]:
data.shape

(308, 2)

In [89]:
data

Unnamed: 0,articles,lang
0,Image copyright PA/EPA Image caption Oligarch ...,en
1,Husband admits killing French jogger\r\n\r\nTh...,en
2,Media playback is unsupported on your device M...,en
3,Manchester City's Leroy Sane is ruled out for ...,en
4,Image copyright AFP Image caption Sebastien Br...,en
...,...,...
303,فيديو\r\n\r\nكيف تعبر الحدود...مثل الفيل؟!\r\n...,ar
304,بالصور\r\n\r\nمعالم لندن تحت الأضواء\r\n\r\nمع...,ar
305,يقدم لكم تلفزيون بي بي سي العربي الأخبار والأخ...,ar
306,موجات FM\r\n\r\nنبث إرسالنا على موجات إف إم في...,ar


# Cleaning

#### Keeping English articles

In [90]:
from langdetect import detect
from tqdm import tqdm_notebook
from tqdm import notebook
notebook.tqdm().pandas()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

In [91]:
data

Unnamed: 0,articles,lang
0,Image copyright PA/EPA Image caption Oligarch ...,en
1,Husband admits killing French jogger\r\n\r\nTh...,en
2,Media playback is unsupported on your device M...,en
3,Manchester City's Leroy Sane is ruled out for ...,en
4,Image copyright AFP Image caption Sebastien Br...,en
...,...,...
303,فيديو\r\n\r\nكيف تعبر الحدود...مثل الفيل؟!\r\n...,ar
304,بالصور\r\n\r\nمعالم لندن تحت الأضواء\r\n\r\nمع...,ar
305,يقدم لكم تلفزيون بي بي سي العربي الأخبار والأخ...,ar
306,موجات FM\r\n\r\nنبث إرسالنا على موجات إف إم في...,ar


In [94]:
data['lang'] = detect(data.articles)

TypeError: expected string or bytes-like object

In [95]:
data.lang.value_counts()

en    256
fa      9
fr      8
id      5
uk      4
hi      4
ru      4
vi      4
ar      4
sw      3
pt      2
tr      2
es      2
de      1
Name: lang, dtype: int64

In [96]:
data = data.loc[data.lang=='en']
data.articles


0      Image copyright PA/EPA Image caption Oligarch ...
1      Husband admits killing French jogger\r\n\r\nTh...
2      Media playback is unsupported on your device M...
3      Manchester City's Leroy Sane is ruled out for ...
4      Image copyright AFP Image caption Sebastien Br...
                             ...                        
299    Image copyright Getty Images Image caption Cou...
300    The key piece of advice: "Do the writing. Writ...
301    Image copyright NASA Image caption Nasa develo...
302    To coincide with the Writersroom comedy submis...
307    Hi I am the head of product for BBC News Onlin...
Name: articles, Length: 256, dtype: object

#### Tokenization

In [97]:
from nltk.tokenize import sent_tokenize

In [98]:
data['sentences'] = data.articles.apply(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentences'] = data.articles.apply(sent_tokenize)


In [100]:
# data
data['sentences'].head(1).tolist()[0][:3]

['Image copyright PA/EPA Image caption Oligarch Roman Abramovich (l) and PM Dmitry Medvedev are on the list\r\n\r\nRussian President Vladimir Putin says a list of officials and businessmen close to the Kremlin published by the US has in effect targeted all Russian people.',
 'The list names 210 top Russians as part of a sanctions law aimed at punishing Moscow for meddling in the US election.',
 'However, the US stressed those named were not subject to new sanctions.']

In [11]:
from nltk.tokenize import word_tokenize

In [12]:
data['tokens_sentences'] = data['sentences'].progress_map(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
print(data['tokens_sentences'].head(1).tolist()[0][:3])

KeyError: 'sentences'

#### Lemmatizing with POS tagging

In [None]:
from nltk import pos_tag

In [None]:
data['POS_tokens'] = data['tokens_sentences'].progress_map(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])
print(data['POS_tokens'].head(1).tolist()[0][:3])

In [None]:
# Inspired from https://stackoverflow.com/a/15590384
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatizing each word with its POS tag, in each sentence
data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

In [None]:
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

#### Regrouping tokens and removing stop words

In [None]:
from nltk.corpus import stopwords
stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can']
stopwords_other = ['one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also', 'copyright', 'something']
my_stopwords = stopwords.words('English') + stopwords_verbs + stopwords_other

In [None]:
from itertools import chain # to flatten list of sentences of tokens into list of tokens

In [None]:
data['tokens'] = data['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))
data['tokens'] = data['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])

In [None]:
data['tokens'].head(1).tolist()[0][:30]

# LDA

## Data preparation

#### Prepare bi-grams and tri-grams

In [None]:
from gensim.models import Phrases

In [None]:
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

#### Prepare objects for LDA gensim implementation

In [None]:
from gensim import corpora

In [None]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

## Running LDA

In [None]:
from gensim import models
import numpy as np

In [None]:
np.random.seed(123456)
num_topics = 20
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

## Quick exploration of LDA results

#### Looking at topics

In [None]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+": "+ topic)
    print()

#### Allocating topics to documents

In [None]:
print(data.articles.loc[0][:500])

In [None]:
lda_model[corpus[0]]

#### Predicting topics on unseen documents

In [None]:
document = '''Eric Tucker, a 35-year-old co-founder of a marketing company in Austin, Tex., had just about 40 Twitter followers. But his recent tweet about paid protesters being bused to demonstrations against President-elect Donald J. Trump fueled a nationwide conspiracy theory — one that Mr. Trump joined in promoting. 

Mr. Tucker's post was shared at least 16,000 times on Twitter and more than 350,000 times on Facebook. The problem is that Mr. Tucker got it wrong. There were no such buses packed with paid protesters.

But that didn't matter.

While some fake news is produced purposefully by teenagers in the Balkans or entrepreneurs in the United States seeking to make money from advertising, false information can also arise from misinformed social media posts by regular people that are seized on and spread through a hyperpartisan blogosphere.

Here, The New York Times deconstructs how Mr. Tucker’s now-deleted declaration on Twitter the night after the election turned into a fake-news phenomenon. It is an example of how, in an ever-connected world where speed often takes precedence over truth, an observation by a private citizen can quickly become a talking point, even as it is being proved false.'''
tokens = word_tokenize(document)
topics = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20)
pd.DataFrame([(el[0], round(el[1],2), topics[el[0]][1]) for el in lda_model[dictionary_LDA.doc2bow(tokens)]], columns=['topic #', 'weight', 'words in topic'])

## Advanced exploration of LDA results

#### Allocation of topics in all documents

In [None]:
topics = [lda_model[corpus[i]] for i in range(len(data))]

In [None]:
def topics_document_to_dataframe(topics_document, num_topics):
    res = pd.DataFrame(columns=range(num_topics))
    for topic_weight in topics_document:
        res.loc[0, topic_weight[0]] = topic_weight[1]
    return res

topics_document_to_dataframe([(9, 0.03853655432967504), (15, 0.09130117862212643), (18, 0.8692868808484044)], 20)

In [None]:
# Like TF-IDF, create a matrix of topic weighting, with documents as rows and topics as columns
document_topic = \
pd.concat([topics_document_to_dataframe(topics_document, num_topics=num_topics) for topics_document in topics]) \
  .reset_index(drop=True).fillna(0)

In [None]:
document_topic.head()

In [None]:
# Which document are about topic 14
document_topic.sort_values(14, ascending=False)[14].head(20)

In [None]:
print(data.articles.loc[91][:1000])

#### Looking at the distribution of topics in all documents

In [None]:
%matplotlib inline
import seaborn as sns; sns.set(rc={'figure.figsize':(10,20)})
sns.heatmap(document_topic.loc[document_topic.idxmax(axis=1).sort_values().index])

In [None]:
sns.set(rc={'figure.figsize':(10,5)})
document_topic.idxmax(axis=1).value_counts().plot.bar(color='lightblue')

#### Visualizing topics

In [None]:
# https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf
# Here a short legend to explain the vis:
# size of bubble: proportional to the proportions of the topics across the N total tokens in the corpus
# red bars: estimated number of times a given term was generated by a given topic
# blue bars: overall frequency of each term in the corpus
# -- Relevance of words is computed with a parameter lambda
# -- Lambda optimal value ~0.6 (https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf)
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)