# Document Tagging: BBC News Articles 

This corpus used in this project includes 2,225 documents from BBC's news website corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) from 2004-2005. 

The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this project, I will use only the text column.

More information on this data set as well as a paper written using this data set is available here http://mlg.ucd.ie/datasets/bbc.html.

## Data Prep

### Import Libraries

In [1]:
import pprint
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

import random
random.seed(42)

### Load Data

In [2]:
df = pd.read_csv("data/BBC-articles.csv")
df = df[['text']][:100]
df.head(3)

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...


### Initial Prep

In [3]:
'''
This function takes as input a df and name of column (containing sentences) in the df.
The input is split to tokens which are lemmatized, and stopwords removed.
The output is a list of lists.  
'''
import re
def preprocess_text(text):    
    cleanTokens=[]
    #cleanText= ""
    lem = WordNetLemmatizer()
    stop = set(stopwords.words('english'))

    for txt in text:
        words = [lem.lemmatize(w) for w in word_tokenize(txt) if (w not in stop) and len(w)>2]
        cleanTokens.append(words)        

    return cleanTokens

In [8]:
df['cleanText'] = preprocess_text(df.text)
df.head()

Unnamed: 0,text,cleanTokens,cleanText
0,tv future in the hands of viewers with home th...,"[future, hand, viewer, home, theatre, system, ...","[future, hand, viewer, home, theatre, system, ..."
1,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor...","[worldcom, bos, left, book, alone, former, wor..."
2,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, say,...","[tiger, wary, farrell, gamble, leicester, say,..."
3,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, cup, premiership, s...","[yeading, face, newcastle, cup, premiership, s..."
4,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raid, box, office, ocean, twel...","[ocean, twelve, raid, box, office, ocean, twel..."


### Data Transformation

In [5]:
# transform data (options: TF-IDF, TF-IDF ngrams, word2vec, doc2vec)
def vectorizeStep(inputData, fittingData=df.text, outputFormat="tfidfUnfiltered"):
    
    # TF-IDF input
    if outputFormat == "tfidf":
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', 
                            stop_words='english', max_df=0.9, min_df=5
                                )
        vectorizer.fit(fittingData)
        
        transformedData = vectorizer.transform(inputData)
    
    # TF-IDF ngrams input
    elif outputFormat == "tfidfFiltered":
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', ngram_range=(2,3), 
                            max_df=0.9, min_df=5
                            )
        vectorizer.fit(fittingData)
        
        transformedData = vectorizer.transform(inputData)
        
    return transformedData

In [6]:
vecCorpus = vectorizeStep(df.text, fittingData=df.text, outputFormat="tfidf")

## Modeling

### LDA

In [9]:
# BOW model
dic_bow = gensim.corpora.Dictionary(df.cleanText)
bow_corpus = [dic_bow.doc2bow(doc) for doc in df.cleanText]

In [10]:
# tfidf model
from gensim import corpora, models
tfidfGensim = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidfGensim[bow_corpus]

In [22]:
# LDA using TF-IDF
from gensim.models import LdaModel, LdaMulticore

lda_model_tfidf = LdaMulticore(corpus_tfidf, num_topics=5, id2word=dic_bow, passes=2, workers=4)

In [17]:
# testing on a select output using LDA TF-IDF model
x = 51
for index, score in sorted(lda_model_tfidf[bow_corpus[x]], key=lambda tup: -1*tup[1]):
    print("\nTopic No: {}\t\nScore: {}\t\nTopic Model: {}".format(index, score, lda_model_tfidf.print_topic(index, 5)))


Topic No: 2	
Score: 0.9886234402656555	
Topic Model: 0.001*"phone" + 0.001*"government" + 0.001*"mobile" + 0.001*"music" + 0.001*"yukos"


In [19]:
# Top keywords in each topic
keywords = []

for i in range(len(df)):
    
    for index, score in sorted(lda_model_tfidf[bow_corpus[i]], key=lambda tup: -1*tup[1]):
        elements = lda_model_tfidf.print_topic(index, 5).split("+")
        keywords.append([x.strip().replace('"', '').split("*")[1] for x in elements])

In [20]:
sorted(lda_model_tfidf[bow_corpus[1]], key=lambda tup: -1*tup[1])

[(2, 0.9953849)]

### LDA Interactive

In [None]:
# !pip install pyldavis

In [21]:
# interacting with LDA output
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

vis = gensimvis.prepare(lda_model_tfidf, bow_corpus, dic_bow)
vis

  from imp import reload
  default_term_info = default_term_info.sort_values(


In [23]:
# Evaluating LDA models: Topic coherence
from gensim.models import CoherenceModel

goodLdaModel = LdaModel(corpus=bow_corpus, id2word=dic_bow, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=bow_corpus, id2word=dic_bow, iterations=1, num_topics=2)

goodcm = CoherenceModel(model=goodLdaModel, corpus=bow_corpus, dictionary=dic_bow, coherence='u_mass')
badcm  = CoherenceModel(model=badLdaModel, corpus=bow_corpus, dictionary=dic_bow, coherence='u_mass')

In [24]:
goodcm.get_coherence()

-1.6388275355777575

In [26]:
goodcm = CoherenceModel(model=goodLdaModel, texts=bow_corpus, dictionary=dic_bow, coherence='c_v')
badcm  = CoherenceModel(model=badLdaModel, texts=bow_corpus, dictionary=dic_bow, coherence='c_v')

In [27]:
goodcm.get_coherence()

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


nan

In [28]:
badcm.get_coherence()

nan

### LSI

In [30]:
# LSI
from gensim.models import LsiModel, CoherenceModel

lsi_model = LsiModel(corpus=corpus_tfidf, id2word=dic_bow, num_topics=5)
lsi_model.print_topics(-1)

  sparsetools.csc_matvecs(


[(0,
  '0.158*"party" + 0.143*"hague" + 0.089*"government" + 0.086*"price" + 0.085*"people" + 0.084*"rate" + 0.082*"england" + 0.080*"film" + 0.077*"donation" + 0.077*"music"'),
 (1,
  '-0.588*"hague" + -0.272*"party" + -0.160*"front" + -0.150*"ambition" + -0.122*"leadership" + -0.115*"politics" + -0.113*"william" + -0.113*"bench" + -0.103*"leader" + -0.103*"conservative"'),
 (2,
  '-0.204*"price" + -0.197*"rate" + -0.172*"growth" + -0.154*"bank" + -0.150*"economy" + -0.140*"oil" + -0.127*"crude" + -0.126*"quarter" + 0.125*"rugby" + -0.100*"market"'),
 (3,
  '-0.256*"hague" + -0.150*"price" + -0.128*"rate" + 0.125*"film" + 0.117*"party" + 0.108*"donation" + -0.107*"bank" + -0.106*"england" + 0.104*"people" + 0.104*"government"'),
 (4,
  '0.580*"film" + 0.176*"festival" + 0.174*"dicaprio" + 0.161*"scholl" + 0.138*"award" + 0.115*"starring" + 0.108*"hague" + 0.090*"hill" + 0.081*"rating" + 0.080*"halloween"')]

In [31]:
lsi_model.print_topics(num_topics=5, num_words=10)

[(0,
  '0.158*"party" + 0.143*"hague" + 0.089*"government" + 0.086*"price" + 0.085*"people" + 0.084*"rate" + 0.082*"england" + 0.080*"film" + 0.077*"donation" + 0.077*"music"'),
 (1,
  '-0.588*"hague" + -0.272*"party" + -0.160*"front" + -0.150*"ambition" + -0.122*"leadership" + -0.115*"politics" + -0.113*"william" + -0.113*"bench" + -0.103*"leader" + -0.103*"conservative"'),
 (2,
  '-0.204*"price" + -0.197*"rate" + -0.172*"growth" + -0.154*"bank" + -0.150*"economy" + -0.140*"oil" + -0.127*"crude" + -0.126*"quarter" + 0.125*"rugby" + -0.100*"market"'),
 (3,
  '-0.256*"hague" + -0.150*"price" + -0.128*"rate" + 0.125*"film" + 0.117*"party" + 0.108*"donation" + -0.107*"bank" + -0.106*"england" + 0.104*"people" + 0.104*"government"'),
 (4,
  '0.580*"film" + 0.176*"festival" + 0.174*"dicaprio" + 0.161*"scholl" + 0.138*"award" + 0.115*"starring" + 0.108*"hague" + 0.090*"hill" + 0.081*"rating" + 0.080*"halloween"')]

In [33]:
# Determining optimum number of topics using coherence values 

coherence_values = []
model_list = []
min_topics, max_topics, step = 1, 10, 1
for i in range(min_topics, max_topics, step):
    model = LsiModel(corpus_tfidf, id2word=dic_bow, num_topics=i)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=df.cleanText, \
        dictionary=dic_bow, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

  sparsetools.csc_matvecs(


In [None]:
import matplotlib.pyplot as plt
x = range(min_topics, max_topics, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.legend(("coherence_values"), loc="best")
plt.show()

### Evaluate

### Top Keywords

### Add Keywords to DataFrame