# Document Tagging: BBC News Articles 

The corpus used in this project includes 2,225 documents from the BBC's news website, corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) from 2004-2005. 

The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this project, I will use only the text column.

More information on this data set, as well as a paper written using it, is available at <http://mlg.ucd.ie/datasets/bbc.html>.

## Data Prep

### Import Libraries

In [1]:
import pprint
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

import random
random.seed(42)

### Load Data

In [5]:
# Load the corpus and keep only the article text of the first 100 documents.
# .copy() detaches the sample from the intermediate frame, so adding columns
# to `df` later does not trigger pandas' SettingWithCopyWarning.
df = pd.read_csv("data/BBC-articles.csv")
df = df[["text"]].head(100).copy()
df.head(3)

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...


### Initial Prep

In [26]:
import re


def preprocess_text(text):
    """Tokenize, filter and lemmatize a collection of documents.

    Args:
        text: Iterable of document strings (for example the ``text`` column
            of the DataFrame).

    Returns:
        A list of lists: one inner list per document, holding the lemmatized
        tokens of that document. Tokens that are English stopwords or that
        have two characters or fewer are dropped before lemmatization.
        Tokens are compared as-is; no lowercasing is done here.
    """
    lem = WordNetLemmatizer()
    # A set gives constant-time membership tests inside the token filter.
    stop = set(stopwords.words('english'))

    return [
        [lem.lemmatize(w) for w in word_tokenize(txt) if w not in stop and len(w) > 2]
        for txt in text
    ]

In [28]:
# The tokenization step below is disabled; uncomment it to (re)build the
# cleanTokens column from the raw text with preprocess_text.
# NOTE(review): the recorded output under this cell already shows a cleanTokens
# column, presumably from an earlier run of this line — confirm.
#df['cleanTokens'] = preprocess_text(df.text)
df.head()

Unnamed: 0,text,cleanTokens,cleanText
0,tv future in the hands of viewers with home th...,"[future, hand, viewer, home, theatre, system, ...",
1,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor...",
2,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, say,...",
3,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, cup, premiership, s...",
4,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raid, box, office, ocean, twel...",


### Data Transformation

In [None]:
# Transform raw documents into TF-IDF features with scikit-learn.
# Implemented configurations: "tfidfUnfiltered", "tfidf", "tfidfFiltered".
def vectorizeStep(inputData, fittingData=df.text, outputFormat="tfidfUnfiltered"):
    """Fit a TF-IDF vectorizer on ``fittingData`` and apply it to ``inputData``.

    Args:
        inputData: Iterable of raw document strings to transform.
        fittingData: Iterable of raw document strings used to fit the
            vectorizer. The default is bound to ``df.text`` once, at the
            moment this function is defined.
        outputFormat: Name of the vectorizer configuration:
            ``"tfidfUnfiltered"`` - single words, no stop-word or
            document-frequency filtering;
            ``"tfidf"`` - single words, English stop words removed,
            ``max_df=0.9``, ``min_df=5``;
            ``"tfidfFiltered"`` - 2- and 3-word n-grams, ``max_df=0.9``,
            ``min_df=5``.

    Returns:
        The TF-IDF matrix produced by ``vectorizer.transform(inputData)``.

    Raises:
        ValueError: If ``outputFormat`` is not one of the names above.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Settings shared by every configuration.
    base_options = dict(strip_accents="unicode", lowercase=True, analyzer='word')

    # Settings specific to each configuration name.
    format_options = {
        "tfidfUnfiltered": {},
        "tfidf": dict(stop_words='english', max_df=0.9, min_df=5),
        "tfidfFiltered": dict(ngram_range=(2, 3), max_df=0.9, min_df=5),
    }

    # Fail loudly on an unknown name instead of reaching the return statement
    # with no result assigned.
    if outputFormat not in format_options:
        raise ValueError(
            "Unknown outputFormat {!r}; expected one of {}".format(
                outputFormat, sorted(format_options)
            )
        )

    vectorizer = TfidfVectorizer(**base_options, **format_options[outputFormat])
    vectorizer.fit(fittingData)

    return vectorizer.transform(inputData)

## Modeling

In [30]:
# Whitespace-split every article into a token list (one list per document) for Gensim
tokens = [article.split() for article in df["text"]]

In [33]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

def _tfidf_views(docs, no_below=None, no_above=None):
    """Return (dictionary, bag-of-words corpus, TF-IDF model, TF-IDF corpus) for ``docs``.

    When both ``no_below`` and ``no_above`` are given, the dictionary is pruned
    with ``filter_extremes`` before the bag-of-words corpus is built.
    """
    vocab = Dictionary(docs)
    if no_below is not None and no_above is not None:
        vocab.filter_extremes(no_below=no_below, no_above=no_above)
    bow = [vocab.doc2bow(doc) for doc in docs]
    weighting = TfidfModel(bow)
    return vocab, bow, weighting, weighting[bow]


# Variant 1: basic cleaning only (full vocabulary)
dictionaryLtd, dtmLtd, tfidfVectorizerLtd, tfidfLtd = _tfidf_views(tokens)

# Variant 2: vocabulary pruned by document frequency
# (terms in fewer than 5 documents or in more than 90% of documents are dropped)
dictionaryFiltered, dtmFiltered, tfidfVectorizerFiltered, tfidfFiltered = _tfidf_views(
    tokens, no_below=5, no_above=0.90
)

# Variant 3: meant for nouns, noun phrases and named entities.
# NOTE: no part-of-speech or entity filtering is applied to `tokens` here,
# so this variant is currently built exactly like variant 1.
dictionaryNouns, dtmNouns, tfidfVectorizerNouns, tfidfNouns = _tfidf_views(tokens)

### LDA

In [36]:
from gensim.models import LdaModel, LdaMulticore

# Fit one 10-topic LDA model per TF-IDF variant.
# random.seed() does not seed gensim; LdaModel takes its own `random_state`,
# so it is passed explicitly to make the fitted topics repeatable across runs.

# tfidfLtd
lda_Ltd = LdaModel(corpus=tfidfLtd, id2word=dictionaryLtd, num_topics=10, random_state=42)

# tfidfFiltered
lda_Filtered = LdaModel(corpus=tfidfFiltered, id2word=dictionaryFiltered, num_topics=10, random_state=42)

# tfidfNouns
lda_Nouns = LdaModel(corpus=tfidfNouns, id2word=dictionaryNouns, num_topics=10, random_state=42)

In [37]:
# Top-5 keywords of every topic assigned to each document.
# One entry is appended per (document, topic) pair, most probable topic first,
# so `keywords` can hold more than one entry per document.
keywords = []

for doc_vector in tfidfLtd:
    doc_topics = sorted(lda_Ltd[doc_vector], key=lambda tup: -1*tup[1])
    for index, _score in doc_topics:
        # show_topic returns (word, probability) pairs directly, which avoids
        # parsing the formatted print_topic string (that parsing breaks on
        # tokens containing '+', '*' or '"').
        keywords.append([word for word, _ in lda_Ltd.show_topic(index, 5)])

In [38]:
# Spot-check one document: list its topics under the basic-cleaning LDA model,
# highest score first
x = 51
doc_topics = lda_Ltd[tfidfLtd[x]]
ranked_topics = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    topic_summary = lda_Ltd.print_topic(index, 5)
    print("\nTopic No: {}\t\nScore: {}\t\nTopic Model: {}".format(index, score, topic_summary))


Topic No: 1	
Score: 0.9272080659866333	
Topic Model: 0.001*"sayeed" + 0.001*"mr" + 0.001*"howard" + 0.001*"worldcom" + 0.001*"pension"


In [None]:
# Topic distribution of document 1 under the basic-cleaning LDA model, most
# probable topic first. Uses lda_Ltd / tfidfLtd, the model and corpus built in
# the cells above.
sorted(lda_Ltd[tfidfLtd[1]], key=lambda tup: -1*tup[1])

### LDA Interactive

In [None]:
# !pip install pyldavis

In [None]:
# interacting with LDA output
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

# Visualise the basic-cleaning LDA model together with the corpus and
# dictionary it was trained on.
vis = gensimvis.prepare(lda_Ltd, tfidfLtd, dictionaryLtd)
vis

In [None]:
# Evaluating LDA models: Topic coherence
from gensim.models import CoherenceModel

# Two deliberately contrasting 2-topic models trained on the bag-of-words
# corpus: 50 iterations versus a single iteration.
goodLdaModel = LdaModel(corpus=dtmLtd, id2word=dictionaryLtd, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=dtmLtd, id2word=dictionaryLtd, iterations=1, num_topics=2)

# u_mass coherence is computed from the bag-of-words corpus passed as `corpus`.
goodcm = CoherenceModel(model=goodLdaModel, corpus=dtmLtd, dictionary=dictionaryLtd, coherence='u_mass')
badcm  = CoherenceModel(model=badLdaModel, corpus=dtmLtd, dictionary=dictionaryLtd, coherence='u_mass')

In [None]:
# c_v coherence is a sliding-window measure, so `texts` must be the tokenized
# documents (lists of words), not a bag-of-words corpus.
goodcm = CoherenceModel(model=goodLdaModel, texts=tokens, dictionary=dictionaryLtd, coherence='c_v')
badcm  = CoherenceModel(model=badLdaModel, texts=tokens, dictionary=dictionaryLtd, coherence='c_v')

### LSI

In [35]:
from gensim.models import LsiModel, CoherenceModel

# Fit one 10-topic LSI model per TF-IDF variant, in the order
# basic cleaning -> frequency-filtered -> nouns.
lsi_Ltd, lsi_Filtered, lsi_Nouns = (
    LsiModel(corpus=weighted_corpus, id2word=vocab, num_topics=10)
    for weighted_corpus, vocab in (
        (tfidfLtd, dictionaryLtd),
        (tfidfFiltered, dictionaryFiltered),
        (tfidfNouns, dictionaryNouns),
    )
)

### Evaluate

### Top Keywords

### Add Keywords to DataFrame