# Document Tagging: BBC News Articles 

The corpus used in this project includes 2,225 documents from the BBC News website, corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) published in 2004-2005.

The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this project, I will use only the text column.

More information on this data set, as well as a paper written using it, is available at http://mlg.ucd.ie/datasets/bbc.html.

#### Import Libraries

In [1]:
import pprint
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

import random
random.seed(42)

#### Load Data

In [2]:
# Read the articles, keep only the text column and work on the first 100 rows.
# .copy() gives the sliced frame its own data, so the keyword columns that are
# assigned to `df` later in the notebook do not trigger pandas'
# SettingWithCopyWarning (a row slice is otherwise tracked as a derived frame).
df = pd.read_csv("data/BBC-articles.csv")
df = df[['text']][:100].copy()

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...


#### Initial Prep

In [3]:
'''
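This function takes as input an iterable of text strings (for example a dataframe column of sentences).
Each string is split into tokens; stopwords and tokens of one or two characters are removed, and the remaining tokens are lemmatized.
The output is a list of lists (one list of tokens per input string).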
'''
import re
def preprocess_text(text):
    """Tokenize and lemmatize every document in ``text``.

    ``text`` is iterated once; each item is passed to ``word_tokenize``.
    Tokens that are English stopwords, or that are at most two characters
    long, are dropped; every remaining token is lemmatized with
    ``WordNetLemmatizer``.

    Returns a list with one list of lemmatized tokens per input document.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def _clean(document):
        # Filter first, lemmatize second: the stopword/length test is applied
        # to the raw token, not to its lemma.
        kept = []
        for token in word_tokenize(document):
            if token in stop_words or len(token) <= 2:
                continue
            kept.append(lemmatizer.lemmatize(token))
        return kept

    return [_clean(document) for document in text]

#### Modeling

In [4]:
'''
This function takes as input a string of text and returns a list of nouns, noun phrases and named entities.
The function has a high complexity, and there may be more efficient ways to go about it.
However, this gives me the output I desire more compared to available methods/packages.
'''
import nltk
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

def getNouns(text):
    """Return the nouns and named entities found in ``text``.

    The text is split into sentences; each sentence is tokenized and
    POS-tagged once. Every token tagged NN, NNS, NNP or NNPS is collected
    (duplicates are kept, in order of appearance). Each named-entity subtree
    produced by ``ne_chunk`` is then joined into a single space-separated
    string and added if that exact string has not been collected yet.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Nouns and named entities, in the order they were found.
    """
    from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
    from nltk.tree import Tree

    noun_tags = {"NN", "NNP", "NNS", "NNPS"}
    nouns = []      # local result; no longer published as a module global
    seen = set()    # mirrors `nouns` for O(1) "already collected?" checks

    for sentence in sent_tokenize(text):
        # Tag once and reuse the result for both the noun pass and the chunker.
        tagged = pos_tag(word_tokenize(sentence))

        for word, pos in tagged:
            if pos in noun_tags:
                nouns.append(word)
                seen.add(word)

        for node in ne_chunk(tagged):
            # Only subtrees are named entities; plain (token, tag) leaves are not.
            if not isinstance(node, Tree):
                continue
            # Each entity is built from its own leaves only, so an entity that
            # was already collected can never be prepended to a later one.
            named_entity = " ".join(token for token, _ in node.leaves())
            if named_entity not in seen:
                nouns.append(named_entity)
                seen.add(named_entity)

    return nouns

In [5]:
'''
This function takes as input a dataframe, a column name (str) and a TF-IDF data format (basic, filtered or nouns).
basic -> only basic preprocessing done before building tfidf
filtered -> the dictionary is filtered before building tfidf: words that appear in more than 90% of the documents or in fewer than 5 documents are removed
nouns -> tfidf built on text limited to nouns, noun phrases, and named entities.

It returns a dictionary and a TF-IDF corpus.
'''

def getCorpus(df=df, column="text", tfidfFormat="basic"):
    """Build a gensim dictionary and TF-IDF corpus from one text column.

    Parameters
    ----------
    df : DataFrame
        Source data; ``df[column]`` must yield one raw text string per
        document. The default is the module-level ``df`` object as it was
        when this function was defined.
    column : str
        Name of the column holding the document text.
    tfidfFormat : {"basic", "filtered", "nouns"}
        "basic"    - tokens come from ``preprocess_text``.
        "filtered" - as "basic", then ``filter_extremes(no_below=5,
                     no_above=0.90)`` is applied to the dictionary.
        "nouns"    - tokens come from ``getNouns`` applied to each document.

    Returns
    -------
    tuple
        ``(dictionary, tfidfCorpus)``: the gensim ``Dictionary`` and the
        bag-of-words corpus wrapped by a ``TfidfModel`` fitted on it.

    Raises
    ------
    ValueError
        If ``tfidfFormat`` is not one of the three supported values
        (previously the function silently returned ``None``).
    """
    if tfidfFormat not in ("basic", "filtered", "nouns"):
        raise ValueError(
            "tfidfFormat must be 'basic', 'filtered' or 'nouns', got %r" % (tfidfFormat,)
        )

    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    # Step 1: tokenize. Only the "nouns" format uses a different tokenizer.
    if tfidfFormat == "nouns":
        tokens = df[column].apply(lambda x: getNouns(x))
    else:
        tokens = preprocess_text(df[column])

    # Step 2: vocabulary, optionally pruned by document frequency.
    dictionary = Dictionary(tokens)
    if tfidfFormat == "filtered":
        dictionary.filter_extremes(no_below=5, no_above=0.90)

    # Step 3: bag-of-words per document, then TF-IDF weighting.
    dtm = [dictionary.doc2bow(doc) for doc in tokens]
    vectorizer = TfidfModel(dtm)
    tfidfCorpus = vectorizer[dtm]
    return dictionary, tfidfCorpus

In [6]:
'''
This function takes as input a dictionary, a corpus, the type of model (lda or lsi) and the number of topics.
It builds a model using these parameters and return the model.
'''

def buildModel(dictionary, corpus, modelType: str, num_topics):
    """Train and return a gensim topic model.

    Parameters
    ----------
    dictionary
        Passed to the model as ``id2word``.
    corpus
        Passed to the model as ``corpus``.
    modelType : str
        "lda" builds an ``LdaModel``; "lsi" builds an ``LsiModel``.
    num_topics
        Number of topics requested from the model.

    Returns
    -------
    The trained ``LdaModel`` or ``LsiModel``.

    Raises
    ------
    ValueError
        If ``modelType`` is neither "lda" nor "lsi" (previously the function
        silently returned ``None``).
    """
    # Validate before the (comparatively slow) gensim import so that a typo
    # in modelType fails immediately with a clear message.
    if modelType not in ("lda", "lsi"):
        raise ValueError("modelType must be 'lda' or 'lsi', got %r" % (modelType,))

    from gensim.models import LsiModel, LdaModel

    model_class = LdaModel if modelType == "lda" else LsiModel
    return model_class(corpus=corpus, id2word=dictionary, num_topics=num_topics)

In [7]:
'''
This function takes as input a corpus and a model (like the one returned by the buildModel function).
It returns a list of keywords found from running on the corpus
'''

def getKeywords(model, corpus):
    """Return, for every document, the top-5 words of its best-scoring topic.

    Parameters
    ----------
    model
        Topic model supporting ``model[doc]`` (yielding ``(topic, score)``
        pairs) and ``model.show_topic(topic, topn=...)`` (yielding
        ``(word, weight)`` pairs), such as the models returned by buildModel.
    corpus
        Iterable of documents in the representation the model was trained on.

    Returns
    -------
    list of list of str
        One entry per document, in corpus order. A document for which the
        model reports no topics gets an empty list, so the result always
        lines up with the corpus row for row.
    """
    topic_words = {}   # topic index -> its top-5 words, looked up once per topic
    keywords = []

    for doc in corpus:
        topics = model[doc]
        if not topics:
            keywords.append([])
            continue

        # Exactly one topic per document: the one with the highest score.
        # (Appending every topic and truncating afterwards would misalign
        # the keyword rows with the documents.)
        index, _ = max(topics, key=lambda tup: tup[1])
        if index not in topic_words:
            topic_words[index] = [word for word, _ in model.show_topic(index, topn=5)]

        # Hand out a copy so rows never share one mutable list.
        keywords.append(list(topic_words[index]))

    return keywords

In [8]:
'''
In this step we create a list of formats (basic/filtered/nouns) and model types (lda/lsi) which we will use to build our models.
We use getKeywords() function and add new columns to the initial dataframe with these keywords.
'''

lstFormats = ['basic', 'filtered', 'nouns']
modelTypes = ['lda', 'lsi']

# Outer loop: one dictionary / TF-IDF corpus per cleaning format, built once
# and shared by both model types.
for lstFormat in lstFormats:
    dictionary, corpus = getCorpus(df, "text", lstFormat)

    # Inner loop: train each model type on that corpus.
    for modelType in modelTypes:
        # Column name is "<format>_<model>", e.g. "basic_lda".
        colname = "_".join((lstFormat, modelType))

        model = buildModel(dictionary, corpus, modelType, 10)

        # Keywords per document, stored as a new dataframe column.
        kw = getKeywords(model, corpus)
        df[colname] = kw

In [None]:
# Preview the first rows of the dataframe, which now also holds the keyword
# columns (one per format/model combination) assigned in the loop above.
df.head()

#### Looking through the created columns of keywords, I opine that the LSI model trained on the filtered TF-IDF corpus (dictionary restricted to words that appear in at least 5 documents and in no more than 90% of the documents) does a better job.

# Function to get best model using coherence scores?

In [None]:
# Determining optimum number of topics using coherence values for tfidfLtd
# TODO: wrap this cell in getOptimalTopics(min_topics=1, max_topics=10, step=1).

# Both classes are needed here; LsiModel is otherwise only imported locally
# inside buildModel and CoherenceModel only in a later cell.
from gensim.models import CoherenceModel, LsiModel

# NOTE(review): no visible cell assigns tfidfLtd, dictionaryLtd or tokens.
# They are rebuilt here from the "filtered" format, which is what the "Ltd"
# suffix presumably refers to - confirm.
dictionaryLtd, tfidfLtd = getCorpus(df=df, column="text", tfidfFormat="filtered")
# c_v coherence is computed from tokenized texts, not from the TF-IDF corpus.
tokens = preprocess_text(df["text"])

coherence_values = []
model_list = []
min_topics, max_topics, step = 1, 10, 1

# range() stops before max_topics, so topic counts 1..9 are evaluated.
for i in range(min_topics, max_topics, step):
    model = LsiModel(tfidfLtd, id2word=dictionaryLtd, num_topics=i)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dictionaryLtd, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

In [None]:
import matplotlib.pyplot as plt

# Same topic counts that were used to compute coherence_values.
x = range(min_topics, max_topics, step)

# Label the line on the plot call itself. The previous
# plt.legend(("coherence_values"), ...) passed a plain string (the parentheses
# do not make a tuple), so the legend entry was just its first character, "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.legend(loc="best")
plt.show()

### LDA Interactive for the best model

In [None]:
# interacting with LDA output
# Builds the interactive pyLDAvis view for one LDA model and displays it
# inline in the notebook.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Arguments are passed as (model, corpus, dictionary).
# NOTE(review): this cell relies on lda_Ltd, tfidfLtd and dictionaryLtd already
# existing in the notebook session; no visible cell assigns lda_Ltd - confirm
# where it is built before running.
vis = gensimvis.prepare(lda_Ltd, tfidfLtd, dictionaryLtd)
# Last expression of the cell, so the notebook renders the visualisation.
vis

In [None]:
# Evaluating LDA models: Topic coherence
# Trains a "good" (50 iterations) and a "bad" (1 iteration) LDA model on the
# nouns corpus and compares them with two coherence measures.
from gensim.models import CoherenceModel, LdaModel

# NOTE(review): no visible cell assigns tfidfNouns / dictionaryNouns; they are
# rebuilt here from the "nouns" format - confirm this matches the intent.
dictionaryNouns, tfidfNouns = getCorpus(df=df, column="text", tfidfFormat="nouns")
# c_v coherence needs the tokenized documents themselves, not the TF-IDF corpus.
nounTokens = [getNouns(text) for text in df["text"]]

goodLdaModel = LdaModel(corpus=tfidfNouns, id2word=dictionaryNouns, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=tfidfNouns, id2word=dictionaryNouns, iterations=1, num_topics=2)

# u_mass models get their own names so the c_v models below no longer
# overwrite them before they are used.
goodcm_umass = CoherenceModel(model=goodLdaModel, corpus=tfidfNouns, dictionary=dictionaryNouns, coherence='u_mass')
badcm_umass = CoherenceModel(model=badLdaModel, corpus=tfidfNouns, dictionary=dictionaryNouns, coherence='u_mass')
goodcm = CoherenceModel(model=goodLdaModel, texts=nounTokens, dictionary=dictionaryNouns, coherence='c_v')
badcm = CoherenceModel(model=badLdaModel, texts=nounTokens, dictionary=dictionaryNouns, coherence='c_v')

# Actually compute and show the scores (good model first, bad model second).
print("u_mass:", goodcm_umass.get_coherence(), badcm_umass.get_coherence())
print("c_v:", goodcm.get_coherence(), badcm.get_coherence())