# Document Tagging: BBC News Articles 

This corpus used in this project includes 2,225 documents from BBC's news website corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) from 2004-2005. 

The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this project, I will use only the text column.

More information on this data set as well as a paper written using this data set is available here http://mlg.ucd.ie/datasets/bbc.html.

## Data Prep

### Import Libraries

In [1]:
import pprint
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

import random
random.seed(42)

### Load Data

In [2]:
df = pd.read_csv("data/BBC-articles.csv")
df = df[['text']][:100]
df.head(3)

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...


### Initial Prep

### Data Transformation

In [3]:
# transform data (options: TF-IDF, ...)
def vectorizeStep(inputData, fittingData=df.text, outputFormat="tfidf"):
    
    # TF-IDF input
    if outputFormat == "tfidf":
        vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, 
                            analyzer='word', stop_words='english'
                            )

        vectorizer.fit(fittingData)
        
        transformedData = vectorizer.transform(inputData)
    
    return transformedData

## Modeling

### LDA

In [4]:
'''
This function takes as input a df and name of column (containing sentences) in the df.
The input is split to tokens which are lemmatized, and stopwords removed.
The output is a list of lists.  
'''
def preprocess_text(text):
    cleanText=[]
    lem = WordNetLemmatizer()
    stop = set(stopwords.words('english'))

    for txt in text:
        words = [w for w in word_tokenize(txt) if (w not in stop)]
        words = [lem.lemmatize(w) for w in words if len(w)>2]
        cleanText.append(words)        
    return cleanText

In [5]:
preprocessedText = preprocess_text(df['text'])

In [None]:
'''
txt = df['text'].str.split()
txt = txt.values.tolist()
txt[0]
'''

In [13]:
# BOW model
dic_bow = gensim.corpora.Dictionary(preprocessedText)
bow_corpus = [dic_bow.doc2bow(doc) for doc in preprocessedText]

# tfidf model
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# LDA using BoW
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dic_bow, passes=2, workers=2)

# LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dic_bow, passes=2, workers=4)

In [38]:
# testing on a select output using LDA BoW model
x = 88

for index, score in sorted(lda_model[bow_corpus[x]], key=lambda tup: -1*tup[1]):
    print("\nTopic No: {}\t\nScore: {}\t\nTopic Model: {}".format(index, score, lda_model.print_topic(index, 10)))



Topic No: 2	
Score: 0.9973008036613464	
Topic Model: 0.012*"said" + 0.006*"year" + 0.005*"government" + 0.005*"would" + 0.004*"price" + 0.004*"music" + 0.004*"last" + 0.004*"new" + 0.003*"market" + 0.003*"people"


In [39]:
# testing on a select output using LDA TF-IDF model
for index, score in sorted(lda_model_tfidf[bow_corpus[x]], key=lambda tup: -1*tup[1]):
    print("\nTopic No: {}\t\nScore: {}\t\nTopic Model: {}".format(index, score, lda_model_tfidf.print_topic(index, 10)))


Topic No: 3	
Score: 0.9741422533988953	
Topic Model: 0.001*"music" + 0.001*"rate" + 0.001*"fox" + 0.001*"pop" + 0.001*"government" + 0.001*"bank" + 0.001*"party" + 0.001*"election" + 0.001*"ice" + 0.001*"pension"

Topic No: 2	
Score: 0.023825105279684067	
Topic Model: 0.001*"fiat" + 0.001*"child" + 0.001*"film" + 0.001*"club" + 0.001*"hendrix" + 0.001*"coach" + 0.001*"rugby" + 0.001*"human" + 0.001*"project" + 0.001*"ufj"


In [44]:
print(lda_model[bow_corpus[x]], "\n",lda_model_tfidf[bow_corpus[x]])

[(2, 0.99730057)] 
 [(2, 0.023875587), (3, 0.9740918)]


In [None]:
'''
# print the 5 topics
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
'''

In [None]:
# LDA model
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5, 
                                   id2word = dic_bow,                                    
                                   passes = 10,
                                   workers = 4)

In [None]:
# Top keywords in each topic
topic= []
keywords = []

for i in range(len(df)):
    
    for index, score in sorted(lda_model[bow_corpus[i]], key=lambda tup: -1*tup[1]):
        topic.append(index)

        elements = lda_model.print_topic(index, 5).split("+")
        keywords.append([x.strip().replace('"', '').split("*")[1] for x in elements])

In [None]:
keywords[0]

In [None]:
len(bow_corpus), len(topic), len(keywords)

In [None]:
sorted(lda_model[bow_corpus[1]], key=lambda tup: -1*tup[1])

In [None]:
for topicID, value in topics[:1]:
    elements = value.split("+")[:5]
    for element in elements:
        word = element.split("*")[1]
        print(word)


        # print(f"The topic would be: {}")

In [None]:
print(topics)

### LDA Interactive

In [None]:
# !pip install pyldavis

In [None]:
# interacting with LDA output
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

vis = gensimvis.prepare(lda_model, bow_corpus, dic_bow)
vis

### Evaluate

### Top Keywords

### Add Keywords to DataFrame