# Document Tagging: BBC News Articles 

The corpus used in this project includes 2,225 documents from the BBC news website, corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) from 2004-2005.

The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this project, I will use only the text column.

More information on this data set, as well as a paper written using it, is available at http://mlg.ucd.ie/datasets/bbc.html.

## Data Prep

### Import Libraries

In [None]:
import pprint
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

import random
# Seeds Python's built-in `random` module only.
# NOTE(review): libraries that keep their own random state (e.g. NumPy-based
# ones) are presumably not affected by this call - confirm before relying on
# it for reproducible modeling results.
random.seed(42)

### Load Data

In [None]:
# Read the articles CSV, keep only the 'text' column, and restrict the frame
# to its first 100 rows.
# NOTE(review): presumably the 100-row cap is there to keep experimentation
# fast - confirm before running on the full corpus.
df = pd.read_csv("data/BBC-articles.csv")[['text']].head(100)

# Preview the first three rows.
df.head(3)

### Initial Prep

### Data Transformation

In [None]:
# transform data (options: TF-IDF, ...)
def vectorizeStep(inputData, fittingData=None, outputFormat="tfidf"):
    """Vectorize ``inputData`` with a vectorizer fitted on ``fittingData``.

    Parameters
    ----------
    inputData : iterable of str
        Documents to transform.
    fittingData : iterable of str, optional
        Documents used to fit the vectorizer (vocabulary and IDF weights).
        When omitted, the notebook-level ``df.text`` column is used; it is
        looked up at call time so it always reflects the current ``df``.
    outputFormat : str
        Representation to produce. Only ``"tfidf"`` is supported.

    Returns
    -------
    The result of ``TfidfVectorizer.transform(inputData)``.

    Raises
    ------
    ValueError
        If ``outputFormat`` is not a supported option.
    """
    # Fail early with a clear message instead of falling through to an
    # undefined result variable.
    if outputFormat != "tfidf":
        raise ValueError(
            f"Unsupported outputFormat {outputFormat!r}; expected 'tfidf'."
        )

    # Resolve the default lazily: a default of `df.text` in the signature
    # would be frozen at definition time and go stale once `df` is reassigned.
    if fittingData is None:
        fittingData = df.text

    # TF-IDF input
    vectorizer = TfidfVectorizer(
        strip_accents="unicode",
        lowercase=True,
        analyzer='word',
        stop_words='english',
    )
    vectorizer.fit(fittingData)

    return vectorizer.transform(inputData)

## Modeling

### LDA

In [None]:
def preprocess_text(df, colName: str):
    """Tokenize, filter and lemmatize every document in one DataFrame column.

    Each text in ``df[colName]`` is split with NLTK's ``word_tokenize``.
    Tokens found in NLTK's English stop-word list, and tokens of two
    characters or fewer, are dropped; the remaining tokens are lemmatized
    with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : DataFrame-like
        Object supporting ``df[colName]`` and yielding strings on iteration.
    colName : str
        Name of the column holding the raw text.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per row, in row order.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the set once so each membership test is O(1).
    # NOTE(review): the comparison is an exact, case-sensitive match, so this
    # presumably relies on the text already being lowercased - confirm.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for text in df[colName]:
        kept = (
            token
            for token in word_tokenize(text)
            if token not in stop_words and len(token) > 2
        )
        corpus.append([lemmatizer.lemmatize(token) for token in kept])

    return corpus

In [None]:
# Whitespace-split every article into a list of tokens (one list per row).
# NOTE(review): `txt` does not appear to be read by the later cells shown
# here - confirm whether this cell is still needed.
txt = df['text'].str.split().values.tolist()

In [None]:
# Build one list of cleaned tokens per article; this is the input for the
# dictionary and bag-of-words corpus built in the next cell.
corpus = preprocess_text(df, "text")

In [None]:
# Bag-of-words representation
# Build the gensim dictionary from the tokenized articles ...
dic_bow = gensim.corpora.Dictionary(corpus)
# ... then convert every article to its bag-of-words form, keeping the
# articles in their original order.
bow_corpus = list(map(dic_bow.doc2bow, corpus))

In [None]:
# LDA model
# Five topics are requested, matching the five categories of the data set.
# `random_state` is passed explicitly because gensim keeps its own NumPy-based
# random state: seeding Python's `random` module does not make training
# repeatable.
# NOTE(review): with several workers, results may presumably still differ
# slightly between runs - confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5, 
                                   id2word = dic_bow,                                    
                                   passes = 10,
                                   workers = 4,
                                   random_state = 42)

In [None]:
# Display a single topic from the fitted model as a quick sanity check.
lda_model.print_topics(1)

In [None]:
# Collect the model's topics; each entry is unpacked further below as a
# (topic id, formatted "weight*word + ..." string) pair.
topics = lda_model.show_topics()
# Inspect the first entry.
topics[0]

In [None]:
# Dominant topic and its top keywords for each document.
# `topic[i]` and `keywords[i]` describe the i-th entry of `bow_corpus`, so
# both lists have exactly one element per document.
topic = []
keywords = []

for doc_bow in bow_corpus:
    doc_topics = lda_model[doc_bow]

    # Keep the lists aligned with the documents even if the model reports no
    # topic for one of them.
    if not doc_topics:
        topic.append(None)
        keywords.append([])
        continue

    # Only the highest-scoring topic is kept; `max` returns the first maximum,
    # which matches the first element of a stable sort by descending score.
    index, _ = max(doc_topics, key=lambda tup: tup[1])
    topic.append(index)

    # `show_topic` yields (word, weight) pairs directly, which avoids parsing
    # the formatted 'weight*"word" + ...' string by hand.
    keywords.append([word for word, _ in lda_model.show_topic(index, topn=5)])

In [None]:
# Inspect the first collected keyword list.
keywords[0]

In [None]:
# Compare the number of documents with the number of collected topic ids and
# keyword lists.
len(bow_corpus), len(topic), len(keywords)

In [None]:
# Topic scores of the document at index 1, highest score first.
sorted(lda_model[bow_corpus[1]], key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the first five words of the first topic.
for topicID, value in topics[:1]:
    # `value` is a formatted string of 'weight*"word"' terms joined by "+".
    for element in value.split("+")[:5]:
        # Drop the surrounding whitespace and quotes so the bare word is
        # printed, using the same clean-up as the keyword extraction above.
        word = element.strip().replace('"', '').split("*")[1]
        print(word)

    # TODO: print a human-readable label for the topic
    # (e.g. "The topic would be: ...").

In [None]:
# Dump every collected (topic id, formatted word string) pair.
print(topics)

### LDA Interactive

In [None]:
# !pip install pyldavis

In [None]:
# Interactive exploration of the fitted LDA model
import pyLDAvis
from pyLDAvis import gensim_models as gensimvis

# Switch pyLDAvis to notebook mode before preparing the visualization.
pyLDAvis.enable_notebook()

# Build the visualization from the model, the bag-of-words corpus and the
# dictionary; leaving `vis` as the last expression displays it in the cell.
vis = gensimvis.prepare(lda_model, bow_corpus, dic_bow)
vis

### Evaluate

### Top Keywords

### Add Keywords to DataFrame