# Document Tagging: BBC News Articles 

The corpus used in this project includes 2,225 documents from the BBC News website, corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) from 2004-2005.

The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this project, I will use only the text column.

More information on this data set, as well as a paper written using it, is available at http://mlg.ucd.ie/datasets/bbc.html.

## Data Prep

### Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

import random
random.seed(42)

### Load Data

In [2]:
# Read the articles, keep only the text column, and work with the first 100 rows.
df = pd.read_csv("data/BBC-articles.csv")[['text']].iloc[:100]
df.head(3)

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...


### Initial Prep

### Data Transformation

In [3]:
# transform data (options: TF-IDF, ...)
def vectorizeStep(inputData, fittingData=None, outputFormat="tfidf"):
    """Vectorize ``inputData`` with a vectorizer fitted on ``fittingData``.

    Args:
        inputData: Iterable of raw text documents to transform.
        fittingData: Iterable of raw text documents used to fit the
            vectorizer. When ``None`` (the default), the notebook's global
            ``df.text`` is used, looked up when the function is called.
        outputFormat: Name of the representation to produce. Only
            ``"tfidf"`` is implemented.

    Returns:
        The result of ``TfidfVectorizer.transform(inputData)``.

    Raises:
        ValueError: If ``outputFormat`` is not a supported option.
    """
    # Fail with a clear message instead of falling through to the return
    # statement with no result assigned.
    if outputFormat != "tfidf":
        raise ValueError(
            f"Unsupported outputFormat {outputFormat!r}; expected 'tfidf'."
        )

    # Resolve the default at call time so it follows the current ``df`` and
    # the definition itself does not require ``df`` to exist yet.
    if fittingData is None:
        fittingData = df.text

    # TF-IDF input
    vectorizer = TfidfVectorizer(
        strip_accents="unicode", lowercase=True,
        analyzer='word', stop_words='english'
    )
    vectorizer.fit(fittingData)

    return vectorizer.transform(inputData)

## Modeling

### LDA

In [4]:
def preprocess_text(df, colName: str):
    """Tokenize, stop-word filter and lemmatize one column of text.

    Args:
        df: Object indexable by ``colName`` whose column yields text strings
            when iterated (the notebook passes a DataFrame).
        colName: Name of the column holding the text.

    Returns:
        A list with one entry per document. Each entry is the list of
        lemmatized tokens that are not NLTK English stop words and are
        longer than two characters.
    """
    lem = WordNetLemmatizer()
    stop = set(stopwords.words('english'))

    return [
        [
            lem.lemmatize(w)
            for w in word_tokenize(txt)
            # Both filters look at the raw token, before lemmatization.
            if w not in stop and len(w) > 2
        ]
        for txt in df[colName]
    ]

In [5]:
# Whitespace-split every article into a plain list of token lists.
txt = df['text'].str.split().values.tolist()

In [6]:
# Tokenized, stop-word-filtered and lemmatized documents used by the topic model.
corpus = preprocess_text(df, colName="text")

In [7]:
# Bag-of-words model: build the token dictionary, then encode each document with it.
dic_bow = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic_bow.doc2bow, corpus))

In [13]:
# LDA model
# random.seed() only seeds Python's own generator; gensim takes its seed
# through random_state, so pass it explicitly to make runs repeatable.
# NOTE: with several workers, results may still vary slightly between runs.
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5, 
                                   id2word = dic_bow,                                    
                                   passes = 10,
                                   workers = 2,
                                   random_state = 42)

In [14]:
# Display each topic as (topic id, weighted top-terms string) for a quick sanity check.
lda_model.show_topics()

[(0,
  '0.012*"said" + 0.006*"year" + 0.005*"game" + 0.005*"could" + 0.004*"would" + 0.004*"england" + 0.004*"one" + 0.003*"time" + 0.003*"new" + 0.003*"three"'),
 (1,
  '0.012*"said" + 0.005*"howard" + 0.004*"year" + 0.003*"election" + 0.003*"country" + 0.003*"government" + 0.003*"would" + 0.003*"also" + 0.003*"international" + 0.003*"side"'),
 (2,
  '0.010*"said" + 0.008*"people" + 0.007*"party" + 0.006*"government" + 0.006*"music" + 0.005*"would" + 0.004*"also" + 0.004*"new" + 0.004*"year" + 0.003*"bbc"'),
 (3,
  '0.014*"said" + 0.006*"right" + 0.005*"would" + 0.004*"hague" + 0.004*"year" + 0.004*"human" + 0.004*"firm" + 0.004*"also" + 0.003*"break" + 0.003*"yukos"'),
 (4,
  '0.009*"said" + 0.006*"year" + 0.005*"film" + 0.004*"firm" + 0.004*"company" + 0.003*"new" + 0.003*"would" + 0.003*"west" + 0.003*"also" + 0.003*"people"')]

### LDA Interactive

In [None]:
# !pip install pyldavis

In [15]:
# Interactive exploration of the fitted LDA topics with pyLDAvis.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dic_bow,
)
vis

  default_term_info = default_term_info.sort_values(


### Evaluate

In [None]:
# Per-word likelihood bound of the fitted model on the training corpus
# (higher is better). Replaces a dangling attribute access that did not parse.
lda_model.log_perplexity(bow_corpus)

### Top Keywords

### Add Keywords to DataFrame