## BBC News Classification

Completed as part of CU Boulder's Unsupervised Algorithms in Machine Learning course.

In [85]:
# Project dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import time
import re
import spacy
from collections import defaultdict
# Vectorizing word data
# importing all necessary modules
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec

In [86]:
# Data import
train = pd.read_csv("data/BBC News Train.csv")
test = pd.read_csv("data/BBC News Test.csv")

### Exploratory Data Analysis (EDA)

In [78]:

print(train.head(5))
print("\nTrain:", train.info())
print("\nTest:", test.info())

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB

Train: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Arti

In [79]:
# Check for any null/null-like values
null_like = [np.nan, None, [], {}, 'NaN', 'Null','NULL','None','NA','?','-', '.', '', ' ', '   ']

for df in [train, test]:
    for c in df.columns:
        string_null = np.array([x in null_like for x in df[c]])
        print(c, df[c].isnull().sum(), string_null.sum())

ArticleId 0 0
Text 0 0
Category 0 0
ArticleId 0 0
Text 0 0


In [87]:
# Top 20 most frequent words:
# ['the', '.', 'to', 'of', 'and', 'a', 'in', 's', 'for', 'is', 'that', 'it', 'on', 'said', 'was', 'he', 'be', 'with', 'has', 'as']
# So we need to do some cleaning!


# Using helpers from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(txt)

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in train['Text'])

t = time.time()

train['Clean Text'] = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Cleaned in: {} secs'.format(round((time.time() - t), 2)))

Cleaned in: 36.52 secs


In [19]:

# for installing missing corpora if needed
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [88]:
# iterate through each sentence in the file
data = []
word_freq = defaultdict(int)

for idx, article in train.iterrows():
    text = article['Clean Text']
    for i in sent_tokenize(text):
        temp = []
        # tokenize the sentence into words
        for j in word_tokenize(i):
            temp.append(j.lower())
            word_freq[j.lower()] += 1
        data.append(temp)

# With stop words, unique word count was 28,178
print(len(word_freq))
print(data[0],data[20])

18437


In [92]:
word_freq_sorted = sorted(word_freq, key=word_freq.get, reverse=True)
print("Top 20 most frequent:\n", word_freq_sorted[:20])

Top 20 most frequent:
 ['s', 'say', 'year', 'mr', 'new', 'people', 'm', 'good', 'win', 'time', 'game', 'film', 'world', 't', 'uk', 'come', 'government', 'play', 'go', 'work']


In [71]:

model1 = gensim.models.Word2Vec(data, min_count = 5, vector_size = 200, window = 3)

# Print results
print("Score 'lawyer' <-> 'defence': ", model1.wv.similarity('lawyer', 'defence'))

print("Score 'lawyer' <-> 'economy':", model1.wv.similarity('lawyer', 'economy'))

Score 'lawyer' <-> 'defence':  0.98409194
Score 'lawyer' <-> 'economy': 0.868916


In [90]:
model1.wv.most_similar(positive=["economy"])

[('economic', 0.993360698223114),
 ('spending', 0.9909805059432983),
 ('figure', 0.9868961572647095),
 ('taxis', 0.9825689196586609),
 ('cut', 0.9815003275871277),
 ('income', 0.9801750183105469),
 ('raise', 0.9791306257247925),
 ('bank', 0.9778202176094055),
 ('deficit', 0.977458119392395),
 ('china', 0.9764323830604553)]

In [94]:
lists = sorted(word_freq.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples

plt.plot(x, y, color="purple")
plt.title("Word token frequency")
plt.show()