In [1]:
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

In [2]:
stop_words = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]

Ensure you get the data from here http://mlg.ucd.ie/datasets/bbc.html and put it into the data folder when unzipping

In [3]:
path_to_bbc = "./data/bbc"
path_to_bbcsport = "./data/bbcsport"

bbc_business = os.path.join(path_to_bbc, "business")
bbc_entertainment = os.path.join(path_to_bbc, "entertainment")
bbc_tech = os.path.join(path_to_bbc, "tech")
bbc_politics = os.path.join(path_to_bbc, "politics")
bbc_sport = os.path.join(path_to_bbc, "sport")

bbcsport_athletics = os.path.join(path_to_bbcsport, "athletics")
bbcsport_cricket = os.path.join(path_to_bbcsport, "cricket")
bbcsport_football = os.path.join(path_to_bbcsport, "football")
bbcsport_rugby = os.path.join(path_to_bbcsport, "rugby")
bbcsport_tennis = os.path.join(path_to_bbcsport, "tennis")

In [4]:
sentences = []
labels = []

for path in [bbc_business, bbc_entertainment, bbc_tech, bbc_politics, bbc_sport, bbcsport_athletics, bbcsport_cricket, bbcsport_football,bbcsport_rugby, bbcsport_tennis]:
    for f_name in os.listdir(path):
        with open(os.path.join(path, f_name), encoding='utf-8', errors='ignore') as f: 
            data = f.readlines() 
        single_file_data = [x for x in data if len(x)>5]
        sentences+=single_file_data
        labels.append(path.split("/")[-1])

In [5]:
print("\nNumber of sentences : {}\n\n".format(len(sentences)))
print("Possible labels : {}\n".format(len(sorted(set(labels)))))

for i, label in enumerate(sorted(set(labels))):
    print("Label {:2d} ---> {}".format(i+1, label))


Number of sentences : 16953


Possible labels : 10

Label  1 ---> athletics
Label  2 ---> business
Label  3 ---> cricket
Label  4 ---> entertainment
Label  5 ---> football
Label  6 ---> politics
Label  7 ---> rugby
Label  8 ---> sport
Label  9 ---> tech
Label 10 ---> tennis


In [6]:
def remove_stop_words(list_of_sentences, stop_word_array):
    for stop_word in stop_word_array:
        list_of_sentences = [words.replace(" "+stop_word+" ", " ") for words in list_of_sentences]
        list_of_sentences = [words.replace("  ", " ") for words in list_of_sentences]
    
    return list_of_sentences

In [7]:
#Check
for stop_word in stop_words:
    print("{} in corpus : {}".format(stop_word, stop_word in sentences))

a in corpus : False
about in corpus : False
above in corpus : False
after in corpus : False
again in corpus : False
against in corpus : False
all in corpus : False
am in corpus : False
an in corpus : False
and in corpus : False
any in corpus : False
are in corpus : False
as in corpus : False
at in corpus : False
be in corpus : False
because in corpus : False
been in corpus : False
before in corpus : False
being in corpus : False
below in corpus : False
between in corpus : False
both in corpus : False
but in corpus : False
by in corpus : False
could in corpus : False
did in corpus : False
do in corpus : False
does in corpus : False
doing in corpus : False
down in corpus : False
during in corpus : False
each in corpus : False
few in corpus : False
for in corpus : False
from in corpus : False
further in corpus : False
had in corpus : False
has in corpus : False
have in corpus : False
having in corpus : False
he in corpus : False
he'd in corpus : False
he'll in corpus : False
he's in corpu

In [8]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [9]:
word_index = tokenizer.word_index
print("\nNumber of words in the word index :\n\n{}\n\n".format(len(word_index)))
print("Words in order of commonality :\n\n{}\n".format(list(word_index)[0:100]))


Number of words in the word index :

33915


Words in order of commonality :

['<OOV>', 'the', 'to', 'of', 'and', 'a', 'in', 'for', 'is', 'on', 'that', 'it', 'said', 'he', 'was', 'with', 'be', 'but', 'have', 'has', 'at', 'as', 'will', 'by', 'i', 'are', 'his', 'from', 'not', 'we', 'they', 'an', 'this', 'had', 'been', 'their', 'would', 'mr', 'which', 'up', 'who', 'were', 'more', 'year', 'after', 'also', 'one', 'out', 'new', 'its', 'there', 'us', 'all', 'about', 'first', 'over', 'people', 'if', 'when', 'can', 'last', 'you', 'or', 'two', 'time', 'could', 'than', 'against', 'world', 'now', 'game', 'so', 'into', 'some', 'she', 'what', 'just', 'back', 'only', 'other', 'them', 'no', "it's", 'before', 'three', 'do', 'years', 'very', 'best', 'get', 'england', 'made', 'make', 'win', 'told', 'like', 'her', 'my', 'being', 'off']



In [10]:
sequences = tokenizer.texts_to_sequences(sentences)

In [11]:
padded = pad_sequences(sequences, padding="post")

In [12]:
def see_padded_seq(idx):
    print("\nOriginal sentence : \n\n    {}\n\n".format(sentences[idx]))
    print("Padded tokenized version : \n\n    {}\n\n".format(padded[idx]))

In [13]:
see_padded_seq(10)


Original sentence : 

    The budget deficit hit a record $412bn in the 12 months to 30 September 2004, after reaching $377bn in the previous fiscal year. The CBO also forecast a total shortfall of $855bn for the years from 2006 to 2015, an improvement on previous projections. However, analysts say the new figures fail to take into account the potential $2-$3.8 trillion costs of the president's plan to revamp state pensions and extend tax cuts. The figure could also be worsened by any further military costs. Republicans have blamed the size of the deficit on slow economic conditions after the 11 September attacks and ongoing military operations in Iraq and Afghanistan. One of President George W Bush's election pledges was to halve the budget deficit within five years. But Democrats have accused the president of excluding Iraq-related costs from previous budgets to meet the aim of reducing the deficit, a charge which the administration denies. On Tuesday, the US administration asked Co

In [14]:
print("\nShape of Padded Sequence Matrix : \n\n{}\n    {}".format("(Examples, Padded Size)",padded.shape))


Shape of Padded Sequence Matrix : 

(Examples, Padded Size)
    (16953, 450)


In [15]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

In [16]:
label_word_index = label_tokenizer.word_index
print("\nNumber of words in the word index :\n\n{}\n\n".format(len(label_word_index)))
print("Words in order of commonality :\n\n{}\n".format(list(label_word_index)[0:100]))


Number of words in the word index :

10


Words in order of commonality :

['sport', 'business', 'politics', 'tech', 'entertainment', 'football', 'rugby', 'cricket', 'athletics', 'tennis']



In [17]:
label_sequences = label_tokenizer.texts_to_sequences(labels)

In [18]:
label_padded = pad_sequences(label_sequences, padding="post")

In [22]:
def see_padded_label_seq(idx):
    print("\nOriginal label : \n\n    {}\n\n".format(labels[idx]))
    print("Padded tokenized version : \n\n    {}\n\n".format(label_padded[idx]))

In [23]:
see_padded_label_seq(10)


Original label : 

    business


Padded tokenized version : 

    [2]




In [21]:
print("\nShape of Padded Sequence Matrix : \n\n{}\n    {}".format("(Examples, Padded Size)",label_padded.shape))


Shape of Padded Sequence Matrix : 

(Examples, Padded Size)
    (2962, 1)
