# Getting the dataset and evaluation baseline ready

In [1]:
import sys
# Put the parent directory on the module search path so that the
# `Chapter01.tokenization` import further down can be resolved.
# NOTE(review): presumably this notebook lives in a sibling chapter folder
# of Chapter01 — confirm the repository layout.
sys.path.append('..')

In [2]:
import re
import csv
import nltk
import math
import string
import numpy as np
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier
from sklearn import preprocessing
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from Chapter01.tokenization import tokenize_nltk

In [3]:
# Shared English Snowball stemmer used by tokenize_and_stem and get_stopwrods.
stemmer = SnowballStemmer('english')
# File name of the BBC news CSV (a relative path).
bbc_dataset = 'bbc-text.csv'

In [4]:
def tokenize_and_stem(sentence):
    """Tokenize *sentence* with NLTK and return the stems of its word tokens.

    Tokens that consist solely of punctuation characters are discarded
    before stemming with the module-level Snowball ``stemmer``.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    punctuation = set(string.punctuation)
    # Check character by character: ``token in string.punctuation`` is a
    # substring test against one long string, so multi-character tokens
    # such as '...', '--', '``' or "''" would slip through the filter.
    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(sentence)
        if not all(char in punctuation for char in token)
    ]

In [5]:
# English stop word list taken from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

In [6]:
def get_stopwrods(stop_words):
    """Return the given stop words followed by their stemmed forms.

    Args:
        stop_words: List of stop word strings.

    Returns:
        A new list: the original entries, then one stem per entry produced
        by the module-level ``stemmer``. Duplicates are not removed.
    """
    return stop_words + [stemmer.stem(entry) for entry in stop_words]

In [7]:
# Extend the list with stemmed variants; get_stats filters tokens against it.
stop_words = get_stopwrods(stop_words)

In [8]:
def read_in_csv(csv_file):
    """Read a comma-separated file and return all of its rows.

    Args:
        csv_file: Path of the CSV file to read; it is decoded as UTF-8.

    Returns:
        A list of rows, each a list of string fields. Every row in the
        file is returned, including the first one.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        return list(reader)

In [9]:
def get_data(filename):
    """Group the texts of a (category, text) CSV file by category.

    The first row of the file is skipped as a header.

    Args:
        filename: Path of the CSV file; column 0 holds the category and
            column 1 the text.

    Returns:
        A dict mapping each category to the list of its texts, in the
        order they appear in the file.
    """
    data_dict = {}
    for row in read_in_csv(filename)[1:]:
        # csv.reader yields [] for a blank line; skip those instead of
        # failing with IndexError on row[0].
        if not row:
            continue
        category, text = row[0], row[1]
        data_dict.setdefault(category, []).append(text)
    return data_dict

In [10]:
def get_stats(text, num_words=200):
    """Print and return the word frequencies of *text*, ignoring stop words.

    A token is counted only if it is not in the module-level ``stop_words``
    and contains at least one ASCII letter.

    Args:
        text: The text to analyse.
        num_words: How many of the most common words to print.

    Returns:
        The ``FreqDist`` built from the retained tokens.
    """
    # Build the lookup set once; testing membership in the stop word list
    # directly would be a linear scan for every token.
    excluded = set(stop_words)
    word_list = [
        word for word in tokenize_nltk(text)
        if word not in excluded and re.search("[A-Za-z]", word)
    ]
    freq_list = FreqDist(word_list)
    print(freq_list.most_common(num_words))
    return freq_list

In [11]:
# Load the articles grouped by category, using the dataset path defined
# once in `bbc_dataset` rather than repeating the file name here.
data_dict = get_data(bbc_dataset)

In [12]:
# Report how many articles each category contains.
for topic, articles in data_dict.items():
    print(topic, ':', len(articles))

tech : 401
business : 510
sport : 511
entertainment : 386
politics : 417


In [13]:
# Pick the two categories used for the binary classifier. Note that the
# dataset spells the category "sport", while the class labels used later in
# this notebook are 'business' and 'sports'.
business_data = data_dict["business"]
sports_data = data_dict["sport"]

In [14]:
# Concatenate each category's articles into one string for word statistics.
business_string = ' '.join(business_data)
sports_string = ' '.join(sports_data)

In [15]:
# Print the most frequent non-stop-words of each category.
get_stats(business_string)
get_stats(sports_string)

[('said', 1680), ('us', 813), ('year', 637), ('mr', 600), ('would', 463), ('also', 440), ('market', 425), ('new', 416), ('company', 415), ('growth', 384), ('last', 365), ('firm', 362), ('economy', 359), ('government', 340), ('bank', 335), ('sales', 316), ('could', 311), ('economic', 310), ('oil', 294), ('shares', 265), ('however', 256), ('world', 252), ('may', 251), ('years', 247), ('prices', 246), ('one', 243), ('chief', 236), ('two', 231), ('china', 223), ('business', 218), ('companies', 212), ('analysts', 209), ('uk', 207), ('deal', 206), ('rise', 203), ('expected', 200), ('group', 199), ('financial', 197), ('yukos', 196), ('firms', 193), ('since', 183), ('dollar', 180), ('december', 173), ('country', 173), ('months', 170), ('people', 170), ('stock', 168), ('first', 165), ('president', 165), ('three', 164), ('still', 164), ('many', 163), ('time', 159), ('european', 159), ('rate', 159), ('state', 158), ('trade', 158), ('told', 155), ('investment', 153), ('demand', 151), ('interest', 

FreqDist({'said': 941, 'game': 476, 'england': 459, 'first': 437, 'win': 415, 'would': 396, 'world': 379, 'last': 376, 'one': 355, 'two': 351, ...})

In [16]:
def create_vectorizer(text_list):
    """Fit a TF-IDF vectorizer (unigrams to trigrams) on *text_list*.

    Args:
        text_list: Iterable of raw document strings from which the
            vocabulary and IDF weights are learned.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    # NOTE(review): the built-in 'english' stop list is unstemmed while
    # tokenize_and_stem emits stems; the stemmed ``stop_words`` list built
    # earlier in this file may have been intended here — confirm.
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.9,
        max_features=20000,
        min_df=0.05,
        stop_words='english',
        use_idf=True,
        tokenizer=tokenize_and_stem,
        ngram_range=(1, 3),
    )
    # Only the fitted state is needed, so fit() is used instead of
    # fit_transform(), whose document matrix would be thrown away.
    tfidf_vectorizer.fit(text_list)
    return tfidf_vectorizer

In [17]:
def split_train_test(data, train_percentage):
    """Cut *data* into a leading training part and a trailing test part.

    Args:
        data: Sliceable sequence of samples.
        train_percentage: Fraction of the samples assigned to training;
            the resulting count is rounded up.

    Returns:
        A ``(train_data, test_data)`` tuple of slices of *data*.
    """
    cutoff = math.ceil(train_percentage * len(data))
    return data[:cutoff], data[cutoff:]

In [18]:
def get_labels(names):
    """Create a ``LabelEncoder`` fitted on the given class names.

    Args:
        names: Class names to encode.

    Returns:
        The fitted encoder.
    """
    return preprocessing.LabelEncoder().fit(names)
    

In [19]:
# Use the first 80% of each category for training and the rest for testing.
business_train_data, business_test_data = split_train_test(business_data, 0.8)
sports_train_data, sports_test_data = split_train_test(sports_data, 0.8)

In [20]:
# Fit the TF-IDF vectorizer on the training articles of both categories only.
train_data = business_train_data + sports_train_data
tfidf_vec = create_vectorizer(train_data)



In [21]:
# Encoder for the two class names used as labels in this notebook.
le = get_labels(['business', 'sports'])

In [22]:
def create_data_matrix(input_data, vectorizer, label, le):
    """Vectorize documents of a single class and encode their shared label.

    Args:
        input_data: Sequence of raw document strings, all of class *label*.
        vectorizer: Fitted vectorizer whose ``transform`` returns a SciPy
            sparse matrix.
        label: Class name shared by every document in *input_data*.
        le: Fitted label encoder able to transform *label*.

    Returns:
        A ``(vectors, enc_labels)`` tuple: a dense 2-D ``numpy.ndarray``
        with one row per document, and the encoded label repeated once
        per document.
    """
    # toarray() yields a plain ndarray; todense() would return the legacy
    # np.matrix type, which NumPy discourages and recent scikit-learn
    # input validation rejects.
    vectors = vectorizer.transform(input_data).toarray()
    enc_labels = le.transform([label] * len(input_data))
    return vectors, enc_labels

In [23]:
def create_dataset(vectorizer, data_dict, le):
    """Build the feature matrix and label vector for both news classes.

    Args:
        vectorizer: Fitted vectorizer passed through to create_data_matrix.
        data_dict: Dict with the keys 'business' and 'sports', each mapping
            to the documents of that class.
        le: Fitted label encoder for the two class names.

    Returns:
        A ``(all_data_matrix, labels)`` tuple with the business rows
        stacked above the sports rows and the labels in the same order.
    """
    # Look up both classes before vectorizing anything, so a missing key
    # fails up front.
    per_class = [(category, data_dict[category])
                 for category in ('business', 'sports')]

    matrices = []
    encoded = []
    for category, documents in per_class:
        class_vectors, class_labels = create_data_matrix(
            documents, vectorizer, category, le)
        matrices.append(class_vectors)
        encoded.append(class_labels)

    return np.vstack(matrices), np.concatenate(encoded)
    

In [24]:
# Per-class article lists for each split, keyed by the class label names.
train_data_dict = {
    'business': business_train_data,
    'sports': sports_train_data,
}
test_data_dict = {
    'business': business_test_data,
    'sports': sports_test_data,
}

In [25]:
# Vectorize both splits with the vectorizer fitted on the training data.
X_train, y_train = create_dataset(tfidf_vec, train_data_dict, le)
X_test, y_test = create_dataset(tfidf_vec, test_data_dict, le)

In [26]:
def predict_trivial(X_train, y_train, X_test, y_test, le):
    """Fit a uniform-random dummy classifier and print its test metrics.

    Prints the score on the test set followed by a per-class
    classification report. Nothing is returned.

    Args:
        X_train: Training feature matrix.
        y_train: Encoded training labels.
        X_test: Test feature matrix.
        y_test: Encoded test labels.
        le: Fitted label encoder; its classes name the report rows.
    """
    baseline = DummyClassifier(strategy='uniform', random_state=0)
    baseline.fit(X_train, y_train)
    predictions = baseline.predict(X_test)

    accuracy = baseline.score(X_test, y_test)
    print(accuracy)

    report = classification_report(
        y_test,
        predictions,
        labels=le.transform(le.classes_),
        target_names=le.classes_,
    )
    print(report)

In [27]:
# Baseline for comparison: uniform random guessing between the two classes.
predict_trivial(X_train, y_train, X_test, y_test, le)

0.44607843137254904
              precision    recall  f1-score   support

    business       0.45      0.44      0.44       102
      sports       0.45      0.45      0.45       102

    accuracy                           0.45       204
   macro avg       0.45      0.45      0.45       204
weighted avg       0.45      0.45      0.45       204



# Performing rule-based text classification using keywords