In [1]:
import itertools
import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, NamedTuple
import json

import numpy as np
from numpy.linalg import norm, svd
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

File IO and processing

In [0]:
class Document(NamedTuple):
    doc_id: str
    category: str
    link: str
    date: str
    headline: List[str]
    short_description: List[str]

    def sections(self):
        return [self.headline, self.short_description]

    def __repr__(self):
        return (f"doc_id: {self.doc_id}\n" +
            f"  category: {self.category}\n" +
            f"  link: {self.link}\n" +
            f"  date: {self.date}" +
            f"  headline: {self.headline}" +
            f"  short_description: {self.short_description}")


def read_stopwords(file):
    with open(file) as f:
        return set([x.strip() for x in f.readlines()])

stopwords = read_stopwords('common_words')

stemmer = SnowballStemmer('english')


def read_docs(file):
    '''
    Reads the corpus into a list of Documents
    '''
    docs = []  # empty 0 index
    categories = set()
    with open(file) as f:
        i = 0
        for line in f:
            json_dict = json.loads(line)
            doc_id = i
            category = json_dict['category']
            categories.add(category)
            link = json_dict['link']
            date = json_dict['date']
            headline = []
            short_description = []

            ws = word_tokenize(json_dict['headline'])
            for word in ws:
                headline.append(word.lower())

            ws = word_tokenize(json_dict['short_description'])
            for word in ws:
                short_description.append(word.lower())
            docs.append(Document(doc_id, category, link, date, headline, short_description))
            i += 1

    return docs, categories


def stem_doc(doc: Document):
    return Document(doc.doc_id, doc.category, doc.link, doc.date, *[[stemmer.stem(word) for word in sec]
        for sec in doc.sections()])


def stem_docs(docs: List[Document]):
    return [stem_doc(doc) for doc in docs]


def remove_stopwords_doc(doc: Document):
    return Document(doc.doc_id, doc.category, doc.link, doc.date, *[[word for word in sec if word not in stopwords]
        for sec in doc.sections()])


def remove_stopwords(docs: List[Document]):
    return [remove_stopwords_doc(doc) for doc in docs]


def writelines(filename, data):
    with open(filename, 'w') as fout:
        for d in data:
            print(d, file=fout)


def process_docs(training_docs, testing_docs, stem):
    processed_traindocs = training_docs
    processed_devdocs = testing_docs

    if stem:
        processed_traindocs = stem_docs(processed_traindocs)
        processed_devdocs = stem_docs(processed_devdocs)
    return processed_traindocs, processed_devdocs

Position weighting

In [0]:
class TermWeights(NamedTuple):
    headline: float
    short_description: float

Term-Document Matrix

In [0]:
def compute_doc_freqs(docs: List[Document]):
    '''
    Computes document frequency, i.e. how many documents contain a specific word
    '''
    freq = Counter()
    for doc in docs:
        words = set()
        for sec in doc.sections():
            for word in sec:
                words.add(word)
        for word in words:
            freq[word] += 1
    return freq


def compute_tf(doc: Document, doc_freqs, weights, N):
    vec = defaultdict(float)
    for word in doc.headline:
        vec[word] += weights.headline
    for word in doc.short_description:
        vec[word] += weights.short_description
    return dict(vec)  # convert back to a regular dict


def compute_tfidf(doc: Document, doc_freqs, weights, N):
    tf = compute_tf(doc, doc_freqs, weights, N)
    vec = defaultdict(float)
    for k, v in tf.items():
        if doc_freqs[k] == 0:
            continue
        vec[k] = v * np.log(float(N) / (doc_freqs[k]))

    return dict(vec)  # TODO: implement

def compute_boolean(doc, doc_freqs, weights, N):
    vec = defaultdict(float)
    for word in doc.headline:
        vec[word] = 1 * weights.headline
    for word in doc.short_description:
        vec[word] = 1 * weights.short_description
    return dict(vec)  # TODO: implement

Vector Similarity

In [0]:
def dictdot(x: Dict[str, float], y: Dict[str, float]):
    '''
    Computes the dot product of vectors x and y, represented as sparse dictionaries.
    '''
    keys = list(x.keys()) if len(x) < len(y) else list(y.keys())
    return sum(x.get(key, 0) * y.get(key, 0) for key in keys)


def cosine_sim(x, y):
    '''
    Computes the cosine similarity between two sparse term vectors represented as dictionaries.
    '''
    num = dictdot(x, y)
    if num == 0:
        return 0
    return num / (norm(list(x.values())) * norm(list(y.values())))

Base Model

In [0]:
file = 'News_Category_Dataset.json'

term_funcs = {
    'tfidf': compute_tfidf,
    'tf': compute_tf,
    'boolean': compute_boolean
}

sim_funcs = {
    'cosine': cosine_sim
}

region_weights = {
    0: TermWeights(headline=1, short_description=1),
    1: TermWeights(headline=3, short_description=1),
    2: TermWeights(headline=1, short_description=3)
}

permutations = [
    term_funcs,
    [False, True],  # stem
    sim_funcs,
    region_weights
]

# permutations = [
#     ('tfidf', False, 'cosine', 0)
# ]

In [0]:
def sep_docs(file):
    docs = defaultdict(list)
    for d in file:
        docs[d.category].append(d)
    return docs

In [0]:
docs, categories = read_docs(file)
sep_doc = sep_docs(docs)

In [21]:
training_docs = []
testing_docs = []
for c in categories:
    print(c, len(sep_doc[c]))
    split = int(len(sep_doc[c]) * 0.9)
    training_docs += (sep_doc[c][:split])
    testing_docs += (sep_doc[c][split:])

TASTE 2096
HEALTHY LIVING 6694
WELLNESS 17827
STYLE & BEAUTY 9649
LATINO VOICES 1129
WORLD NEWS 2177
QUEER VOICES 6314
THE WORLDPOST 3664
COMEDY 5175
COLLEGE 1144
ENVIRONMENT 1323
STYLE 2254
GOOD NEWS 1398
HOME & LIVING 4195
MEDIA 2815
CULTURE & ARTS 1030
WOMEN 3490
PARENTING 8677
GREEN 2622
ENTERTAINMENT 16058
TECH 2082
FIFTY 1401
RELIGION 2556
IMPACT 3459
CRIME 3405
ARTS 1509
WEIRD NEWS 2670
DIVORCE 3426
TRAVEL 9887
WEDDINGS 3651
SCIENCE 2178
FOOD & DRINK 6226
PARENTS 3955
BLACK VOICES 4528
WORLDPOST 2579
BUSINESS 5937
POLITICS 32739
ARTS & CULTURE 1339
MONEY 1707
EDUCATION 1004
SPORTS 4884


In [22]:
import random
random.shuffle(training_docs)
random.shuffle(testing_docs)
print("Training data", len(training_docs))
print("Testing data",len(testing_docs))

Training data 180752
Testing data 20101


In [24]:
print('category', 'accuracy', 'precision', 'recall', 'F1_Score', sep='\t')

for term, stem, sim, weight in itertools.product(*permutations):

    processed_traindocs, processed_devdocs = process_docs(training_docs, testing_docs, stem)
    doc_freqs_train = compute_doc_freqs(processed_traindocs)
    doc_freqs_test = compute_doc_freqs(processed_devdocs)
    N_train = len(processed_traindocs)
    N_test = len(processed_devdocs)
    metrics = []

    # create Vprofile
    vprofile = defaultdict(lambda: defaultdict(float))
    train_vector_sense = defaultdict(list)

    for train_doc in processed_traindocs:
        train_vector = term_funcs[term](train_doc, doc_freqs_train, region_weights[weight], N_train)
        train_vector_sense[train_doc.category].append(train_vector)

    # do centriod
    for category, vecs in train_vector_sense.items():
        for doc in vecs:
            for word in doc:
                vprofile[category][word] += doc[word]

    for category, vec in vprofile.items():
        for word in vec:
            vec[word] /= len(vec)

    # calculate correctness
    correct_count = 0
    sims = defaultdict(float)
    tp = defaultdict(float)
    fn = defaultdict(float)
    fp = defaultdict(float)

    for test_doc in processed_devdocs:
        test_vector = term_funcs[term](test_doc, doc_freqs_test, region_weights[weight], N_test)
        for category, vec in vprofile.items():
            sims[category] = sim_funcs[sim](test_vector, vec)

        predict = max(sims, key=lambda key: sims[key])
        if predict == test_doc.category:
            correct_count += 1
            tp[predict] += 1
        else:
            fp[predict] += 1
            fn[test_doc.category] += 1

        metrics.append([
            test_doc.doc_id, predict, test_doc.category
        ])

    total_acc = correct_count / len(processed_devdocs)
    accuracy = defaultdict(float)
    precison = defaultdict(float)
    recall = defaultdict(float)
    f1_score = defaultdict(float)

    for category in categories:
        accuracy[category] = tp[category] / (tp[category] + fp[category]) if (tp[category] + fp[category]) != 0 else 0
        precison[category] = tp[category] / (tp[category] + fn[category]) if (tp[category] + fn[category]) != 0 else 0
        recall[category] = (tp[category] + N_test - fn[category]) / (tp[category] + fp[category] + N_test)
        f1_score[category] = 2 * precison[category] * recall[category] / (precison[category] + recall[category])
        print(category, accuracy[category], precison[category], recall[category], f1_score[category], sep='\t')

    print(term, stem, sim, weight, str(total_acc))

    break

category	accuracy	precision	recall	F1_Score
TASTE	0.21329639889196675	0.36666666666666664	0.9796207604339752	0.5336071207310462
HEALTHY LIVING	0.18547595682041218	0.282089552238806	0.9379261363636363	0.43373075659866855
WELLNESS	0.544189852700491	0.37296690970274815	0.921446325563945	0.5310035143942722
STYLE & BEAUTY	0.5722379603399433	0.627979274611399	0.9616257088846881	0.7597875212808368
LATINO VOICES	0.3655913978494624	0.3008849557522124	0.9931662870159453	0.46185001713555796
WORLD NEWS	0.16923076923076924	0.15137614678899083	0.9829030350808041	0.2623482410608431
QUEER VOICES	0.7753623188405797	0.5079113924050633	0.9803070923714355	0.6691344656234546
THE WORLDPOST	0.41586538461538464	0.4713896457765668	0.9787005897548374	0.6363042975139852
COMEDY	0.5568513119533528	0.3687258687258687	0.976570142829192	0.5353270524755461
COLLEGE	0.2524752475247525	0.4434782608695652	0.9894104319558686	0.6124439669975816
ENVIRONMENT	0.16929133858267717	0.3233082706766917	0.9852124785065095	0.48685103

In [29]:
selected_categories = ['QUEER VOICES', 'SPORTS', 'COMEDY', 'HEALTHY LIVING', 'TRAVEL']
training_docs = []
testing_docs = []
for c in selected_categories:
    print(c, len(sep_doc[c]))
    split = int(len(sep_doc[c]) * 0.9)
    training_docs += (sep_doc[c][:split])
    testing_docs += (sep_doc[c][split:])

QUEER VOICES 6314
SPORTS 4884
COMEDY 5175
HEALTHY LIVING 6694
TRAVEL 9887


In [30]:
import random
random.shuffle(training_docs)
random.shuffle(testing_docs)
print("Training data", len(training_docs))
print("Testing data",len(testing_docs))

Training data 29656
Testing data 3298


In [31]:
print('category', 'accuracy', 'precision', 'recall', 'F1_Score', sep='\t')

for term, stem, sim, weight in itertools.product(*permutations):

    processed_traindocs, processed_devdocs = process_docs(training_docs, testing_docs, stem)
    doc_freqs_train = compute_doc_freqs(processed_traindocs)
    doc_freqs_test = compute_doc_freqs(processed_devdocs)
    N_train = len(processed_traindocs)
    N_test = len(processed_devdocs)
    metrics = []

    # create Vprofile
    vprofile = defaultdict(lambda: defaultdict(float))
    train_vector_sense = defaultdict(list)

    for train_doc in processed_traindocs:
        train_vector = term_funcs[term](train_doc, doc_freqs_train, region_weights[weight], N_train)
        train_vector_sense[train_doc.category].append(train_vector)

    # do centriod
    for category, vecs in train_vector_sense.items():
        for doc in vecs:
            for word in doc:
                vprofile[category][word] += doc[word]

    for category, vec in vprofile.items():
        for word in vec:
            vec[word] /= len(vec)

    # calculate correctness
    correct_count = 0
    sims = defaultdict(float)
    tp = defaultdict(float)
    fn = defaultdict(float)
    fp = defaultdict(float)

    for test_doc in processed_devdocs:
        test_vector = term_funcs[term](test_doc, doc_freqs_test, region_weights[weight], N_test)
        for category, vec in vprofile.items():
            sims[category] = sim_funcs[sim](test_vector, vec)

        predict = max(sims, key=lambda key: sims[key])
        if predict == test_doc.category:
            correct_count += 1
            tp[predict] += 1
        else:
            fp[predict] += 1
            fn[test_doc.category] += 1

        metrics.append([
            test_doc.doc_id, predict, test_doc.category
        ])

    total_acc = correct_count / len(processed_devdocs)
    accuracy = defaultdict(float)
    precison = defaultdict(float)
    recall = defaultdict(float)
    f1_score = defaultdict(float)

    for category in selected_categories:
        accuracy[category] = tp[category] / (tp[category] + fp[category]) if (tp[category] + fp[category]) != 0 else 0
        precison[category] = tp[category] / (tp[category] + fn[category]) if (tp[category] + fn[category]) != 0 else 0
        recall[category] = (tp[category] + N_test - fn[category]) / (tp[category] + fp[category] + N_test)
        f1_score[category] = 2 * precison[category] * recall[category] / (precison[category] + recall[category])
        print(category, accuracy[category], precison[category], recall[category], f1_score[category], sep='\t')

    print(term, stem, sim, region_weights[weight], str(total_acc))

category	accuracy	precision	recall	F1_Score
QUEER VOICES	0.8393782383419689	0.7689873417721519	0.9383543977302038	0.8452703255073936
SPORTS	0.8123791102514507	0.8588957055214724	0.9564875491480996	0.9050684435200336
COMEDY	0.7901234567901234	0.6177606177606177	0.9235754793410748	0.7403298472528512
HEALTHY LIVING	0.7613168724279835	0.8283582089552238	0.9282344176806556	0.8754569364134407
TRAVEL	0.7921348314606742	0.8554095045500506	0.9163994502977554	0.8848547667674209
tfidf True cosine TermWeights(headline=1, short_description=1) 0.7965433596118859
QUEER VOICES	0.8862385321100917	0.7642405063291139	0.9450949778818631	0.8451001820265137
SPORTS	0.8501026694045175	0.8466257668711656	0.9608982826948481	0.9001498438342086
COMEDY	0.7967032967032966	0.5598455598455598	0.9175314036045876	0.6953890510480077
HEALTHY LIVING	0.7378516624040921	0.8611940298507462	0.9269607843137255	0.8928679855158446
TRAVEL	0.7589285714285714	0.8594539939332659	0.9074241738343142	0.8827878963254973
tfidf True cosin