In [1]:
import itertools
import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, NamedTuple
import json

import numpy as np
from numpy.linalg import norm, svd
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

File IO and processing

In [0]:
class Document(NamedTuple):
    doc_id: str
    category: str
    link: str
    date: str
    headline: List[str]
    short_description: List[str]

    def sections(self):
        return [self.headline, self.short_description]

    def __repr__(self):
        return (f"doc_id: {self.doc_id}\n" +
            f"  category: {self.category}\n" +
            f"  link: {self.link}\n" +
            f"  date: {self.date}" +
            f"  headline: {self.headline}" +
            f"  short_description: {self.short_description}")


def read_stopwords(file):
    with open(file) as f:
        return set([x.strip() for x in f.readlines()])

stopwords = read_stopwords('common_words')

stemmer = SnowballStemmer('english')


def read_docs(file):
    '''
    Reads the corpus into a list of Documents
    '''
    docs = []  # empty 0 index
    categories = set()
    with open(file) as f:
        i = 0
        for line in f:
            json_dict = json.loads(line)
            doc_id = i
            category = json_dict['category']
            categories.add(category)
            link = json_dict['link']
            date = json_dict['date']
            headline = []
            short_description = []

            ws = word_tokenize(json_dict['headline'])
            for word in ws:
                headline.append(word.lower())

            ws = word_tokenize(json_dict['short_description'])
            for word in ws:
                short_description.append(word.lower())
            docs.append(Document(doc_id, category, link, date, headline, short_description))
            i += 1

    return docs, categories


def stem_doc(doc: Document):
    return Document(doc.doc_id, doc.category, doc.link, doc.date, *[[stemmer.stem(word) for word in sec]
        for sec in doc.sections()])


def stem_docs(docs: List[Document]):
    return [stem_doc(doc) for doc in docs]


def remove_stopwords_doc(doc: Document):
    return Document(doc.doc_id, doc.category, doc.link, doc.date, *[[word for word in sec if word not in stopwords]
        for sec in doc.sections()])


def remove_stopwords(docs: List[Document]):
    return [remove_stopwords_doc(doc) for doc in docs]


def writelines(filename, data):
    with open(filename, 'w') as fout:
        for d in data:
            print(d, file=fout)


def process_docs(training_docs, testing_docs, stem):
    processed_traindocs = training_docs
    processed_devdocs = testing_docs

    if stem:
        processed_traindocs = stem_docs(processed_traindocs)
        processed_devdocs = stem_docs(processed_devdocs)
    return processed_traindocs, processed_devdocs

Position weighting

In [0]:
class TermWeights(NamedTuple):
    headline: float
    short_description: float

Term-Document Matrix

In [0]:
def compute_doc_freqs(docs: List[Document]):
    '''
    Computes document frequency, i.e. how many documents contain a specific word
    '''
    freq = Counter()
    for doc in docs:
        words = set()
        for sec in doc.sections():
            for word in sec:
                words.add(word)
        for word in words:
            freq[word] += 1
    return freq


def compute_tf(doc: Document, doc_freqs, weights, N):
    vec = defaultdict(float)
    for word in doc.headline:
        vec[word] += weights.headline
    for word in doc.short_description:
        vec[word] += weights.short_description
    return dict(vec)  # convert back to a regular dict


def compute_tfidf(doc: Document, doc_freqs, weights, N):
    tf = compute_tf(doc, doc_freqs, weights, N)
    vec = defaultdict(float)
    for k, v in tf.items():
        if doc_freqs[k] == 0:
            continue
        vec[k] = v * np.log(float(N) / (doc_freqs[k]))

    return dict(vec)  # TODO: implement

def compute_boolean(doc, doc_freqs, weights, N):
    vec = defaultdict(float)
    for word in doc.headline:
        vec[word] = 1 * weights.headline
    for word in doc.short_description:
        vec[word] = 1 * weights.short_description
    return dict(vec)  # TODO: implement

Vector Similarity

In [0]:
def dictdot(x: Dict[str, float], y: Dict[str, float]):
    '''
    Computes the dot product of vectors x and y, represented as sparse dictionaries.
    '''
    keys = list(x.keys()) if len(x) < len(y) else list(y.keys())
    return sum(x.get(key, 0) * y.get(key, 0) for key in keys)


def cosine_sim(x, y):
    '''
    Computes the cosine similarity between two sparse term vectors represented as dictionaries.
    '''
    num = dictdot(x, y)
    if num == 0:
        return 0
    return num / (norm(list(x.values())) * norm(list(y.values())))

Base Model

In [0]:
file = 'News_Category_Dataset.json'

term_funcs = {
    'tfidf': compute_tfidf,
    'tf': compute_tf,
    'boolean': compute_boolean
}

sim_funcs = {
    'cosine': cosine_sim
}

region_weights = {
    0: TermWeights(headline=1, short_description=1),
    1: TermWeights(headline=3, short_description=1),
    2: TermWeights(headline=1, short_description=3)
}

permutations = [
    term_funcs,
    [True, False],  # stem
    sim_funcs,
    region_weights
]

# permutations = [
#     ('tfidf', False, 'cosine', 0)
# ]

In [0]:
docs, categories = read_docs(file)
split = int(len(docs) * 0.9)

training_docs = docs[:split]
testing_docs = docs[split:]

In [8]:
print('category', 'accuracy', 'precision', 'recall', 'F1_Score', sep='\t')

for term, stem, sim, weight in itertools.product(*permutations):

    processed_traindocs, processed_devdocs = process_docs(training_docs, testing_docs, stem)
    doc_freqs_train = compute_doc_freqs(processed_traindocs)
    doc_freqs_test = compute_doc_freqs(processed_devdocs)
    N_train = len(processed_traindocs)
    N_test = len(processed_devdocs)
    metrics = []

    # create Vprofile
    vprofile = defaultdict(lambda: defaultdict(float))
    train_vector_sense = defaultdict(list)

    for train_doc in processed_traindocs:
        train_vector = term_funcs[term](train_doc, doc_freqs_train, region_weights[weight], N_train)
        train_vector_sense[train_doc.category].append(train_vector)

    # do centriod
    for category, vecs in train_vector_sense.items():
        for doc in vecs:
            for word in doc:
                vprofile[category][word] += doc[word]

    for category, vec in vprofile.items():
        for word in vec:
            vec[word] /= len(vec)

    # calculate correctness
    correct_count = 0
    sims = defaultdict(float)
    tp = defaultdict(float)
    fn = defaultdict(float)
    fp = defaultdict(float)

    for test_doc in processed_devdocs:
        test_vector = term_funcs[term](test_doc, doc_freqs_test, region_weights[weight], N_test)
        for category, vec in vprofile.items():
            sims[category] = sim_funcs[sim](test_vector, vec)

        predict = max(sims, key=lambda key: sims[key])
        if predict == test_doc.category:
            correct_count += 1
            tp[predict] += 1
        else:
            fp[predict] += 1
            fn[test_doc.category] += 1

        metrics.append([
            test_doc.doc_id, predict, test_doc.category
        ])

    total_acc = correct_count / len(processed_devdocs)
    accuracy = defaultdict(float)
    precison = defaultdict(float)
    recall = defaultdict(float)
    f1_score = defaultdict(float)

    for category in categories:
        accuracy[category] = tp[category] / (tp[category] + fp[category]) if (tp[category] + fp[category]) != 0 else 0
        precison[category] = tp[category] / (tp[category] + fn[category]) if (tp[category] + fn[category]) != 0 else 0
        recall[category] = (tp[category] + N_test - fn[category]) / (tp[category] + fp[category] + N_test)
        f1_score[category] = 2 * precison[category] * recall[category] / (precison[category] + recall[category])
        print(category, accuracy[category], precison[category], recall[category], f1_score[category], sep='\t')

    print(term, stem, sim, weight, str(total_acc))

    break

category	accuracy	precision	recall	F1_Score
TASTE	0.0	0	0.974291812184711	0.0
HEALTHY LIVING	0.0	0	0.9345803089521683	0.0
WELLNESS	0.8335396039603961	0.29741664826672554	0.8409823979356741	0.4394279262262694
STYLE & BEAUTY	0.9157846295444086	0.5243741765480896	0.9106878116716833	0.6655338588267137
LATINO VOICES	0.0	0	0.9994029256642452	0.0
WORLD NEWS	0.0	0	0.9950953678474115	0.0
QUEER VOICES	0.8167539267015707	0.5492957746478874	0.9919613355032796	0.707059408345678
THE WORLDPOST	0.0	0	0.9960823208529631	0.0
COMEDY	0.4336569579288026	0.3952802359882006	0.9813679823486149	0.563564987027232
COLLEGE	0.0	0	0.9928327813751174	0.0
ENVIRONMENT	0.34274193548387094	0.2750809061488673	0.9809678371200944	0.42967364600134333
STYLE	0.0	0	0.9762807426849421	0.0
GOOD NEWS	0.0	0	0.9905804606204074	0.0
HOME & LIVING	0.6114221724524076	0.5465465465465466	0.9618666285332952	0.6970303531529634
MEDIA	0.0	0	0.9913627165490351	0.0
CULTURE & ARTS	0.3624454148471616	0.4256410256410256	0.9873000246123554	0.59483

In [0]:
def sep_docs(file):
    docs = defaultdict(list)
    for d in file:
        docs[d.category].append(d)
    return docs

In [0]:
docs, categories = read_docs(file)
sep_doc = sep_docs(docs)

In [16]:
selected_categories = ['QUEER VOICES', 'SPORTS', 'COMEDY', 'HEALTHY LIVING', 'TRAVEL']
training_docs = []
testing_docs = []
for c in selected_categories:
    print(c, len(sep_doc[c]))
    split = int(len(sep_doc[c]) * 0.9)
    training_docs += (sep_doc[c][:split])
    testing_docs += (sep_doc[c][split:])

QUEER VOICES 6314
SPORTS 4884
COMEDY 5175
HEALTHY LIVING 6694
TRAVEL 9887


In [17]:
import random
random.shuffle(training_docs)
random.shuffle(testing_docs)
print("Training data", len(training_docs))
print("Testing data",len(testing_docs))

Training data 29656
Testing data 3298


In [19]:
print('category', 'accuracy', 'precision', 'recall', 'F1_Score', sep='\t')

for term, stem, sim, weight in itertools.product(*permutations):

    processed_traindocs, processed_devdocs = process_docs(training_docs, testing_docs, stem)
    doc_freqs_train = compute_doc_freqs(processed_traindocs)
    doc_freqs_test = compute_doc_freqs(processed_devdocs)
    N_train = len(processed_traindocs)
    N_test = len(processed_devdocs)
    metrics = []

    # create Vprofile
    vprofile = defaultdict(lambda: defaultdict(float))
    train_vector_sense = defaultdict(list)

    for train_doc in processed_traindocs:
        train_vector = term_funcs[term](train_doc, doc_freqs_train, region_weights[weight], N_train)
        train_vector_sense[train_doc.category].append(train_vector)

    # do centriod
    for category, vecs in train_vector_sense.items():
        for doc in vecs:
            for word in doc:
                vprofile[category][word] += doc[word]

    for category, vec in vprofile.items():
        for word in vec:
            vec[word] /= len(vec)

    # calculate correctness
    correct_count = 0
    sims = defaultdict(float)
    tp = defaultdict(float)
    fn = defaultdict(float)
    fp = defaultdict(float)

    for test_doc in processed_devdocs:
        test_vector = term_funcs[term](test_doc, doc_freqs_test, region_weights[weight], N_test)
        for category, vec in vprofile.items():
            sims[category] = sim_funcs[sim](test_vector, vec)

        predict = max(sims, key=lambda key: sims[key])
        if predict == test_doc.category:
            correct_count += 1
            tp[predict] += 1
        else:
            fp[predict] += 1
            fn[test_doc.category] += 1

        metrics.append([
            test_doc.doc_id, predict, test_doc.category
        ])

    total_acc = correct_count / len(processed_devdocs)
    accuracy = defaultdict(float)
    precison = defaultdict(float)
    recall = defaultdict(float)
    f1_score = defaultdict(float)

    for category in selected_categories:
        accuracy[category] = tp[category] / (tp[category] + fp[category]) if (tp[category] + fp[category]) != 0 else 0
        precison[category] = tp[category] / (tp[category] + fn[category]) if (tp[category] + fn[category]) != 0 else 0
        recall[category] = (tp[category] + N_test - fn[category]) / (tp[category] + fp[category] + N_test)
        f1_score[category] = 2 * precison[category] * recall[category] / (precison[category] + recall[category])
        print(category, accuracy[category], precison[category], recall[category], f1_score[category], sep='\t')

    print(term, stem, sim, weight, str(total_acc))

    break

category	accuracy	precision	recall	F1_Score
QUEER VOICES	0.8155172413793104	0.7484177215189873	0.9314079422382672	0.8299458985231759
SPORTS	0.8107569721115537	0.8323108384458078	0.953421052631579	0.8887590345143084
COMEDY	0.7757731958762887	0.581081081081081	0.9175257731958762	0.7115366737938087
HEALTHY LIVING	0.7395411605937922	0.817910447761194	0.9220103986135182	0.8668462586004912
TRAVEL	0.766329346826127	0.8422649140546006	0.9064994298745724	0.873202460976954
tfidf False cosine 0 0.7768344451182535
