In [1]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import pickle
import json
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
class word2vec:
    """Seed-word document classifier backed by Word2Vec embeddings.

    A Word2Vec model is trained on the supplied corpus. Each label is then
    represented by the mean embedding of its seed words, each document by the
    mean embedding of its in-vocabulary tokens, and a document is assigned the
    label whose representation has the highest cosine similarity.
    """

    def __init__(self, X, tokenizer, vector_size=100, window=5, min_count=2, sg=1, sample=1e-3, workers=4, seed=1):
        """Tokenize the corpus and train a Word2Vec model on it.

        Args:
            X: Iterable of raw document strings.
            tokenizer: Callable mapping a string to a list of tokens.
            vector_size, window, min_count, sg, sample, workers: Forwarded
                unchanged to ``gensim.models.Word2Vec``.
            seed: Random seed forwarded to ``Word2Vec`` (1 is gensim's own
                default, so existing callers are unaffected). Per the gensim
                documentation, fully reproducible training additionally
                requires ``workers=1`` and a fixed ``PYTHONHASHSEED``.
        """
        # Documents are lower-cased before tokenisation, so every token the
        # model sees is derived from lower-cased text.
        self.X_tokenized = [tokenizer(doc.lower()) for doc in X]
        self.model = Word2Vec(
            sentences=self.X_tokenized,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers,
            sg=sg,
            sample=sample,
            seed=seed,
        )

    @staticmethod
    def _lookup(wv, word):
        """Return the embedding of ``word`` (or its lower-cased form), else ``None``."""
        if word in wv:
            return wv[word]
        # The corpus was lower-cased before training, so a capitalised seed
        # word can only match through its lower-cased form.
        if isinstance(word, str):
            lowered = word.lower()
            if lowered in wv:
                return wv[lowered]
        return None

    @staticmethod
    def _unit_rows(matrix):
        """Scale each row to unit L2 norm; all-zero rows are left as zeros."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid 0/0 -> NaN for zero vectors
        return matrix / norms

    def classify(self, seed_words, default_label=None):
        """Assign every training document the most similar seed-word label.

        Args:
            seed_words: Mapping ``label -> iterable of seed words``.
            default_label: Label given to documents that contain no
                in-vocabulary token. When ``None`` the first label of
                ``seed_words`` is used.

        Returns:
            List with one predicted label per document, in corpus order.

        Raises:
            ValueError: If ``seed_words`` is empty while there are documents
                to classify, or if none of a label's seed words is in the
                model vocabulary (the label would have no representation).
        """
        wv = self.model.wv
        labels = list(seed_words)

        label_vectors = []
        for label in labels:
            found = [self._lookup(wv, word) for word in seed_words[label]]
            found = [vec for vec in found if vec is not None]
            if not found:
                raise ValueError(
                    f"None of the seed words for label {label!r} is in the Word2Vec vocabulary."
                )
            label_vectors.append(np.mean(found, axis=0))

        if not self.X_tokenized:
            return []
        if not labels:
            raise ValueError("seed_words must contain at least one label.")

        label_matrix = np.vstack(label_vectors)
        n_docs = len(self.X_tokenized)
        doc_matrix = np.zeros((n_docs, label_matrix.shape[1]), dtype=label_matrix.dtype)
        has_vocab = np.zeros(n_docs, dtype=bool)
        for row, doc in enumerate(self.X_tokenized):
            vectors = [wv[word] for word in doc if word in wv]
            if vectors:  # np.mean([]) would yield NaN and break the similarity
                doc_matrix[row] = np.mean(vectors, axis=0)
                has_vocab[row] = True

        # Cosine similarity of every document against every label in one product.
        similarities = self._unit_rows(doc_matrix) @ self._unit_rows(label_matrix).T
        best = similarities.argmax(axis=1)  # first maximum wins, like max() over a dict

        fallback = labels[0] if default_label is None else default_label
        return [labels[col] if known else fallback for col, known in zip(best, has_vocab)]

## Word2Vec on 20 Newsgroup Dataset (Coarse)

In [3]:
# 20 Newsgroups, coarse labels: pickled table of documents plus per-label seed words.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/20news/coarse/df.pkl", "rb") as df_handle:
    data = pickle.load(df_handle)
with open("data/20news/coarse/seedwords.json", "rb") as seeds_handle:
    seedwords = json.load(seeds_handle)

In [4]:
# Documents to classify and their gold labels.
X, y = data["sentence"], data["label"]

In [5]:
# Train embeddings on the corpus itself, tokenised with NLTK's word_tokenize.
tokenizer = word_tokenize
classifier = word2vec(
    X,
    tokenizer,
    vector_size=100,
    window=10,
    min_count=2,
)

In [6]:
# One predicted label per document, in the same order as X.
pred = classifier.classify(seed_words=seedwords)

In [7]:
# Macro-F1 weights every class equally; micro-F1 aggregates over all documents.
f1_macro, f1_micro = (f1_score(y, pred, average=mode) for mode in ("macro", "micro"))
print(f"F1_score (macro) using without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) using without hyperparameter tuning: {f1_micro}")

F1_score (macro) using without hyperparameter tuning: 0.37498671637034997
F1_score (micro) using without hyperparameter tuning: 0.5533161728462676


## Word2Vec on 20 Newsgroup Dataset (Fine)

In [11]:
# 20 Newsgroups, fine labels: pickled table of documents plus per-label seed words.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): this cell's execution count (11) is higher than the training and
# evaluation cells that follow it (8-10), so the recorded scores may not have been
# computed on this data. Restart the kernel and run all cells to confirm.
with open("data/20news/fine/df.pkl", "rb") as df_handle:
    data = pickle.load(df_handle)
with open("data/20news/fine/seedwords.json", "rb") as seeds_handle:
    seedwords = json.load(seeds_handle)

In [12]:
# Documents to classify and their gold labels.
X, y = data["sentence"], data["label"]

In [8]:
# Larger embeddings and a wider context window than the coarse-label run.
tokenizer = word_tokenize
classifier = word2vec(
    X,
    tokenizer,
    vector_size=300,
    window=15,
    min_count=2,
)

In [9]:
# One predicted label per document, in the same order as X.
pred = classifier.classify(seed_words=seedwords)

In [10]:
# Macro-F1 weights every class equally; micro-F1 aggregates over all documents.
# NOTE(review): this cell ran with execution count 10, before the fine-label data
# cells (11-12) in this section. Re-run top to bottom to verify the printed scores.
f1_macro, f1_micro = (f1_score(y, pred, average=mode) for mode in ("macro", "micro"))
print(f"F1_score (macro) using without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) using without hyperparameter tuning: {f1_micro}")

F1_score (macro) using without hyperparameter tuning: 0.37203862430346085
F1_score (micro) using without hyperparameter tuning: 0.48589736568267705


## Word2Vec on NYT Dataset (Coarse)

In [30]:
# NYT, coarse labels: pickled table of documents plus per-label seed words.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/nyt/coarse/df.pkl", "rb") as df_handle:
    data = pickle.load(df_handle)
with open("data/nyt/coarse/seedwords.json", "rb") as seeds_handle:
    seedwords = json.load(seeds_handle)

In [31]:
# Documents to classify and their gold labels.
X, y = data["sentence"], data["label"]

In [50]:
# sg=0 selects gensim's CBOW training instead of the class default (skip-gram).
tokenizer = word_tokenize
classifier = word2vec(
    X,
    tokenizer,
    vector_size=350,
    window=20,
    min_count=3,
    workers=15,
    sg=0,
)

In [51]:
# One predicted label per document, in the same order as X.
pred = classifier.classify(seed_words=seedwords)

In [52]:
# Macro-F1 weights every class equally; micro-F1 aggregates over all documents.
f1_macro, f1_micro = (f1_score(y, pred, average=mode) for mode in ("macro", "micro"))
print(f"F1_score (macro) using without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) using without hyperparameter tuning: {f1_micro}")

F1_score (macro) using without hyperparameter tuning: 0.6779590738069426
F1_score (micro) using without hyperparameter tuning: 0.903704346317342


## Word2Vec on NYT Dataset (Fine)


In [3]:
# NYT, fine labels: pickled table of documents plus per-label seed words.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/nyt/fine/df.pkl", "rb") as df_handle:
    data = pickle.load(df_handle)
with open("data/nyt/fine/seedwords.json", "rb") as seeds_handle:
    seedwords = json.load(seeds_handle)

In [4]:
# Documents to classify and their gold labels.
X, y = data["sentence"], data["label"]

In [9]:
# Same hyperparameters as the NYT coarse run; sg=0 selects gensim's CBOW training.
tokenizer = word_tokenize
classifier = word2vec(
    X,
    tokenizer,
    vector_size=350,
    window=20,
    min_count=3,
    workers=15,
    sg=0,
)

In [10]:
# One predicted label per document, in the same order as X.
pred = classifier.classify(seed_words=seedwords)

In [11]:
# Macro-F1 weights every class equally; micro-F1 aggregates over all documents.
f1_macro, f1_micro = (f1_score(y, pred, average=mode) for mode in ("macro", "micro"))
print(f"F1_score (macro) using without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) using without hyperparameter tuning: {f1_micro}")

F1_score (macro) using without hyperparameter tuning: 0.4610761956145217
F1_score (micro) using without hyperparameter tuning: 0.6508198143489199
