In [1]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import json
from collections import defaultdict
import numpy as np
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.util import ngrams
import re

In [2]:
def custom_tokenizer(text):
    """Split ``text`` on whitespace and append every seed word found inside a token.

    The returned list starts with the whitespace-separated tokens in their
    original order. After them comes one extra entry for each (token, seed)
    pair where the seed occurs as a substring of the token, in token order
    and then in ``all_seed_words`` order. A token that equals a seed word
    therefore appears twice.

    Reads the notebook-global ``all_seed_words``, which must be defined
    before this function is called.
    """
    pieces = text.split()
    # Substring matching: a seed such as "ball" is also emitted for "football".
    embedded_seeds = [
        seed
        for piece in pieces
        for seed in all_seed_words
        if seed in piece
    ]
    return pieces + embedded_seeds

In [3]:
class IRTFIDF:
    """Seed-word classifier that ranks labels by summed TF-IDF weight.

    The corpus is vectorized once when the object is constructed. Each
    document is then assigned the label whose seed words (plus the label
    name itself) carry the largest summed TF-IDF weight in that document.
    """

    def __init__(self, X, vectorizer):
        """Fit ``vectorizer`` on the corpus and cache the resulting matrix.

        Args:
            X: Sized collection of documents (anything supporting ``len``
                that ``vectorizer.fit_transform`` accepts).
            vectorizer: Object exposing ``fit_transform(X)`` and, once
                fitted, a ``vocabulary_`` mapping of token -> column index
                (e.g. scikit-learn's ``TfidfVectorizer``).
        """
        self.X = X
        self.N = len(X)  # number of documents, i.e. rows of X_tfidf
        self.vectorizer = vectorizer
        self.X_tfidf = vectorizer.fit_transform(X)

    def compute_tfidf(self, X_idx, words):
        """Sum the TF-IDF weights of ``words`` in document ``X_idx``.

        Args:
            X_idx: Row index of the document in the fitted matrix.
            words: Iterable of tokens; tokens missing from the fitted
                vocabulary contribute nothing. Repeated tokens are counted
                once per occurrence in ``words``.

        Returns:
            The summed weight (``0`` when no word is in the vocabulary).
        """
        vocabulary = self.vectorizer.vocabulary_
        score = 0
        for w in words:
            if w in vocabulary:
                score += self.X_tfidf[X_idx, vocabulary[w]]
        return score

    def classify(self, seedwords, default_label="sci"):
        """Predict one label per document from seed-word TF-IDF scores.

        Args:
            seedwords: Mapping of label -> list of seed words. The label
                string itself is also scored as if it were a seed word.
            default_label: Label assigned to documents whose scores sum to
                zero (no seed word present). Defaults to ``"sci"`` to keep
                the previous behaviour; pass a label valid for the dataset
                at hand.

        Returns:
            List of predicted labels, one per document, in corpus order.
            Ties between labels go to the first label in ``seedwords``.
        """
        documents_labels = []
        # Iterate over the stored document count rather than ``X.shape`` so
        # plain lists work as well as pandas/numpy containers.
        for i in range(self.N):
            scores = {
                label: self.compute_tfidf(i, list(words) + [label])
                for label, words in seedwords.items()
            }
            if sum(scores.values()) == 0:
                documents_labels.append(default_label)
            else:
                documents_labels.append(max(scores, key=scores.get))
        return documents_labels

## IR-TF-IDF on 20 Newsgroup Dataset (Coarse)

In [4]:
# 20 Newsgroups dataset (coarse-grained labels).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load df.pkl from a trusted source.
with open("data/20news/coarse/df.pkl", "rb") as file:
    data = pickle.load(file)
# Seed words; later cells iterate this as a mapping of label -> list of words.
with open("data/20news/coarse/seedwords.json", "rb") as file:
    seedwords = json.load(file)

In [5]:
# Documents to classify (X) and the reference labels used for scoring (y).
# presumably `data` is a pandas DataFrame with these two columns — TODO confirm
X = data["sentence"]
y = data["label"]

In [6]:
# Baseline run: default tokenization, English stop words removed,
# sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True)
# Constructing IRTFIDF fits the vectorizer on X immediately.
classifier = IRTFIDF(X, vectorizer)

In [7]:
# One predicted label per document: the label whose seed words score highest.
pred = classifier.classify(seedwords)

In [8]:
# Score the baseline predictions against the reference labels with
# macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
# NOTE(review): the label reads "using without ..."; the wording is left as-is
# because it is runtime output that the saved cell output mirrors.
print(f"F1_score (macro) using without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) using without hyperparameter tuning: {f1_micro}")

F1_score (macro) using without hyperparameter tuning: 0.55792668405245
F1_score (micro) using without hyperparameter tuning: 0.5477298866312503


In [9]:
# Flatten the seed-word mapping into a single list: each label name followed
# by its own seed words. custom_tokenizer reads this global.
all_seed_words = [
    word
    for label, words in seedwords.items()
    for word in [label] + words
]

In [10]:
# Using custom tokenizer
# custom_tokenizer reads the global all_seed_words, so the cell defining it
# must have run first. Unlike the baseline run, no stop-word list is passed.
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, sublinear_tf=True)
classifier = IRTFIDF(X, vectorizer)



In [11]:
# Re-classify with the custom-tokenizer classifier; overwrites the baseline `pred`.
pred = classifier.classify(seedwords)

In [12]:
# Score the custom-tokenizer predictions with macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
# NOTE(review): the label says "hyperparameter tuning", but the only change
# from the baseline visible in this notebook is the vectorizer configuration
# (custom tokenizer, no stop-word list) — no tuning loop is run.
print(f"F1_score (macro) with hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) with hyperparameter tuning: {f1_micro}")

F1_score (macro) with hyperparameter tuning: 0.5210030142113452
F1_score (micro) with hyperparameter tuning: 0.5395695273563722


## IR-TF-IDF on 20 Newsgroup Dataset (Fine)

In [13]:
# 20 Newsgroups dataset (fine-grained labels). Rebinds `data` and `seedwords`.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load df.pkl from a trusted source.
with open("data/20news/fine/df.pkl", "rb") as file:
    data = pickle.load(file)
# Seed words; later cells iterate this as a mapping of label -> list of words.
with open("data/20news/fine/seedwords.json", "rb") as file:
    seedwords = json.load(file)

In [14]:
# Documents to classify (X) and the reference labels used for scoring (y).
# presumably `data` is a pandas DataFrame with these two columns — TODO confirm
X = data["sentence"]
y = data["label"]

In [15]:
# Baseline run: default tokenization, English stop words removed,
# sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True)
# Constructing IRTFIDF fits the vectorizer on X immediately.
classifier = IRTFIDF(X, vectorizer)

In [16]:
# One predicted label per document: the label whose seed words score highest.
pred = classifier.classify(seedwords)

In [17]:
# Score the baseline predictions against the reference labels with
# macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
print(f"F1_score (macro) without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) without hyperparameter tuning: {f1_micro}")

F1_score (macro) without hyperparameter tuning: 0.5225877902426144
F1_score (micro) without hyperparameter tuning: 0.4753272358836738


In [18]:
# Flatten the seed-word mapping into a single list: each label name followed
# by its own seed words. custom_tokenizer reads this global.
all_seed_words = [
    word
    for label, words in seedwords.items()
    for word in [label] + words
]

In [19]:
# Using custom tokenizer
# custom_tokenizer reads the global all_seed_words, so the cell defining it
# must have run first. Unlike the baseline run, no stop-word list is passed.
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, sublinear_tf=True)
classifier = IRTFIDF(X, vectorizer)



In [20]:
# Re-classify with the custom-tokenizer classifier; overwrites the baseline `pred`.
pred = classifier.classify(seedwords)

In [21]:
# Score the custom-tokenizer predictions with macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
# NOTE(review): the label says "hyperparameter tuning", but the only change
# from the baseline visible in this notebook is the vectorizer configuration
# (custom tokenizer, no stop-word list) — no tuning loop is run.
print(f"F1_score (macro) with hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) with hyperparameter tuning: {f1_micro}")

F1_score (macro) with hyperparameter tuning: 0.5017943285513188
F1_score (micro) with hyperparameter tuning: 0.4982748233747741


## IR-TF-IDF on NYT Dataset (Coarse)

In [22]:
# NYT dataset (coarse-grained labels). Rebinds `data` and `seedwords`.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load df.pkl from a trusted source.
with open("data/nyt/coarse/df.pkl", "rb") as file:
    data = pickle.load(file)
# Seed words; later cells iterate this as a mapping of label -> list of words.
with open("data/nyt/coarse/seedwords.json", "rb") as file:
    seedwords = json.load(file)

In [23]:
# Documents to classify (X) and the reference labels used for scoring (y).
# presumably `data` is a pandas DataFrame with these two columns — TODO confirm
X = data["sentence"]
y = data["label"]

In [24]:
# Baseline run: default tokenization with English stop words removed.
# NOTE(review): sublinear_tf is not set here, unlike the 20 Newsgroups
# baseline cells and the custom-tokenizer run below — confirm this is intended.
vectorizer = TfidfVectorizer(stop_words="english")
# Constructing IRTFIDF fits the vectorizer on X immediately.
classifier = IRTFIDF(X, vectorizer)

In [25]:
# One predicted label per document: the label whose seed words score highest.
# NOTE(review): documents with no seed-word hits fall back to the label "sci"
# inside classify — confirm that label is meaningful for the NYT label set.
pred = classifier.classify(seedwords)

In [26]:
# Score the baseline predictions against the reference labels with
# macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
print(f"F1_score (macro) without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) without hyperparameter tuning: {f1_micro}")

F1_score (macro) without hyperparameter tuning: 0.48540181226284024
F1_score (micro) without hyperparameter tuning: 0.6397154506810098


In [27]:
# Flatten the seed-word mapping into a single list: each label name followed
# by its own seed words. custom_tokenizer reads this global.
all_seed_words = [
    word
    for label, words in seedwords.items()
    for word in [label] + words
]

In [28]:
# Using custom tokenizer
# custom_tokenizer reads the global all_seed_words, so the cell defining it
# must have run first.
# NOTE(review): compared with the NYT baseline above, this run changes three
# things at once (custom tokenizer, no stop-word list, sublinear_tf=True).
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, sublinear_tf=True)
classifier = IRTFIDF(X, vectorizer)



In [29]:
# Re-classify with the custom-tokenizer classifier; overwrites the baseline `pred`.
# NOTE(review): documents with no seed-word hits fall back to the label "sci"
# inside classify — confirm that label is meaningful for the NYT label set.
pred = classifier.classify(seedwords)

In [30]:
# Score the custom-tokenizer predictions with macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
# NOTE(review): the label says "hyperparameter tuning", but the only change
# from the baseline visible in this notebook is the vectorizer configuration
# — no tuning loop is run.
print(f"F1_score (macro) with hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) with hyperparameter tuning: {f1_micro}")

F1_score (macro) with hyperparameter tuning: 0.4362725704218248
F1_score (micro) with hyperparameter tuning: 0.6267892773488332


## IR-TF-IDF on NYT Dataset (Fine)

In [31]:
# NYT dataset (fine-grained labels). Rebinds `data` and `seedwords`.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load df.pkl from a trusted source.
with open("data/nyt/fine/df.pkl", "rb") as file:
    data = pickle.load(file)
# Seed words; later cells iterate this as a mapping of label -> list of words.
with open("data/nyt/fine/seedwords.json", "rb") as file:
    seedwords = json.load(file)

In [32]:
# Documents to classify (X) and the reference labels used for scoring (y).
# presumably `data` is a pandas DataFrame with these two columns — TODO confirm
X = data["sentence"]
y = data["label"]

In [33]:
# Baseline run: default tokenization with English stop words removed.
# NOTE(review): sublinear_tf is not set here, unlike the 20 Newsgroups
# baseline cells and the custom-tokenizer run below — confirm this is intended.
vectorizer = TfidfVectorizer(stop_words="english")
# Constructing IRTFIDF fits the vectorizer on X immediately.
classifier = IRTFIDF(X, vectorizer)

In [34]:
# One predicted label per document: the label whose seed words score highest.
# NOTE(review): documents with no seed-word hits fall back to the label "sci"
# inside classify — confirm that label is meaningful for the NYT label set.
pred = classifier.classify(seedwords)

In [35]:
# Score the baseline predictions against the reference labels with
# macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
print(f"F1_score (macro) without hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) without hyperparameter tuning: {f1_micro}")

F1_score (macro) without hyperparameter tuning: 0.5696809813758315
F1_score (micro) without hyperparameter tuning: 0.5180012145397762


In [36]:
# Flatten the seed-word mapping into a single list: each label name followed
# by its own seed words. custom_tokenizer reads this global.
all_seed_words = [
    word
    for label, words in seedwords.items()
    for word in [label] + words
]

In [37]:
# Using custom tokenizer
# custom_tokenizer reads the global all_seed_words, so the cell defining it
# must have run first.
# NOTE(review): compared with the NYT baseline above, this run changes three
# things at once (custom tokenizer, no stop-word list, sublinear_tf=True).
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, sublinear_tf=True)
classifier = IRTFIDF(X, vectorizer)



In [38]:
# Re-classify with the custom-tokenizer classifier; overwrites the baseline `pred`.
# NOTE(review): documents with no seed-word hits fall back to the label "sci"
# inside classify — confirm that label is meaningful for the NYT label set.
pred = classifier.classify(seedwords)

In [39]:
# Score the custom-tokenizer predictions with macro- and micro-averaged F1.
f1_macro = f1_score(y,pred,average='macro')
f1_micro = f1_score(y,pred,average='micro')
# NOTE(review): the label says "hyperparameter tuning", but the only change
# from the baseline visible in this notebook is the vectorizer configuration
# — no tuning loop is run.
print(f"F1_score (macro) with hyperparameter tuning: {f1_macro}")
print(f"F1_score (micro) with hyperparameter tuning: {f1_micro}")

F1_score (macro) with hyperparameter tuning: 0.5342748585120182
F1_score (micro) with hyperparameter tuning: 0.5224256094387091
