In [231]:
import json
import random
import re
from pathlib import Path

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


In [232]:
# Dataset name; it is interpolated into the data directory path built below.
dataset = 'imdb'

In [233]:

# Load the training split of the selected dataset, shuffle it in place and
# lowercase every example's text.
data_dir = Path(f"../../data/datasets/public/sequence_classification/{dataset}/")
# Use a context manager so the file handle is closed deterministically, and
# read as UTF-8 (the JSON interchange encoding) instead of the locale default.
with open(data_dir / 'train.json', 'r', encoding='utf-8') as train_file:
    data = json.load(train_file)
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# shuffle (and therefore the labeled/unlabeled split) differs on every run.
random.shuffle(data)
for x in data:
    x['content'] = x['content'].lower()

In [234]:
# Partition the examples by label in a single pass; examples carrying any
# other label end up in neither list.
data_neg, data_pos = [], []
for example in data:
    if example['label'] == 'negative':
        data_neg.append(example)
    elif example['label'] == 'positive':
        data_pos.append(example)

In [237]:
# The first 20 examples of each class become the labeled training set;
# everything after that forms the unlabeled pool.
data_train = [*data_neg[:20], *data_pos[:20]]
data_unlabeled = [*data_neg[20:], *data_pos[20:]]
random.shuffle(data_train)
random.shuffle(data_unlabeled)

# Raw texts of the unlabeled pool, in the same order as data_unlabeled.
corpus = [example['content'] for example in data_unlabeled]

In [259]:
class TfidfDA:
    """TF-IDF guided word-replacement augmenter for a fixed corpus.

    A ``TfidfVectorizer`` is fitted on ``corpus``. ``transform`` then rewrites
    every document by randomly swapping some of its terms for words sampled
    from the fitted vocabulary:

    * inside a document, terms with a lower tf-idf weight are more likely to
      be swapped out;
    * the substitute word is drawn from ``S``, a distribution over the whole
      vocabulary proportional to ``max(score) - score`` where
      ``score = document_frequency * idf``.

    NOTE(review): this looks like the TF-IDF word replacement used for
    Unsupervised Data Augmentation - confirm against the intended reference.

    Attributes:
        p: Scale of the per-term replacement probability.
        vectorizer: The fitted ``TfidfVectorizer``.
        corpus: The documents passed to the constructor.
        X: Sparse tf-idf matrix, one row per document, one column per term.
        word_freq: Maps a vocabulary index to the number of documents that
            contain the term.
        S: 1-D array with one sampling probability per vocabulary term.
    """

    def __init__(self, corpus, p=0.5):
        """Fit the vectorizer on ``corpus`` and build the sampling distribution.

        Args:
            corpus: Sequence of document strings.
            p: Scale of the per-term replacement probability used by
                ``transform``.
        """
        self.p = p
        self.vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
        self.corpus = corpus
        self.X = self.vectorizer.fit_transform(corpus)

        # get_feature_names() was removed from recent scikit-learn releases;
        # use its successor when present and fall back for old versions.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            self._idx_to_word = self.vectorizer.get_feature_names_out().tolist()
        else:
            self._idx_to_word = self.vectorizer.get_feature_names()

        n_words = self.X.shape[1]

        # Each stored entry of X is one (document, term) pair, so counting the
        # column indices yields the per-term document frequency.
        doc_freq = np.bincount(self.X.indices, minlength=n_words)
        self.word_freq = dict(enumerate(doc_freq.tolist()))

        # The scores are indexed by vocabulary term, so they must span the
        # columns of X (shape[1]), not its rows (documents).
        scores = doc_freq * np.asarray(self.vectorizer.idf_)
        weights = scores.max() - scores
        total = weights.sum()
        if total > 0:
            self.S = weights / total
        else:
            # Every term has the same score: fall back to a uniform
            # distribution instead of dividing by zero.
            self.S = np.full(n_words, 1.0 / n_words)

    def idx_to_word(self, idx):
        """Return the vocabulary word stored at column index ``idx``."""
        return self._idx_to_word[idx]

    def word_to_idx(self, w):
        """Return the column index of word ``w``; raises KeyError if unknown."""
        return self.vectorizer.vocabulary_[w]

    def get_idf(self, idx):
        """Return the fitted IDF value of the term at column index ``idx``."""
        return self.vectorizer.idf_[idx]

    def get_replacement_word(self):
        """Sample one vocabulary index according to the distribution ``S``."""
        return np.random.choice(len(self.S), p=self.S)

    def transform(self):
        """Return an augmented copy of every document in the corpus.

        For a document whose terms have tf-idf weights ``x``, each distinct
        term is selected for replacement with probability
        ``min(p * (max(x) - x) / mean(max(x) - x), 1)``. One substitute is
        sampled per selected term and applied to all of its occurrences in
        that document. Documents are re-tokenized with the vectorizer's token
        pattern and the tokens joined with single spaces, so anything the
        pattern does not match (punctuation, extra whitespace) is dropped.

        Returns:
            list[str]: One augmented string per corpus document, in order.
        """
        token_re = re.compile(self.vectorizer.token_pattern)
        vocabulary = self.vectorizer.vocabulary_

        augmented = []
        for i in range(self.X.shape[0]):
            row = self.X[i, :]
            tfidf = row.data

            replacements = {}
            # An empty document has no weights to take a max over.
            if tfidf.size:
                z = tfidf.max() - tfidf
                z_mean = z.mean()
                # z_mean == 0 means all terms share one weight; there is
                # nothing to rank, so no term is replaced.
                if z_mean > 0:
                    z = np.minimum(self.p * z / z_mean, 1)
                    chosen = np.flatnonzero(z >= np.random.rand(len(z)))
                    for j in chosen:
                        replacements[int(row.indices[j])] = int(
                            self.get_replacement_word()
                        )

            sent = []
            for m in token_re.finditer(self.corpus[i]):
                w = m.group()
                # The vectorizer lowercases before building its vocabulary;
                # tokens it does not know are passed through unchanged.
                idx = vocabulary.get(w.lower())
                # Membership test, not truthiness: index 0 is a valid
                # replacement word.
                if idx in replacements:
                    w = self.idx_to_word(replacements[idx])
                sent.append(w)

            augmented.append(" ".join(sent))
        return augmented
        


In [260]:
# Replacement-strength value handed to TfidfDA as its `p` argument.
p = 0.08

In [261]:
# Fit the augmenter on the unlabeled texts and produce one augmented text
# per document.
da = TfidfDA(corpus, p=p)
data_da = da.transform()
# `p` with the decimal point stripped; not used by any cell visible here.
p_str = str(p).replace('.', '')
# Store each augmented text next to its source example.
for i, example in enumerate(data_unlabeled):
    example['content_da'] = data_da[i]

In [262]:
# Write the labeled and the augmented unlabeled sets next to the source data.
# Context managers guarantee both files are flushed and closed.
with open(data_dir / 'train_uda.json', 'w', encoding='utf-8') as out_file:
    json.dump(data_train, out_file)
with open(data_dir / 'unlabeled_uda.json', 'w', encoding='utf-8') as out_file:
    json.dump(data_unlabeled, out_file)

In [265]:
# Size of the labeled set: at most 20 negative plus 20 positive examples.
len(data_train)

40