In [1]:
import pandas as pd
import numpy as np
# from read_data import train_val_split

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

In [2]:
def train_val_split(labels, n_labeled_per_class, unlabeled_per_class, n_labels, seed=0):
    """Partition sample indices into labeled, unlabeled and development sets.

    For every class id in ``range(n_labels)`` the indices carrying that label
    are shuffled and then sliced by fixed positions:

    * labeled pool: the first 500 shuffled indices followed by those from
      position 5500 (10500 when ``n_labels == 10``) up to, but excluding, the
      last 2000; the first ``n_labeled_per_class`` entries of the pool are kept;
    * unlabeled set: the shuffled indices starting at position 500;
    * development set: the last 2000 shuffled indices.

    Arguments:
        labels {list} -- Labels of the original training set
        n_labeled_per_class {int} -- Number of labeled samples kept per class
        unlabeled_per_class {int} -- Number of unlabeled samples per class.
            Only honoured when ``n_labels == 10``; every other setting always
            takes the fixed 5000-wide slice.
        n_labels {int} -- The number of classes

    Keyword Arguments:
        seed {int} -- Seed passed to ``np.random.seed`` (default: {0})

    Returns:
        [list] -- indices of the labeled training set, the unlabeled training
        set and the development set, each shuffled across classes
    """
    # Seeds the global NumPy RNG, so the split is repeatable for a given seed.
    np.random.seed(seed)
    label_array = np.array(labels)

    # The slicing layout depends only on the number of classes, so pick it once.
    # The original branch comments tag n_labels == 2 as IMDB, n_labels == 10 as
    # DBPedia and everything else as Yahoo/AG News.
    if n_labels == 10:
        pool_resume = 10500
        unlabeled_width = unlabeled_per_class
    else:
        pool_resume = 5500
        unlabeled_width = 5000

    labeled_bucket, unlabeled_bucket, dev_bucket = [], [], []
    for class_id in range(n_labels):
        members = np.where(label_array == class_id)[0]
        np.random.shuffle(members)

        labeled_pool = np.concatenate(
            (members[:500], members[pool_resume:-2000]))
        labeled_bucket.extend(labeled_pool[:n_labeled_per_class])
        unlabeled_bucket.extend(members[500: 500 + unlabeled_width])
        dev_bucket.extend(members[-2000:])

    # Mix the classes together; the order of these three shuffles matters for
    # reproducibility because they consume the same RNG stream.
    for bucket in (labeled_bucket, unlabeled_bucket, dev_bucket):
        np.random.shuffle(bucket)

    return labeled_bucket, unlabeled_bucket, dev_bucket


In [5]:
# Read the raw training CSV; header=None because the file carries no header
# row, so columns are addressed by position (0, 1, 2) further down.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing it.
train_df = pd.read_csv(
    "/Users/pushkar_bhuse/MixText/MixText-LongTail/data/yahoo_answers_csv/train.csv",
    header=None,
)

In [6]:
# Preview the first five raw rows to check the column layout.
train_df.head(n=5)

Unnamed: 0,0,1,2
0,5,5,why doesn't an optical mouse work on a glass t...
1,6,6,what is the best off-road motorcycle trail ? l...
2,3,3,what is trans fat? how to reduce that? i heard...
3,7,7,how many planes fedex has? i heard that it is ...
4,7,7,"in the san francisco bay area, does it make se..."


In [7]:
# Column 0 holds the class id of each row; subtracting 1 shifts the ids so that
# they can be compared against range(n_labels) in train_val_split.
# Vectorized cast + subtraction replaces the per-row Python lambda.
train_labels = train_df[0].astype(np.int64).to_numpy() - 1
# Column 2 holds the text of each row.
train_text = train_df[2].to_numpy()

# Number of classes = largest shifted id + 1. ndarray.max() avoids iterating
# the array element by element through the Python built-in max().
n_labels = train_labels.max() + 1

In [8]:
# Split budget: the TF-IDF model is fitted on 2500 labelled samples per class
# and scored on 5000 unlabelled samples per class (10 classes in total).
n_labeled_per_class = 2500
unlabeled_per_class = 5000

train_labeled_idxs, train_unlabeled_idxs, val_idxs = train_val_split(
    labels=train_labels,
    n_labeled_per_class=n_labeled_per_class,
    unlabeled_per_class=unlabeled_per_class,
    n_labels=n_labels,
)

# Labelled subset: texts and their class ids.
train = train_text[train_labeled_idxs]
labels = train_labels[train_labeled_idxs]

# Unlabelled subset: texts only.
unlabelled = train_text[train_unlabeled_idxs]

In [10]:
# Wrap the two text subsets in single-column frames for export.
# val_df carries the *unlabelled* texts, not the val_idxs split.
val_df = pd.DataFrame({'text': unlabelled})
# NOTE(review): this rebinds train_df, which until now was the raw CSV frame.
# After this cell runs, the label-extraction cell (train_df[0] / train_df[2])
# can no longer be re-run without reloading the CSV first.
train_df = pd.DataFrame({'text': train})

In [12]:
# Persist both text frames to the working directory. to_csv keeps its default
# index=True, so each file gets the row index as an unnamed first column.
for frame, out_path in (
    (val_df, "val_data_gpt.csv"),
    (train_df, "train_data_gpt.csv"),
):
    frame.to_csv(out_path)

In [None]:
# Training TF-IDF model
# The vocabulary and IDF weights are learned from the labelled texts only;
# English stop words are dropped.
tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_wm = tfidfvectorizer.fit_transform(train)

# Fetching features
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old installations.
# list(...) keeps tfidf_tokens a plain list, as the old method returned.
if hasattr(tfidfvectorizer, "get_feature_names_out"):
    tfidf_tokens = list(tfidfvectorizer.get_feature_names_out())
else:
    tfidf_tokens = tfidfvectorizer.get_feature_names()

# Getting TF-IDF scores on unlabelled data
# transform() reuses the fitted vocabulary; the result stays a sparse matrix.
unlabelled_tfidf = tfidfvectorizer.transform(unlabelled)

In [10]:
# Single-column frame of the unlabelled texts; its row order matches the rows
# of the TF-IDF matrix computed from the same `unlabelled` array.
unlabelled_text = pd.DataFrame({'text': unlabelled})
unlabelled_text.head(n=5)

Unnamed: 0,text
0,who won the sequatchie valley basketball tourn...
1,what song is better? u have to answer the ques...
2,why are the eastern conference finals always c...
3,is reais the currancy of brazil? when you spe...
4,i want to know the scheme of isc board examina...


In [13]:
# Token-labelled view of the TF-IDF scores (one row per text, one column per
# vocabulary token). The matrix is 50000 x 74108, so densifying it with
# .toarray() would allocate roughly 30 GB of float64; keep it sparse-backed.
unlabelled_tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    unlabelled_tfidf, columns=tfidf_tokens)
unlabelled_tfidf_df.shape

(50000, 74108)

In [19]:
# Calculating mean of TF-IDF scores for each sequence
# The mean is taken over the whole vocabulary axis (zeros included), computed
# directly on the sparse matrix so no dense 50000 x 74108 array is needed.
# np.asarray(...).ravel() flattens the (n_rows, 1) result into a 1-D vector.
mean_vals = np.asarray(unlabelled_tfidf.mean(axis=1)).ravel()
mean_df = pd.DataFrame(data=mean_vals, columns=['tfidf_mean'])
mean_df.head()

Unnamed: 0,tfidf_mean
0,4.1e-05
1,7e-05
2,8.6e-05
3,4.5e-05
4,8.6e-05


In [None]:
# Put each text next to its mean TF-IDF score (aligned on the row index), then
# rank ascending so the lowest-scoring texts come first.
result_df = pd.concat((unlabelled_text, mean_df), axis="columns")
sorted_df = result_df.sort_values("tfidf_mean")
sorted_df.head(n=20)