# 20NG, R52, R8
@Misc{2007:phd-Ana-Cardoso-Cachopo,

  author = {Ana Cardoso-Cachopo},

  title = {{Improving Methods for Single-label Text Categorization}},

  howpublished = {PhD Thesis, Instituto Superior Tecnico, Universidade Tecnica de Lisboa},

  year = 2007} 
  
  https://ana.cachopo.org/datasets-for-single-label-text-categorization
  
  **Pre-processing
Except for the Cade12 dataset, from the original datasets, in order to obtain the present files, I applied the following pre-processing:**

* Substitute TAB, NEWLINE and RETURN characters by SPACE.

* Keep only letters (that is, turn punctuation, numbers, etc. into SPACES).

* Turn all letters to lowercase.

* Substitute multiple SPACES by a single SPACE.

* The title/subject of each document is simply added in the beginning of the document's text.

* no-short: obtained from the previous file by removing words that are less than 3 characters long. For example, removing "he" but keeping "him".

* no-stop: obtained from the previous file by removing the 524 SMART stopwords. Some of them had already been removed, because they were shorter than 3 characters.

* stemmed: obtained from the previous file by applying Porter's Stemmer to the remaining words. Information about stemming can be found at https://tartarus.org/martin/PorterStemmer/.
  

# CLINC150

from their github
https://github.com/clinc/oos-eval

# Libraries

In [1]:
import json
import random
import os
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from __future__ import print_function
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from numpy import array, asarray, zeros
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from tqdm.notebook import tqdm
import string

# Seeds
# Seed NumPy's global RNG so the np.random draws made later in this notebook
# (e.g. inside create_val) are repeatable from a fresh kernel.
np.random.seed(4)

# Pre-processing text

In [2]:
# Preprocessing text
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# tokenizer models.
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' in word_tokenize -- confirm against the installed version.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stop-word list and the Porter stemmer shared by prepro_text.
porter = PorterStemmer()
stop_words = stopwords.words('english')

def prepro_text(text):
    """Normalise a raw document into a space-separated string of stems.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted (not replaced by a space), the result is tokenised with
    ``nltk.word_tokenize``, tokens found in ``stop_words`` are dropped and
    the remaining tokens are stemmed with the module-level ``porter`` stemmer.
    """
    # Translation table that deletes all punctuation characters.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(drop_punctuation)

    stems = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words:
            continue
        stems.append(porter.stem(token))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Camilo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Camilo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Create Data Frames

In [3]:
# Directory holding the raw dataset files; the encoder / CSV files produced by
# this notebook are written to the same place.
# NOTE(review): hard-coded, machine-specific Windows path -- adjust before
# running on another machine.
path_data = '\\Users\\Camilo\\Desktop\\tesis\\codes\\data\\'
# ------------------  r52 DATASET -----------------------

print('R52 DATASET')
X = list()
y = list()
group = list()

# Every line is parsed as "<label> <word> <word> ...": the first
# whitespace-separated token is the label, the rest is the document text.
# The split each row came from is recorded in `group`.
for file_name, split_name in (('r52-train-all-terms.txt', 'train'),
                              ('r52-test-all-terms.txt', 'test')):
    # `with` guarantees the handle is closed even if parsing fails.
    with open(path_data+file_name) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to parse (previously an IndexError).
                continue
            X.append(' '.join(tokens[1:]))
            y.append(tokens[0])
            group.append(split_name)

# Create DataFrame
df_r = pd.DataFrame({'text_raw': X, 'label': y, 'group': group})
df_r['text'] = df_r['text_raw'].apply(prepro_text)
#df_r['large'] = df_r['text'].apply(lambda x: len(x.split()))

# Label to number class: one vectorised fit_transform instead of one
# transform() call per row; the fitted classes_ are the same sorted labels.
le = LabelEncoder()
df_r['class'] = le.fit_transform(df_r['label'])
np.save(path_data+'r52-classesEncoder.npy', le.classes_)

print(df_r.sample(5))
print('------------------------------------------------')

# ------------------  20NG DATASET -----------------------
print('20NG DATASET')
X = list()
y = list()
group = list()

# Every line is parsed as "<label> <word> <word> ...": the first
# whitespace-separated token is the label, the rest is the document text.
# Rows from the train file are tagged 'train' (they were previously tagged
# 'test', which left this dataset without any training rows) and rows from
# the test file are tagged 'test'.
for file_name, split_name in (('20ng-train-all-terms.txt', 'train'),
                              ('20ng-test-all-terms.txt', 'test')):
    # `with` guarantees the handle is closed even if parsing fails.
    with open(path_data+file_name) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to parse (previously an IndexError).
                continue
            X.append(' '.join(tokens[1:]))
            y.append(tokens[0])
            group.append(split_name)

# Create DataFrame
df_ng = pd.DataFrame({'text_raw': X, 'label': y, 'group': group})
df_ng['text'] = df_ng['text_raw'].apply(prepro_text)
#df_ng['large'] = df_ng['text'].apply(lambda x: len(x.split()))

# Label to number class: one vectorised fit_transform instead of one
# transform() call per row; the fitted classes_ are the same sorted labels.
le = LabelEncoder()
df_ng['class'] = le.fit_transform(df_ng['label'])
np.save(path_data+'20ng-classesEncoder.npy', le.classes_)

print(df_ng.sample(5))
print('------------------------------------------------')

# ------------------  CLINC150 DATASET -----------------------
print('CLINC150 DATASET')
X = list()
y = list()
group = list()

# `with` guarantees the JSON file is closed even if decoding fails.
with open(path_data+'clinc150_data_full.json') as f:
    data = json.load(f)

# Only the 'train', 'test' and 'val' entries are read; each is iterated as
# (text, label) pairs and tagged with the split it came from. The order
# train -> test -> val matches the original row order.
for split_name in ('train', 'test', 'val'):
    for text, label in data[split_name]:
        X.append(text)
        y.append(label)
        group.append(split_name)

df_clinc = pd.DataFrame({'text_raw': X, 'label': y, 'group': group})
df_clinc['text'] = df_clinc['text_raw'].apply(prepro_text)
#df_clinc['large'] = df_clinc['text'].apply(lambda x: len(x.split()))

# Label to number class: one vectorised fit_transform instead of one
# transform() call per row; the fitted classes_ are the same sorted labels.
le = LabelEncoder()
df_clinc['class'] = le.fit_transform(df_clinc['label'])
np.save(path_data+'clinc150-classesEncoder.npy', le.classes_)

print(df_clinc.sample(5))
print('------------------------------------------------')

# ------------------  TREC DATASET -----------------------
print('TREC DATASET')

X = list()
y = list()
group = list()

# Every line is parsed as "<label>:<text>". Question marks are removed first.
# The line is split on the FIRST colon only, so text that itself contains a
# colon is kept whole (splitting on every colon used to drop everything after
# the second one).
for file_name, split_name in (('trec6-train.txt', 'train'),
                              ('trec6-test.txt', 'test')):
    # `with` guarantees the handle is closed even if parsing fails.
    with open(path_data +file_name) as f:
        for line in f:
            if not line.strip():
                # Blank line: nothing to parse.
                continue
            line = line.replace('?','')
            # A non-blank line without any colon is malformed and still raises.
            label, text = line.split(':', 1)
            X.append(text.strip())
            y.append(label)
            group.append(split_name)

df_trec = pd.DataFrame({'text_raw': X, 'label': y, 'group': group})
df_trec['text'] = df_trec['text_raw'].apply(prepro_text)
#df_trec['large'] = df_trec['text'].apply(lambda x: len(x.split()))

# Label to number class: one vectorised fit_transform instead of one
# transform() call per row; the fitted classes_ are the same sorted labels.
le = LabelEncoder()
df_trec['class'] = le.fit_transform(df_trec['label'])
np.save(path_data+'trec6-classesEncoder.npy', le.classes_)

print(df_trec.head())

R52 DATASET
                                               text_raw label  group  \
5128  borg warner up amid rumors irwin jacobs sold s...   acq  train   
4171  ambrit inc abi th qtr jan net shr cts vs nil n...  earn  train   
2765  japan foreign shipbuilding orders rise in febr...  ship  train   
3632  group has pct of atlantic research atrc a grou...   acq  train   
6210  agency reports ships waiting at panama canal t...  ship  train   

                                                   text  class  
5128  borg warner amid rumor irwin jacob sold stock ...      0  
4171  ambrit inc abi th qtr jan net shr ct vs nil ne...     12  
2765  japan foreign shipbuild order rise februari ne...     43  
3632  group pct atlant research atrc group led halcy...      0  
6210  agenc report ship wait panama canal panama can...     43  
------------------------------------------------
20NG DATASET
                                                text_raw  \
2460   re can i get more than x on monitor 

# Create validation split in train set

Datasets:
 * 20NG
 * R52
 * TREC6

In [4]:
def create_val(df,p=0.2):
    """Move a random share of each class's 'train' rows into a 'val' split.

    For every distinct value of ``df['class']`` the rows whose ``group`` is
    'train' are counted and ``ceil(n_train * p)`` of them, drawn uniformly
    without replacement from NumPy's global RNG, are relabelled 'val'.
    Rows belonging to any other group are left untouched, and ``df`` itself
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'class' and 'group'.
    p : float, default 0.2
        Fraction of each class's 'train' rows to relabel. The resulting row
        count is clamped to ``[0, n_train]``, so ``p <= 0`` moves nothing and
        ``p >= 1`` moves every 'train' row (the previous rejection-sampling
        loop never terminated for ``p > 1``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the rows grouped class by class (classes in
        order of first appearance) and the original index labels kept.
    """
    group_col = df.columns.get_loc('group')
    frames = list()
    for label in df['class'].unique():
        df_class = df[df['class'] == label].copy()
        # Positional indices (within this class) of the training rows.
        train_pos = np.flatnonzero((df_class['group'] == 'train').to_numpy())
        # ceil() reproduces the count of the former `while c < n_train*p` loop.
        n_val = int(np.ceil(len(train_pos) * p))
        n_val = min(len(train_pos), max(0, n_val))
        if n_val:
            val_pos = np.random.choice(train_pos, size=n_val, replace=False)
            # Assign through .iloc on the explicit copy rather than writing
            # into `.values`, which is read-only under pandas Copy-on-Write.
            df_class.iloc[val_pos, group_col] = 'val'
        frames.append(df_class)
    if not frames:
        # No classes at all (empty frame): pd.concat([]) would raise.
        return df.copy()
    return pd.concat(frames)

# Dataframe to CSV

In [5]:
# Add a validation split to each dataset and write it out as an '@'-separated
# CSV. The order (TREC, CLINC, 20NG, R52) is kept because create_val draws
# from the global NumPy RNG.
# NOTE(review): df_clinc already carries its own 'val' rows (loaded from the
# JSON's 'val' key), so this moves a further share of its 'train' rows into
# 'val' -- confirm this is intended.
for frame, csv_name in ((df_trec, 'trecData.csv'),
                        (df_clinc, 'clincData.csv'),
                        (df_ng, 'ngData.csv'),
                        (df_r, 'rData.csv')):
    df = create_val(frame)
    df.to_csv(path_or_buf=path_data+csv_name, sep='@',index=False)

In [6]:
# Read the exported TREC file back, using the same '@' separator it was
# written with.
d = pd.read_csv(path_data + 'trecData.csv', sep='@')

# Load encoder

In [7]:
#encoder = LabelEncoder()
#encoder.classes_ = np.load(path_data+'classes.npy',allow_pickle=True)