In [1]:
import re
import os
import numpy as np
import pandas as pd

#### Parse 20ng
---

In [2]:
df = pd.read_csv("data/20ng.txt", sep="\t", header=None)
df.columns = ["path", "type", "label"]

In [3]:
df["text"] = None

In [4]:
df.head()

Unnamed: 0,path,type,label,text
0,data/20ng/20news-bydate-test/alt.atheism/53068,20news-bydate-test,alt.atheism,
1,data/20ng/20news-bydate-test/alt.atheism/53257,20news-bydate-test,alt.atheism,
2,data/20ng/20news-bydate-test/alt.atheism/53260,20news-bydate-test,alt.atheism,
3,data/20ng/20news-bydate-test/alt.atheism/53261,20news-bydate-test,alt.atheism,
4,data/20ng/20news-bydate-test/alt.atheism/53262,20news-bydate-test,alt.atheism,


In [19]:
def clean_fn(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def read_clean_doc(path):
    with open(df.loc[i, "path"], "rb") as f:
        lines = f.readlines()
    lines = map(lambda x: x.decode('latin-1'), lines)
    lines = " ".join(lines)
    return clean_fn(lines)
    

In [None]:
import time
start = time.time()
for i, row in df.iterrows():
    df.loc[i, "text"] = read_clean_doc(df.loc[i, "path"])
print("time taken: ", time.time()-start)

In [7]:
df.head()

Unnamed: 0,path,type,label,text
0,data/20ng/20news-bydate-test/alt.atheism/53068,20news-bydate-test,alt.atheism,from decay cbnewsj cb att com \( dean kaflowit...
1,data/20ng/20news-bydate-test/alt.atheism/53257,20news-bydate-test,alt.atheism,from cfaehl vesta unm edu \( chris faehl \) su...
2,data/20ng/20news-bydate-test/alt.atheism/53260,20news-bydate-test,alt.atheism,from mathew mathew mantis co uk subject re yet...
3,data/20ng/20news-bydate-test/alt.atheism/53261,20news-bydate-test,alt.atheism,"from dps nasa kodak com \( dan schaertel , , ,..."
4,data/20ng/20news-bydate-test/alt.atheism/53262,20news-bydate-test,alt.atheism,from halat panther bears \( jim halat \) subje...


In [91]:
train_df = df[df.type.str.contains("test")].copy()
test_df = df[df.type.str.contains("train")].copy()

train_df['train_mask'] = True
train_df['test_mask'] = False
test_df['test_mask'] = True
test_df['train_mask'] = False

all_df = pd.concat([train_df, test_df], axis=0)
all_df.reset_index(drop=True, inplace=True)
all_df.reset_index(inplace=True)
all_df.columns = ["doc_id"] + list(all_df.columns)[1:]
all_df.doc_id = all_df.doc_id.astype(str)
all_df.doc_id = 'doc_id_' + all_df.doc_id

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [92]:
import itertools
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 

class Vocab(object):
  
  def __init__(self, L, doc_ids=None, remove_stop_words=False, min_counts=1):    
    if isinstance(L[0], list):
      tokens = list(itertools.chain(*L))
      self.token_counts = pd.Series(tokens).value_counts().to_frame().sort_index(ascending=True)
      self.token_counts.columns = ["counts"]
      if remove_stop_words:
        self.stop_words = set(stopwords.words('english')) 
        idx = self.token_counts.index.isin(self.stop_words)
        self.token_counts = self.token_counts[~idx]
      self.token_counts = self.token_counts[self.token_counts.counts >= min_counts]
        
      self.vocab = ["<unk>"] + self.token_counts.index.to_list()
    else:
      raise Error

    self.vocab = list(set(self.vocab))
    self.vocab = sorted(self.vocab)
    if doc_ids is not None:
      self.vocab = doc_ids + self.vocab 
    
    self.w2i = dict(zip(self.vocab, range(len(self.vocab))))
    self.i2w = dict(zip(range(len(self.vocab)), self.vocab))

  def map_words2index(self, L):
    return list(map(lambda x: self.w2i[x] if x in self.w2i else self.w2i['<unk>'], L))

  def map_index2words(self, L):
    return list(map(lambda x: self.i2w[x], L))

  def map_words2unk(self, L):
    return map(lambda x: x if x in self.vocab else "<unk>", L)

  def map_dataset_words2index(self, L):
    return np.array(list(map(self.map_words2index, L)))

  def map_dataset_index2words(self, L):
    return np.array(list(map(self.map_index2words, L)))

  def map_dataset_words2unk(self, L):
    return np.array(list(map(self.map_words2unk, L)))

  def remove_sw_helper(self, L):
    return filter(lambda x: x not in self.stop_words, L)

  def remove_stop_words(self, L):
    return np.array(list(map(self.remove_sw_helper, L)))    

  def remove_sw_add_unk(self, L):
    L = self.remove_stop_words(L)
    L = self.map_dataset_words2unk(L)
    L = list(map(list, L))
    return L

  def get_counts(self):
    return self.token_counts

In [93]:
REMOVE_STOP_WORDS = True
MIN_COUNTS = 5
doc_ids = all_df.doc_id.values
texts = list(map(lambda x: x.split(), df.text.values))
vocab = Vocab(texts, doc_ids=doc_ids.tolist(),
              remove_stop_words=REMOVE_STOP_WORDS, min_counts=MIN_COUNTS)

def helper(s, vocab):
    L = s.split()
    L = filter(lambda x: x not in vocab.stop_words, L)
    L = map(lambda x: x if x in vocab.vocab else "<unk>", L)
    
    return " ".join(L)

%time all_df.text = all_df.text.apply(lambda s: helper(s, vocab))

CPU times: user 1h 3min 14s, sys: 139 ms, total: 1h 3min 14s
Wall time: 1h 3min 14s


In [95]:
all_df.head()

Unnamed: 0,doc_id,label,path,test_mask,text,train_mask,type
0,doc_id_0,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53068,False,decay cbnewsj cb att com \( dean kaflowitz \) ...,True,20news-bydate-test
1,doc_id_1,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53257,False,cfaehl vesta unm edu \( chris faehl \) subject...,True,20news-bydate-test
2,doc_id_2,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53260,False,mathew mathew mantis co uk subject yet rushdie...,True,20news-bydate-test
3,doc_id_3,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53261,False,"dps nasa kodak com \( dan schaertel , , , \) s...",True,20news-bydate-test
4,doc_id_4,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53262,False,halat panther bears \( jim halat \) subject 20...,True,20news-bydate-test


In [94]:
all_df.to_csv('data/20ng/raw/all_df_mask.tsv', index_label=False, sep='\t')