In [1]:
import nltk
import os
import pandas as pd

## Table dimensions

In [2]:
# Directory that is scanned for tab-separated table files.
TABLE_DIR = './data/tables'

# Maps each '.tsv' file name to the shape tuple pandas reports for that file.
table_shapes = {}

for file in os.listdir(TABLE_DIR):
    if file.endswith('.tsv'):
        table_path = os.path.join(TABLE_DIR, file)
        table_shapes[file] = pd.read_csv(table_path, sep='\t').shape

In [3]:
# First entry of every stored shape, i.e. the number of rows of each table.
row_counts = [dims[0] for dims in table_shapes.values()]

min_rows = min(row_counts)
max_rows = max(row_counts)
avg_rows = sum(row_counts) / len(row_counts)

# Displayed as (minimum, maximum, mean) rows per table.
min_rows, max_rows, avg_rows

(7, 62, 33.28214285714286)

In [4]:
# Second entry of every stored shape, i.e. the number of columns of each table.
col_counts = [dims[1] for dims in table_shapes.values()]

min_cols = min(col_counts)
max_cols = max(col_counts)
avg_cols = sum(col_counts) / len(col_counts)

# Displayed as (minimum, maximum, mean) columns per table.
min_cols, max_cols, avg_cols

(7, 9, 7.864285714285714)

In [5]:
# Total number of cells (rows * columns) of each table.
cell_counts = [dims[0] * dims[1] for dims in table_shapes.values()]

# Stored under dedicated *_cells names: reusing min_cols / max_cols / avg_cols
# here would overwrite the per-table column statistics with cell counts.
min_cells = min(cell_counts)
max_cells = max(cell_counts)
avg_cells = sum(cell_counts) / len(cell_counts)

# Displayed as (minimum, maximum, mean) cells per table.
min_cells, max_cells, avg_cells

(49, 496, 265.2785714285714)

## Document lengths

In [6]:
# Directory that is scanned for plain-text documents.
TEXTS_DIR = './data/texts'

# Maps each '.txt' file name to the number of tokens nltk.word_tokenize
# produces for the file's full contents.
text_lengths = dict()

for file in os.listdir(TEXTS_DIR):
    if not file.endswith('.txt'):
        continue

    # Read inside a context manager so the file handle is closed right away
    # instead of staying open until it happens to be garbage-collected.
    with open(os.path.join(TEXTS_DIR, file), 'r') as text_file:
        lines = text_file.read()

    text_lengths[file] = len(nltk.word_tokenize(lines))

In [7]:
# Token count of every document collected in text_lengths.
doc_token_counts = list(text_lengths.values())

min_len = min(doc_token_counts)
max_len = max(doc_token_counts)
avg_len = sum(doc_token_counts) / len(doc_token_counts)

# Displayed as (minimum, maximum, mean) tokens per document.
min_len, max_len, avg_len

(63, 643, 249.75)

## Tokens and types (raw target texts)

In [8]:
# Read train.src and train.trg line by line, stripping leading and trailing
# whitespace (including the newline) from every line.
with open('train.src') as src_file:
    train_src = [line.strip() for line in src_file]

with open('train.trg') as trg_file:
    train_trg = [line.strip() for line in trg_file]

In [9]:
# Read valid.src and valid.trg line by line, stripping leading and trailing
# whitespace (including the newline) from every line.
with open('valid.src') as src_file:
    valid_src = [line.strip() for line in src_file]

with open('valid.trg') as trg_file:
    valid_trg = [line.strip() for line in trg_file]

In [10]:
# Read test.src and test.trg line by line, stripping leading and trailing
# whitespace (including the newline) from every line.
with open('test.src') as src_file:
    test_src = [line.strip() for line in src_file]

with open('test.trg') as trg_file:
    test_trg = [line.strip() for line in trg_file]

In [11]:
# Number of lines loaded from each file, in the order
# train src/trg, valid src/trg, test src/trg.
tuple(len(loaded) for loaded in (train_src, train_trg, valid_src, valid_trg, test_src, test_trg))

(210, 210, 21, 21, 49, 49)

In [12]:
# Lowercase and tokenize every entry of train_trg exactly once.
train_tokenized = [nltk.word_tokenize(text.lower()) for text in train_trg]

# Flat list of all tokens, and the token count of each individual text.
train_tokens = [token for tokens in train_tokenized for token in tokens]
train_lengths = [len(tokens) for tokens in train_tokenized]

# Displayed as (total tokens, distinct tokens, mean tokens per text).
len(train_tokens), len(set(train_tokens)), sum(train_lengths) / len(train_lengths)

(56250, 3711, 267.85714285714283)

In [13]:
# Lowercase and tokenize every entry of valid_trg exactly once.
valid_tokenized = [nltk.word_tokenize(text.lower()) for text in valid_trg]

# Flat list of all tokens, and the token count of each individual text.
valid_tokens = [token for tokens in valid_tokenized for token in tokens]
valid_lengths = [len(tokens) for tokens in valid_tokenized]

# Displayed as (total tokens, distinct tokens, mean tokens per text).
len(valid_tokens), len(set(valid_tokens)), sum(valid_lengths) / len(valid_lengths)

(4103, 478, 195.38095238095238)

In [14]:
# Lowercase and tokenize every entry of test_trg exactly once.
test_tokenized = [nltk.word_tokenize(text.lower()) for text in test_trg]

# Flat list of all tokens, and the token count of each individual text.
test_tokens = [token for tokens in test_tokenized for token in tokens]
test_lengths = [len(tokens) for tokens in test_tokenized]

# Displayed as (total tokens, distinct tokens, mean tokens per text).
len(test_tokens), len(set(test_tokens)), sum(test_lengths) / len(test_lengths)

(9555, 869, 195.0)

## Tokens and types (cleaned target texts)

In [15]:
# Read train.src and train-clean.trg line by line, stripping leading and
# trailing whitespace (including the newline) from every line.
with open('train.src') as src_file:
    train_src = [line.strip() for line in src_file]

with open('train-clean.trg') as trg_file:
    train_trg = [line.strip() for line in trg_file]

In [16]:
# Read valid.src and valid-clean.trg line by line, stripping leading and
# trailing whitespace (including the newline) from every line.
with open('valid.src') as src_file:
    valid_src = [line.strip() for line in src_file]

with open('valid-clean.trg') as trg_file:
    valid_trg = [line.strip() for line in trg_file]

In [17]:
# Read test.src and test-clean.trg line by line, stripping leading and
# trailing whitespace (including the newline) from every line.
with open('test.src') as src_file:
    test_src = [line.strip() for line in src_file]

with open('test-clean.trg') as trg_file:
    test_trg = [line.strip() for line in trg_file]

In [18]:
# Number of lines loaded from each file, in the order
# train src/trg, valid src/trg, test src/trg.
tuple(len(loaded) for loaded in (train_src, train_trg, valid_src, valid_trg, test_src, test_trg))

(210, 210, 21, 21, 49, 49)

In [19]:
# Lowercase and tokenize every entry of train_trg exactly once.
train_tokenized = [nltk.word_tokenize(text.lower()) for text in train_trg]

# Flat list of all tokens, and the token count of each individual text.
train_tokens = [token for tokens in train_tokenized for token in tokens]
train_lengths = [len(tokens) for tokens in train_tokenized]

# Displayed as (total tokens, distinct tokens, mean tokens per text).
len(train_tokens), len(set(train_tokens)), sum(train_lengths) / len(train_lengths)

(43526, 2243, 207.26666666666668)

In [20]:
# Lowercase and tokenize every entry of valid_trg exactly once.
valid_tokenized = [nltk.word_tokenize(text.lower()) for text in valid_trg]

# Flat list of all tokens, and the token count of each individual text.
valid_tokens = [token for tokens in valid_tokenized for token in tokens]
valid_lengths = [len(tokens) for tokens in valid_tokenized]

# Displayed as (total tokens, distinct tokens, mean tokens per text).
len(valid_tokens), len(set(valid_tokens)), sum(valid_lengths) / len(valid_lengths)

(4091, 476, 194.8095238095238)

In [21]:
# Lowercase and tokenize every entry of test_trg exactly once.
test_tokenized = [nltk.word_tokenize(text.lower()) for text in test_trg]

# Flat list of all tokens, and the token count of each individual text.
test_tokens = [token for tokens in test_tokenized for token in tokens]
test_lengths = [len(tokens) for tokens in test_tokenized]

# Displayed as (total tokens, distinct tokens, mean tokens per text).
len(test_tokens), len(set(test_tokens)), sum(test_lengths) / len(test_lengths)

(9506, 869, 194.0)