# Data Preparation
---

## Import Libraries

In [1]:
import os
import sys
from multiprocessing import Pool
import pandas as pd

## Global Config

In [2]:
data_dir = '../data/raw/interim/'
data = 'output-00000-of-00100'
out = data_dir+data+'_prepared'

In [3]:
nsplits = 24
ncores = 24

In [4]:
test_rows = None

## Load Data

In [5]:
raw_data = pd.read_csv(data_dir+data, nrows=test_rows,
                       header=None, sep='\t', quoting = 3,
                       names=['semiotic', 'before', 'after'])
raw_data.info()

FileNotFoundError: File b'../data/raw/interim/output-00000-of-00100' does not exist

## Generate Sentence IDs

In [15]:
def gen_sentenceid(df):
    # Generating sentence and word token ids
    # Our text normalization approach requires sentence and token ids to encode and generate batches
    final_df = pd.DataFrame(columns=['sentence_id',
                                     'token_id',
                                     'semiotic',
                                     'before',
                                     'after'])
    # initialize columns and iterator
    sentence_id = 0
    token_id = -1
    pid = str(os.getpid())
    fname = out+'_ID#'+pid
    # heavy processing ahead
    for row in df.itertuples():
        # look for end of sentences
        if row.semiotic == '<eos>':
            sentence_id += 1
            token_id = -1
            continue
        else:
            token_id += 1
    
        new_row = {'sentence_id': sentence_id,
                   'token_id': token_id,
                   'semiotic': row.semiotic,
                   'before': row.before,
                   'after': row.after}
        final_df = final_df.append(new_row, ignore_index=True)
        done_percent = 100*final_df.shape[0]/df.shape[0]
        print('Done: {:.2f}% \tSentence ID#{}'.format(done_percent,sentence_id))
        # save after every batch of 100 sentences
        if not sentence_id%100:
            print('SAVING UPTO SENTENCE ID#{}'.format(sentence_id))
            final_df.to_csv(fname+'_unprocessed', index=False)
            
    print('Done: 100%')
    print('Transforming after tokens')
    # **Transforming 'after' tokens**  
    # From the above mentioned paper:
    # ```
    # Semiotic class instances are verbalized as sequences
    # of fully spelled words, most ordinary words are left alone (rep-
    # resented here as <self>), and punctuation symbols are mostly
    # transduced to sil (for “silence”).
    # ```
    # Hence we transform as follows:
    # 1. sil is replaced with < self >
    # 2. < self > is replaced with the before column
    # 
    sil_mask = (final_df['after'] == 'sil')
    final_df.loc[sil_mask, 'after'] = '<self>' 
    self_mask = (final_df['after'] == '<self>')
    final_df.loc[self_mask, ('after')] = final_df.loc[self_mask, 'before']
    final_df.to_csv(fname, index=False)
    return final_df

In [16]:
def split_eos(df, num_splits):
    eos_rows = df[df['before'] == '<eos>']
    print('Number of sentences: {}'.format(eos_rows.shape[0]))
    total_sentences = eos_rows.shape[0]
    interval = total_sentences//num_splits
    split_marks=[eos_rows.iloc[i*interval-1].name for i in range(0,num_splits+1)]
    # start the split at 0 index
    # rather than at the first eos mark
    split_marks[0]=0
    df_splits = []
    for here, there in zip(split_marks, split_marks[1:]):
        df_splits.append(df.iloc[here:there])
    return df_splits

In [17]:
raw_splits = split_eos(raw_data, nsplits)

Number of sentences: 880672


### Launch Multi-Processes

In [None]:
pool = Pool(ncores)
final_results = pool.map(gen_sentenceid, raw_splits)

Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
Done: 0.01% 	Sentence ID#2
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#0
SAVING UPTO SENTENCE ID#0
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#2
Done: 0.00% 	Sentence ID#1
Done: 0.00% 	Sentence ID#2
Done: 0.00% 	Sentence ID#2
Done: 0.00% 	Sentence ID#2
Done: 0.00% 	Sentence ID#2
Done: 0.00% 	Sentence ID#2
Done: 0.00%

Done: 0.02% 	Sentence ID#7
Done: 0.01% 	Sentence ID#4
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#7
Done: 0.02% 	Sentence ID#7
Done: 0.01% 	Sentence ID#4
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#7
Done: 0.02% 	Sentence ID#7
Done: 0.01% 	Sentence ID#4
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#8
Done: 0.02% 	Sentence ID#7
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#7
Done: 0.02% 	Sentence ID#8
Done: 0.01% 	Sentence ID#4
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#8
Done: 0.02% 	Sentence ID#7
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#9
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#8
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#9
Done: 0.02% 	Sentence ID#8
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#5
Done: 0.02% 	Sentence ID#9
Done: 0.02% 	Sentence ID#8
Done: 0.01% 	Sentence ID#6
Done: 0.02% 	Sentence ID#5
Done: 0.02% 	Sentence ID#9
Done: 0.02% 	Sentence ID#8
Done: 0.01% 	Sentence ID#6
Done: 0.02% 	Sentence ID#5
D

Done: 0.00% 	Sentence ID#1
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#12
Done: 0.04% 	Sentence ID#15
Done: 0.03% 	Sentence ID#10
Done: 0.03% 	Sentence ID#12
Done: 0.00% 	Sentence ID#1
Done: 0.04% 	Sentence ID#15
Done: 0.01% 	Sentence ID#5
Done: 0.02% 	Sentence ID#12
Done: 0.03% 	Sentence ID#10
Done: 0.00% 	Sentence ID#1
Done: 0.03% 	Sentence ID#12
Done: 0.04% 	Sentence ID#15
Done: 0.01% 	Sentence ID#5
Done: 0.03% 	Sentence ID#10
Done: 0.02% 	Sentence ID#12
Done: 0.00% 	Sentence ID#1
Done: 0.04% 	Sentence ID#16
Done: 0.01% 	Sentence ID#5
Done: 0.03% 	Sentence ID#12
Done: 0.03% 	Sentence ID#10
Done: 0.00% 	Sentence ID#1
Done: 0.02% 	Sentence ID#12
Done: 0.04% 	Sentence ID#16
Done: 0.01% 	Sentence ID#5
Done: 0.03% 	Sentence ID#10
Done: 0.03% 	Sentence ID#12
Done: 0.00% 	Sentence ID#1
Done: 0.02% 	Sentence ID#12
Done: 0.04% 	Sentence ID#16
Done: 0.01% 	Sentence ID#5
Done: 0.03% 	Sentence ID#10
Done: 0.03% 	Sentence ID#12
Done: 0.04% 	Sentence ID#16
Done: 0.00% 	Sentence ID#1
Done:

Done: 0.04% 	Sentence ID#18
Done: 0.02% 	Sentence ID#10
Done: 0.01% 	Sentence ID#2
Done: 0.04% 	Sentence ID#15
Done: 0.03% 	Sentence ID#17
Done: 0.01% 	Sentence ID#4
Done: 0.04% 	Sentence ID#13
Done: 0.04% 	Sentence ID#18
Done: 0.02% 	Sentence ID#11
Done: 0.01% 	Sentence ID#3
Done: 0.04% 	Sentence ID#16
Done: 0.01% 	Sentence ID#4
Done: 0.04% 	Sentence ID#13
Done: 0.03% 	Sentence ID#17
Done: 0.04% 	Sentence ID#18
Done: 0.01% 	Sentence ID#3
Done: 0.04% 	Sentence ID#16
Done: 0.01% 	Sentence ID#4
Done: 0.02% 	Sentence ID#11
Done: 0.04% 	Sentence ID#18
Done: 0.04% 	Sentence ID#13
Done: 0.03% 	Sentence ID#18
Done: 0.01% 	Sentence ID#4
Done: 0.04% 	Sentence ID#16
Done: 0.01% 	Sentence ID#3
Done: 0.02% 	Sentence ID#11
Done: 0.04% 	Sentence ID#18
Done: 0.03% 	Sentence ID#18
Done: 0.04% 	Sentence ID#13
Done: 0.01% 	Sentence ID#4
Done: 0.01% 	Sentence ID#3
Done: 0.04% 	Sentence ID#18
Done: 0.03% 	Sentence ID#18
Done: 0.02% 	Sentence ID#11
Done: 0.04% 	Sentence ID#13
Done: 0.04% 	Sentence ID#16
Do

Done: 0.05% 	Sentence ID#21
Done: 0.02% 	Sentence ID#8
Done: 0.01% 	Sentence ID#1
Done: 0.01% 	Sentence ID#5
Done: 0.04% 	Sentence ID#16
Done: 0.05% 	Sentence ID#18
Done: 0.02% 	Sentence ID#8
Done: 0.03% 	Sentence ID#16
Done: 0.05% 	Sentence ID#21
Done: 0.00% 	Sentence ID#1
Done: 0.01% 	Sentence ID#1
Done: 0.01% 	Sentence ID#5
Done: 0.04% 	Sentence ID#21
Done: 0.04% 	Sentence ID#16
Done: 0.05% 	Sentence ID#18
Done: 0.03% 	Sentence ID#16
Done: 0.02% 	Sentence ID#8
Done: 0.05% 	Sentence ID#21
Done: 0.00% 	Sentence ID#1
Done: 0.01% 	Sentence ID#1
Done: 0.01% 	Sentence ID#5
Done: 0.04% 	Sentence ID#21
Done: 0.04% 	Sentence ID#16
Done: 0.03% 	Sentence ID#16
Done: 0.05% 	Sentence ID#18
Done: 0.02% 	Sentence ID#8
Done: 0.05% 	Sentence ID#22
Done: 0.00% 	Sentence ID#1
Done: 0.01% 	Sentence ID#5
Done: 0.01% 	Sentence ID#2
Done: 0.04% 	Sentence ID#21
Done: 0.04% 	Sentence ID#16
Done: 0.05% 	Sentence ID#18
Done: 0.02% 	Sentence ID#8
Done: 0.05% 	Sentence ID#22
Done: 0.03% 	Sentence ID#16
Done: 0.

Done: 0.01% 	Sentence ID#4
Done: 0.01% 	Sentence ID#3
Done: 0.06% 	Sentence ID#22
Done: 0.00% 	Sentence ID#2
Done: 0.05% 	Sentence ID#18
Done: 0.06% 	Sentence ID#24
Done: 0.04% 	Sentence ID#23
Done: 0.04% 	Sentence ID#18
Done: 0.02% 	Sentence ID#8
Done: 0.06% 	Sentence ID#22
Done: 0.01% 	Sentence ID#3
Done: 0.03% 	Sentence ID#10
Done: 0.05% 	Sentence ID#18
Done: 0.06% 	Sentence ID#24
Done: 0.01% 	Sentence ID#3
Done: 0.06% 	Sentence ID#22
Done: 0.01% 	Sentence ID#4
Done: 0.04% 	Sentence ID#23
Done: 0.00% 	Sentence ID#2
Done: 0.02% 	Sentence ID#8
Done: 0.04% 	Sentence ID#19
Done: 0.03% 	Sentence ID#10
Done: 0.06% 	Sentence ID#24
Done: 0.06% 	Sentence ID#22
Done: 0.04% 	Sentence ID#23
Done: 0.01% 	Sentence ID#3
Done: 0.04% 	Sentence ID#19
Done: 0.00% 	Sentence ID#2
Done: 0.02% 	Sentence ID#8
Done: 0.05% 	Sentence ID#18
Done: 0.01% 	Sentence ID#4
Done: 0.03% 	Sentence ID#10
Done: 0.06% 	Sentence ID#25
Done: 0.06% 	Sentence ID#22
Done: 0.01% 	Sentence ID#3
Done: 0.04% 	Sentence ID#19
Done: 

**Sanity Check**

In [20]:
total_tokens=0
for i in final_results:
    total_tokens += i.shape[0]
total_tokens

10561150

In [21]:
final_results[0].head(15)

Unnamed: 0,sentence_id,token_id,semiotic,before,after
0,0,0,PLAIN,Brillantaisia,Brillantaisia
1,0,1,PLAIN,is,is
2,0,2,PLAIN,a,a
3,0,3,PLAIN,genus,genus
4,0,4,PLAIN,of,of
5,0,5,PLAIN,plant,plant
6,0,6,PLAIN,in,in
7,0,7,PLAIN,family,family
8,0,8,PLAIN,Acanthaceae,Acanthaceae
9,0,9,PUNCT,.,.


In [22]:
raw_data.head(15)

Unnamed: 0,semiotic,before,after
0,PLAIN,Brillantaisia,<self>
1,PLAIN,is,<self>
2,PLAIN,a,<self>
3,PLAIN,genus,<self>
4,PLAIN,of,<self>
5,PLAIN,plant,<self>
6,PLAIN,in,<self>
7,PLAIN,family,<self>
8,PLAIN,Acanthaceae,<self>
9,PUNCT,.,sil


## Transforming 'after' tokens

From the above mentioned paper:
```
Semiotic class instances are verbalized as sequences
of fully spelled words, most ordinary words are left alone (rep-
resented here as <self>), and punctuation symbols are mostly
transduced to sil (for “silence”).
```
Hence we transform as follows:
1. sil is replaced with < self >
2. < self > is replaced with the before column

In [None]:
sil_mask = (data['after'] == 'sil')
data.loc[sil_mask, 'after'] = '<self>' 
self_mask = (data['after'] == '<self>')
data.loc[self_mask, ('after')] = data.loc[self_mask, 'before']

## Save Data

In [23]:
for i in range(len(final_results)):
    final_results[i].to_csv('processed{}'.format(i), index=False)

___