# Creating different datasets using the raw data.

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from pathlib import Path
from scripts.parser_utils import file_parser,write_repeat_files,write_weighted_files,write_max_files,create_repeat_data,\
                        create_weighted_data,create_max_data,create_max_weighted_and_repeated_tsv,normalize,create_corpus

In [2]:
# --- Directory layout (everything is resolved from the current working directory) ---
cur_dir = Path.cwd()
data_dir = cur_dir.joinpath("data")
raw_data_dir = data_dir.joinpath("raw_data")
proc_data_dir = data_dir.joinpath("processed_data")

# --- Input files ---
# Small sample used by the illustrative cells.
sample_file = raw_data_dir.joinpath("sample.tsv")

# NOTE: the training input is read from processed_data ("train_clean.tsv"),
# whereas the dev and test inputs are read from raw_data.
train_file = proc_data_dir.joinpath("train_clean.tsv")
dev_file = raw_data_dir.joinpath("te.translit.sampled.dev.tsv")
test_file = raw_data_dir.joinpath("te.translit.sampled.test.tsv")

# --- Output files: one weighted / max / repeat variant per split ---
weighted_sample_file = proc_data_dir.joinpath("weighted_sample.tsv")
max_sample_file = proc_data_dir.joinpath("max_sample.tsv")
repeat_sample_file = proc_data_dir.joinpath("repeat_sample.tsv")

weighted_dev_file = proc_data_dir.joinpath("weighted_dev.tsv")
max_dev_file = proc_data_dir.joinpath("max_dev.tsv")
repeat_dev_file = proc_data_dir.joinpath("repeat_dev.tsv")

weighted_train_file = proc_data_dir.joinpath("weighted_train.tsv")
max_train_file = proc_data_dir.joinpath("max_train.tsv")
repeat_train_file = proc_data_dir.joinpath("repeat_train.tsv")

weighted_test_file = proc_data_dir.joinpath("weighted_test.tsv")
max_test_file = proc_data_dir.joinpath("max_test.tsv")
repeat_test_file = proc_data_dir.joinpath("repeat_test.tsv")

# --- Output files: plain-text corpora ---
tgt_corpus_file = proc_data_dir.joinpath("target_corpus.txt")
src_corpus_file = proc_data_dir.joinpath("source_corpus.txt")

In [3]:
# Load the headerless sample TSV into a frame with named columns
# (native-script target, romanized source, count).
# keep_default_na=False prevents pandas from converting literal tokens such as
# "nan", "null" or "NA" into missing values, which would silently corrupt the
# string columns if such a token ever appears as a word.
df_sample = pd.read_csv(
    sample_file,
    sep="\t",
    header=None,
    names=["target", "source", "frequency"],
    keep_default_na=False,
)
df_sample.head()

Unnamed: 0,target,source,frequency
0,అంక,amka,1
1,అంక,anka,3
2,అంకం,amkam,1
3,అంకం,ankam,2
4,అంగీకరించ,amgiikarimcha,1


In [4]:
# Show every row of the sample file as a list of its tab-separated fields,
# read directly from disk (no pandas parsing involved).
with sample_file.open('r', encoding='utf-8') as handle:
    for raw_line in handle:
        fields = raw_line.strip().split("\t")
        print(fields)

['అంక', 'amka', '1']
['అంక', 'anka', '3']
['అంకం', 'amkam', '1']
['అంకం', 'ankam', '2']
['అంగీకరించ', 'amgiikarimcha', '1']
['అంగీకరించ', 'angeekarincha', '2']
['అంగీకరించడం', 'angeekarinchadam', '1']
['అంగీకరించడం', 'angikarinchadam', '3']
['అంగీకరించడానికి', 'angeekarinchadaniki', '2']
['అంగీకరించడానికి', 'angikarinchadaniki', '1']
['అంగీకరించని', 'angeekarinchani', '2']
['అంగీకరించని', 'angikarinchani', '1']
['అంగీకరించారు', 'angeekarinchaaru', '1']
['అంగీకరించారు', 'angeekarincharu', '1']
['అంగీకరించారు', 'angikarinchaaru', '2']
['అంగీకరించింది', 'angeekarinchidi', '1']
['అంగీకరించింది', 'angeekarinchindhi', '1']
['అంగీకరించింది', 'angikarinchindi', '1']
['అంటరాని', 'antaraani', '3']
['అంతం', 'anatham', '1']
['అంతం', 'antam', '1']
['అంతం', 'antham', '1']
['అంతము', 'antamu', '2']
['అంతము', 'anthamu', '3']


In [5]:
# Parse the sample TSV into a dict; as the output below shows, each target word
# maps to a list of (romanization, count) tuples.
sample_dict = file_parser(sample_file)
sample_dict

{'అంక': [('amka', 1), ('anka', 3)],
 'అంకం': [('amkam', 1), ('ankam', 2)],
 'అంగీకరించ': [('amgiikarimcha', 1), ('angeekarincha', 2)],
 'అంగీకరించడం': [('angeekarinchadam', 1), ('angikarinchadam', 3)],
 'అంగీకరించడానికి': [('angeekarinchadaniki', 2), ('angikarinchadaniki', 1)],
 'అంగీకరించని': [('angeekarinchani', 2), ('angikarinchani', 1)],
 'అంగీకరించారు': [('angeekarinchaaru', 1),
  ('angeekarincharu', 1),
  ('angikarinchaaru', 2)],
 'అంగీకరించింది': [('angeekarinchidi', 1),
  ('angeekarinchindhi', 1),
  ('angikarinchindi', 1)],
 'అంటరాని': [('antaraani', 3)],
 'అంతం': [('anatham', 1), ('antam', 1), ('antham', 1)],
 'అంతము': [('antamu', 2), ('anthamu', 3)]}

In [6]:
# Quick check of the helper on a tuple of counts: (1, 3) -> (0.25, 0.75).
normalize((1,3))

(0.25, 0.75)

In [7]:
# Replace each raw count with a per-target weight. The values shown below are
# rounded to three decimals (e.g. 0.333), so they need not sum exactly to 1.
weighted_sample_data = create_weighted_data(sample_dict)
weighted_sample_data

{'అంక': [('amka', 0.25), ('anka', 0.75)],
 'అంకం': [('amkam', 0.333), ('ankam', 0.667)],
 'అంగీకరించ': [('amgiikarimcha', 0.333), ('angeekarincha', 0.667)],
 'అంగీకరించడం': [('angeekarinchadam', 0.25), ('angikarinchadam', 0.75)],
 'అంగీకరించడానికి': [('angeekarinchadaniki', 0.667),
  ('angikarinchadaniki', 0.333)],
 'అంగీకరించని': [('angeekarinchani', 0.667), ('angikarinchani', 0.333)],
 'అంగీకరించారు': [('angeekarinchaaru', 0.25),
  ('angeekarincharu', 0.25),
  ('angikarinchaaru', 0.5)],
 'అంగీకరించింది': [('angeekarinchidi', 0.333),
  ('angeekarinchindhi', 0.333),
  ('angikarinchindi', 0.333)],
 'అంటరాని': [('antaraani', 1.0)],
 'అంతం': [('anatham', 0.333), ('antam', 0.333), ('antham', 0.333)],
 'అంతము': [('antamu', 0.4), ('anthamu', 0.6)]}

In [8]:
# Keep a single romanization per target: the one with the highest count.
# In the output below, ties go to the first-listed candidate
# (e.g. 'anatham' is chosen among three candidates that all have count 1).
max_sample_data = create_max_data(sample_dict)
max_sample_data

{'అంక': 'anka',
 'అంకం': 'ankam',
 'అంగీకరించ': 'angeekarincha',
 'అంగీకరించడం': 'angikarinchadam',
 'అంగీకరించడానికి': 'angeekarinchadaniki',
 'అంగీకరించని': 'angeekarinchani',
 'అంగీకరించారు': 'angikarinchaaru',
 'అంగీకరించింది': 'angeekarinchidi',
 'అంటరాని': 'antaraani',
 'అంతం': 'anatham',
 'అంతము': 'anthamu'}

In [9]:
# Expand each romanization into as many copies as its count,
# e.g. ('anka', 3) becomes 'anka', 'anka', 'anka' in the output below.
repeat_sample_data = create_repeat_data(sample_dict)
repeat_sample_data

{'అంక': ['amka', 'anka', 'anka', 'anka'],
 'అంకం': ['amkam', 'ankam', 'ankam'],
 'అంగీకరించ': ['amgiikarimcha', 'angeekarincha', 'angeekarincha'],
 'అంగీకరించడం': ['angeekarinchadam',
  'angikarinchadam',
  'angikarinchadam',
  'angikarinchadam'],
 'అంగీకరించడానికి': ['angeekarinchadaniki',
  'angeekarinchadaniki',
  'angikarinchadaniki'],
 'అంగీకరించని': ['angeekarinchani', 'angeekarinchani', 'angikarinchani'],
 'అంగీకరించారు': ['angeekarinchaaru',
  'angeekarincharu',
  'angikarinchaaru',
  'angikarinchaaru'],
 'అంగీకరించింది': ['angeekarinchidi', 'angeekarinchindhi', 'angikarinchindi'],
 'అంటరాని': ['antaraani', 'antaraani', 'antaraani'],
 'అంతం': ['anatham', 'antam', 'antham'],
 'అంతము': ['antamu', 'antamu', 'anthamu', 'anthamu', 'anthamu']}

In [10]:
%%time
# Generate the max / weighted / repeat variants of the sample file.
# NOTE(review): presumably writes one TSV per output path — confirm in scripts.parser_utils.
create_max_weighted_and_repeated_tsv(sample_file,max_sample_file,weighted_sample_file, repeat_sample_file)

Wall time: 34.6 ms


In [11]:
%%time
# Generate the max / weighted / repeat variants for every split.
# Order is dev, train, test; each tuple is (input, max output, weighted output, repeat output).
split_jobs = [
    (dev_file, max_dev_file, weighted_dev_file, repeat_dev_file),
    (train_file, max_train_file, weighted_train_file, repeat_train_file),
    (test_file, max_test_file, weighted_test_file, repeat_test_file),
]
for in_path, max_path, weighted_path, repeat_path in split_jobs:
    create_max_weighted_and_repeated_tsv(in_path, max_path, weighted_path, repeat_path)

Wall time: 1.71 s


In [12]:
# Inspect the generated weighted training file (headerless TSV: target, source, weight).
# keep_default_na=False prevents pandas from converting literal tokens such as
# "nan", "null" or "NA" in the text columns into missing values.
df_weighted_train = pd.read_csv(
    weighted_train_file,
    sep="\t",
    header=None,
    names=["target", "source", "weights"],
    keep_default_na=False,
)
df_weighted_train.head()

Unnamed: 0,target,source,weights
0,అంకిత,amkita,0.2
1,అంకిత,ankita,0.4
2,అంకిత,ankitha,0.4
3,అంకితం,ankitam,0.333
4,అంకితం,ankitham,0.667


In [14]:
# Inspect the generated repeat training file (headerless TSV: target, source).
# keep_default_na=False prevents pandas from converting literal tokens such as
# "nan", "null" or "NA" in the text columns into missing values.
df_repeat_train = pd.read_csv(
    repeat_train_file,
    sep="\t",
    header=None,
    names=["target", "source"],
    keep_default_na=False,
)
df_repeat_train.head()

Unnamed: 0,target,source
0,అంకిత,amkita
1,అంకిత,ankita
2,అంకిత,ankita
3,అంకిత,ankitha
4,అంకిత,ankitha


In [15]:
# Inspect the generated max training file (headerless TSV: target, source).
# keep_default_na=False prevents pandas from converting literal tokens such as
# "nan", "null" or "NA" in the text columns into missing values.
df_max_train = pd.read_csv(
    max_train_file,
    sep="\t",
    header=None,
    names=["target", "source"],
    keep_default_na=False,
)
df_max_train.head()

Unnamed: 0,target,source
0,అంకిత,ankita
1,అంకితం,ankitham
2,అంకితభావం,ankithabhavam
3,అంకితమిచ్చాడు,ankitamichhaadu
4,అంకితమిచ్చారు,amkithamichaaru


In [16]:
# Inspect the training input the variants were generated from
# (headerless TSV: target, source, count).
# keep_default_na=False prevents pandas from converting literal tokens such as
# "nan", "null" or "NA" in the text columns into missing values.
df_train = pd.read_csv(
    train_file,
    sep="\t",
    header=None,
    names=["target", "source", "frequency"],
    keep_default_na=False,
)
df_train.head()

Unnamed: 0,target,source,frequency
0,అంకిత,amkita,1
1,అంకిత,ankita,2
2,అంకిత,ankitha,2
3,అంకితం,ankitam,1
4,అంకితం,ankitham,2


In [27]:
# Build the source and target corpus text files from the training TSV.
# NOTE(review): called here as (input, source corpus path, target corpus path) —
# confirm this matches the parameter order of create_corpus.
create_corpus(train_file,src_corpus_file,tgt_corpus_file)