### Loading data from file

In [1]:
import pandas as pd

# Your existing data loading code
data = []

with open('parallel-n/IITB.en-hi.en', 'r', encoding='utf-8') as f_en, \
     open('parallel-n/IITB.en-hi.hi', 'r', encoding='utf-8') as f_hi:

    for en_line, hi_line in zip(f_en, f_hi):
        en_tokens = en_line.strip().split()
        hi_tokens = hi_line.strip().split()
        data.append((en_tokens, hi_tokens))

# Convert to DataFrame
df = pd.DataFrame(data, columns=['en_tokens', 'hi_tokens'])

# Show the first few rows
print(df.head())


                                           en_tokens  \
0  [Give, your, application, an, accessibility, w...   
1              [Accerciser, Accessibility, Explorer]   
2  [The, default, plugin, layout, for, the, botto...   
3  [The, default, plugin, layout, for, the, top, ...   
4  [A, list, of, plugins, that, are, disabled, by...   

                                           hi_tokens  
0  [अपने, अनुप्रयोग, को, पहुंचनीयता, व्यायाम, का,...  
1                [एक्सेर्साइसर, पहुंचनीयता, अन्वेषक]  
2      [निचले, पटल, के, लिए, डिफोल्ट, प्लग-इन, खाका]  
3       [ऊपरी, पटल, के, लिए, डिफोल्ट, प्लग-इन, खाका]  
4  [उन, प्लग-इनों, की, सूची, जिन्हें, डिफोल्ट, रू...  


In [2]:
# size of dataset
print(df.shape)

(1659083, 2)


In [3]:
# Lower casing the english records
df['en_tokens'] = df['en_tokens'].apply(lambda tokens: [token.lower() for token in tokens])

In [4]:
# Counting length of each records
df['en_length'] = df['en_tokens'].apply(len)
df['hi_length'] = df['hi_tokens'].apply(len)


#### Length wise distribution of data

In [5]:
import numpy as np

# Define bins (you can adjust the range and step if needed)
bins = list(range(0, 105, 5))  # 0–5, 5–10, ..., 100+
labels = [f"{i}-{i+5}" for i in bins[:-1]]

# Bin the lengths
df['en_bin'] = pd.cut(df['en_length'], bins=bins, labels=labels, right=False)
df['hi_bin'] = pd.cut(df['hi_length'], bins=bins, labels=labels, right=False)

# Count frequencies
en_dist = df['en_bin'].value_counts().sort_index()
hi_dist = df['hi_bin'].value_counts().sort_index()

# Print the distribution
print(f"{'Length Range':<10} | {'English':>8} | {'Hindi':>8}")
print("-" * 32)
for label in labels:
    en_count = en_dist.get(label, 0)
    hi_count = hi_dist.get(label, 0)
    print(f"{label:<10} | {en_count:>8} | {hi_count:>8}")


Length Range |  English |    Hindi
--------------------------------
0-5        |   593705 |   578703
5-10       |   281225 |   268038
10-15      |   234844 |   225632
15-20      |   167756 |   166374
20-25      |   121542 |   122311
25-30      |    84216 |    88680
30-35      |    56216 |    61801
35-40      |    36188 |    40461
40-45      |    23667 |    28146
45-50      |    15382 |    19852
50-55      |    10953 |    14001
55-60      |     7848 |    10055
60-65      |     5771 |     7240
65-70      |     4033 |     5704
70-75      |     3188 |     4180
75-80      |     2292 |     3412
80-85      |     1877 |     2558
85-90      |     1429 |     2184
90-95      |     1098 |     1746
95-100     |      998 |     1242


#### Only keeping records between length 5 and 30

In [6]:
filtered_df = df[(df['en_length'] >= 5) & (df['en_length'] <= 30) &
                 (df['hi_length'] >= 5) & (df['hi_length'] <= 30)]

# Show filtered size and preview
print(f"Total records after filtering: {len(filtered_df)}")
print(filtered_df.head())


Total records after filtering: 815825
                                           en_tokens  \
0  [give, your, application, an, accessibility, w...   
2  [the, default, plugin, layout, for, the, botto...   
3  [the, default, plugin, layout, for, the, top, ...   
4  [a, list, of, plugins, that, are, disabled, by...   
6  [the, duration, of, the, highlight, box, when,...   

                                           hi_tokens  en_length  hi_length  \
0  [अपने, अनुप्रयोग, को, पहुंचनीयता, व्यायाम, का,...          6          8   
2      [निचले, पटल, के, लिए, डिफोल्ट, प्लग-इन, खाका]          8          7   
3       [ऊपरी, पटल, के, लिए, डिफोल्ट, प्लग-इन, खाका]          8          7   
4  [उन, प्लग-इनों, की, सूची, जिन्हें, डिफोल्ट, रू...          9         12   
6  [पहुंचनीय, आसंधि, (नोड), को, चुनते, समय, हाइला...         10         10   

  en_bin hi_bin  
0   5-10   5-10  
2   5-10   5-10  
3   5-10   5-10  
4   5-10  10-15  
6  10-15  10-15  


#### Removing duplicates

In [7]:
# Make a copy to avoid SettingWithCopyWarning
filtered_df = filtered_df.copy()

# Convert token lists to strings for duplication check
filtered_df['en_str'] = filtered_df['en_tokens'].apply(lambda x: ' '.join(x))
filtered_df['hi_str'] = filtered_df['hi_tokens'].apply(lambda x: ' '.join(x))

# Find duplicates
duplicates = filtered_df[filtered_df.duplicated(subset=['en_str', 'hi_str'], keep=False)]

# Print result
print(f"Total duplicate records: {len(duplicates)}")
print(duplicates[['en_str', 'hi_str']].head())


Total duplicate records: 105443
                                              en_str  \
0     give your application an accessibility workout   
2     the default plugin layout for the bottom panel   
3        the default plugin layout for the top panel   
4     a list of plugins that are disabled by default   
6  the duration of the highlight box when selecti...   

                                              hi_str  
0    अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें  
2              निचले पटल के लिए डिफोल्ट प्लग-इन खाका  
3               ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका  
4  उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...  
6  पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्स...  


In [8]:
# before reming duplicate
filtered_df.shape

(815825, 8)

In [9]:
# droping duplicates
filtered_df = filtered_df.drop_duplicates(subset=['en_str', 'hi_str']).reset_index(drop=True)

In [10]:
filtered_df = filtered_df[['en_tokens', 'hi_tokens']]

In [11]:
# unique data
filtered_df.shape

(744797, 2)

In [12]:
filtered_df['en_tokens'].iloc[:100]

0     [give, your, application, an, accessibility, w...
1     [the, default, plugin, layout, for, the, botto...
2     [the, default, plugin, layout, for, the, top, ...
3     [a, list, of, plugins, that, are, disabled, by...
4     [the, duration, of, the, highlight, box, when,...
                            ...                        
95            [move, ~, a, onto, an, empty, top, slot.]
96         [move, ~, a, onto, an, empty, bottom, slot.]
97           [move, ~, a, onto, an, empty, left, slot.]
98          [move, ~, a, onto, an, empty, right, slot.]
99                 [move, ~, a, onto, an, empty, slot.]
Name: en_tokens, Length: 100, dtype: object

In [13]:
filtered_df['hi_tokens'].iloc[:100]

0     [अपने, अनुप्रयोग, को, पहुंचनीयता, व्यायाम, का,...
1         [निचले, पटल, के, लिए, डिफोल्ट, प्लग-इन, खाका]
2          [ऊपरी, पटल, के, लिए, डिफोल्ट, प्लग-इन, खाका]
3     [उन, प्लग-इनों, की, सूची, जिन्हें, डिफोल्ट, रू...
4     [पहुंचनीय, आसंधि, (नोड), को, चुनते, समय, हाइला...
                            ...                        
95    [~, a, को, एक, खाली, शीर्ष, स्लॉट, में, ले, जा...
96    [~, a, को, एक, खाली, नीचे, स्लॉट, में, ले, जाएँ.]
97    [~, a, को, एक, खाली, बाएं, स्लॉट, में, ले, जाएँ.]
98    [~, a, को, एक, खाली, दाएँ, स्लॉट, में, ले, जाएँ.]
99          [~, a, को, एक, खाली, स्लॉट, में, ले, जाएँ.]
Name: hi_tokens, Length: 100, dtype: object

#### Cleaning daa and finding vocab size

In [14]:
import re

# Clean and build English vocab
en_vocab_set = set()
for tokens in filtered_df['en_tokens']:
    # Clean each token
    cleaned = [re.sub(r'[^a-z\s]', '', t.lower()) for t in tokens if t]
    cleaned = [t for t in cleaned if t]
    en_vocab_set.update(cleaned)

# Clean and build Hindi vocab
hi_vocab_set = set()
for tokens in filtered_df['hi_tokens']:
    cleaned = [re.sub(r'[^\u0900-\u097F\s।]', '', t) for t in tokens if t]
    cleaned = [t for t in cleaned if t]
    hi_vocab_set.update(cleaned)

# Add special tokens and sort
en_vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + sorted(en_vocab_set)
hi_vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + sorted(hi_vocab_set)

# Print sizes
print(f"English vocab size: {len(en_vocab)}")
print(f"Hindi vocab size: {len(hi_vocab)}")


English vocab size: 137908
Hindi vocab size: 246824


In [15]:
print(en_vocab[:100])

['<pad>', '<sos>', '<eos>', '<unk>', 'a', 'aa', 'aaa', 'aaad', 'aaagnf', 'aaar', 'aaarti', 'aab', 'aaber', 'aabu', 'aac', 'aacharan', 'aacharati', 'aacharya', 'aache', 'aachha', 'aachman', 'aachshaaya', 'aacr', 'aacrii', 'aactpq', 'aad', 'aadalf', 'aadam', 'aadanthy', 'aadepigmented', 'aadh', 'aadha', 'aadhaar', 'aadhaarin', 'aadhar', 'aadharbased', 'aadharlinked', 'aadheenams', 'aadhi', 'aadhunik', 'aadi', 'aadikavya', 'aadim', 'aadiparva', 'aadivasi', 'aadmi', 'aads', 'aadvise', 'aadyasevak', 'aaea', 'aaeene', 'aaeli', 'aaen', 'aag', 'aagam', 'aagamas', 'aagamshastra', 'aagara', 'aagosh', 'aagra', 'aah', 'aahamadiya', 'aahe', 'aahwan', 'aai', 'aaifr', 'aain', 'aaine', 'aainst', 'aaishwarya', 'aaj', 'aajach', 'aajad', 'aajan', 'aajeevika', 'aajkal', 'aajmagadh', 'aakaar', 'aakashdeep', 'aakasher', 'aakashwani', 'aakbari', 'aakhr', 'aakhri', 'aakhus', 'aakhusa', 'aakraneox', 'aalam', 'aalanguchchi', 'aalborg', 'aale', 'aalh', 'aali', 'aalims', 'aam', 'aamaththoor', 'aamba', 'aamer', 'a

In [16]:
print(hi_vocab[:100])

['<pad>', '<sos>', '<eos>', '<unk>', 'ं', 'ंं', 'ंंउ', 'ंंएम', 'ंंष्', 'ंंष्आरोग्य', 'ंंहूऍं', 'ंअइन्लिने', 'ंअकेर्', 'ंअटेर्निट्य्', 'ंअन्च्हेस्टेर्', 'ंआ', 'ंइस', 'ंईश्थ्', 'ंउस्ट्', 'ंएअस्लेस्', 'ंएडिचल्', 'ंएसोअमेरिचन', 'ंऐणीण्घीठीश्', 'ंओं', 'ंओरेचम्बे', 'ंके', 'ंकोई', 'ंजो', 'ंडऋए', 'ंड़ा', 'ंड़ी', 'ंड़े', 'ंडिया', 'ंण्', 'ंदेश', 'ंदेशे', 'ंफ्श्', 'ंबं', 'ंमम्स', 'ंमीसल्स्', 'ंमेरे', 'ंष्ठ्फ्', 'ंसूचना', 'ंसे', 'ः', 'ःंशौ', 'ःइ', 'ःइआईढ्श्', 'ःइग्हऋ', 'ःइग्ह्', 'ःऊघ्', 'ःएअल्ट्हऋ', 'ःएअल्ट्ह्', 'ःएल्प्', 'ःओउसिन्ग्', 'ःओउसे', 'ःओत्', 'ःओमे', 'ःग्', 'ःघ्', 'ःछ्', 'ःशा', 'ःशै', 'ःश्', 'ःष्फ्', 'ः।', 'अ', 'अँ', 'अँकुरित', 'अँकुरितकिण्वित', 'अँग', 'अँगड़ाई', 'अँगरेज़', 'अँगरेज़ी', 'अँगरेजी', 'अँगीठी', 'अँगुठे', 'अँगुलियाँ', 'अँगुलियां', 'अँगुलियों', 'अँगुली', 'अँगूठा', 'अँगूठियां।', 'अँगूठी', 'अँगूठे', 'अँगों', 'अँग्रज़', 'अँग्रेज', 'अँग्रेज़', 'अँग्रेज़ी', 'अँग्रेज़ीसिंधी', 'अँग्रेज़ीसिंधीकोश', 'अँग्रेज़ों', 'अँग्रेजी', 'अँग्रेजी़ः', 'अँग्रेजों', 'अँग्रेज़', 'अँग्रेज़ी', 'अँघेरा', 'अँ

In [17]:
# converting data to [[<english tokens>,[<hindi token>]]
parallel_data = filtered_df.apply(lambda row: [row['en_tokens'], row['hi_tokens']], axis=1).tolist()
print(parallel_data[:2])

[[['give', 'your', 'application', 'an', 'accessibility', 'workout'], ['अपने', 'अनुप्रयोग', 'को', 'पहुंचनीयता', 'व्यायाम', 'का', 'लाभ', 'दें']], [['the', 'default', 'plugin', 'layout', 'for', 'the', 'bottom', 'panel'], ['निचले', 'पटल', 'के', 'लिए', 'डिफोल्ट', 'प्लग-इन', 'खाका']]]


#### Vocab too big for us so we will use SentencePieceTrainer.bpe to generate vocab size of 30000.

In [23]:
import re
import sentencepiece as spm
from pathlib import Path

# Your data: [[english_tokens, hindi_tokens], ...]
# Example: data = [["i", "am", "happy"], ["मैं", "खुश", "हूँ"]]

def merge(data):
    """Clean English and Hindi tokens in the data."""
    merged_data = []
    for eng_tokens, hin_tokens in data:
        merged_data.append([' '.join(eng_tokens), ' '.join(hin_tokens)])  # Join tokens back to sentences
    return merged_data

def train_tokenizer(cleaned_data, vocab_size=14000, retrain=False, model_prefix='en_hi'):
    """Train SentencePiece tokenizer and create vocab."""
    if retrain:
        # Write English and Hindi to temporary files
        with open('temp_en.txt', 'w', encoding='utf-8') as f_en, \
             open('temp_hi.txt', 'w', encoding='utf-8') as f_hi:
            for eng_sent, hin_sent in cleaned_data:
                f_en.write(eng_sent + '\n')
                f_hi.write(hin_sent + '\n')
        
        # Train SentencePiece model
        spm.SentencePieceTrainer.train(
            input='temp_en.txt,temp_hi.txt',  # Combined English and Hindi
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            character_coverage=1.0,  # Full coverage for Hindi
            model_type='bpe'  # Byte-Pair Encoding
        )
    
    # Load the trained tokenizer
    sp = spm.SentencePieceProcessor()
    sp.load(f'{model_prefix}.model')
    return sp

def tokenize_data(cleaned_data, tokenizer):
    """Tokenize cleaned data using the trained tokenizer."""
    tokenized_data = []
    for eng_sent, hin_sent in cleaned_data:
        eng_tokens = tokenizer.encode_as_pieces(eng_sent)  # Subword tokens
        hin_tokens = tokenizer.encode_as_pieces(hin_sent)
        tokenized_data.append([eng_tokens, hin_tokens])
    return tokenized_data





In [19]:
parallel_data = merge(parallel_data)

In [24]:
# training tokenizer
tokenizer = train_tokenizer(parallel_data, 30000,retrain=False)

In [25]:
tokenized_data = tokenize_data(parallel_data, tokenizer)

In [26]:
print(tokenized_data[0])

[['▁give', '▁your', '▁application', '▁an', '▁accessibility', '▁work', 'out'], ['▁अपने', '▁अनुप्रयोग', '▁को', '▁पहुंच', 'नीयता', '▁व्यायाम', '▁का', '▁लाभ', '▁दें']]


#### Adding `<bos>`,`<eos>`,`<pad>` to records, making them of 32 length. Also encoding data into ids.

In [27]:
import numpy as np

def tokens_to_ids(tokenized_data, tokenizer, max_len=32):
    """Convert tokenized data to numerical IDs with padding/truncation."""
    numerical_data = []

    vocab_size = tokenizer.get_piece_size()

    pad_id = tokenizer.pad_id()
    if pad_id == -1:
        pad_id = 0

    bos_id = tokenizer.bos_id()
    eos_id = tokenizer.eos_id()

    for eng_tokens, hin_tokens in tokenized_data:
        # Convert tokens to IDs
        eng_ids = tokenizer.encode_as_ids(' '.join(eng_tokens))
        hin_ids = tokenizer.encode_as_ids(' '.join(hin_tokens))

        # English: truncate to (max_len - 2), then add BOS and EOS
        eng_ids = eng_ids[:max_len - 2]
        eng_ids = [bos_id] + eng_ids + [eos_id]
        eng_ids += [pad_id] * (max_len - len(eng_ids))  # Final length = 32

        # Hindi: truncate to (max_len - 1), then add BOS and EOS
        hin_ids = hin_ids[:max_len - 1]
        hin_ids = [bos_id] + hin_ids + [eos_id]
        hin_ids += [pad_id] * (max_len + 1 - len(hin_ids))  # Final length = 33

        numerical_data.append([eng_ids, hin_ids])

    return numerical_data


def filter_by_length(numerical_data, tokenizer, min_len=10, max_len=32):
    """Filter out pairs where unpadded length is outside min_len to max_len."""
    filtered_data = []
    pad_id = tokenizer.pad_id()
    if pad_id == -1:
        pad_id = 0

    for eng_ids, hin_ids in numerical_data:
        eng_len = sum(1 for id_ in eng_ids if id_ != pad_id)
        hin_len = sum(1 for id_ in hin_ids if id_ != pad_id)

        if min_len <= eng_len <= max_len and min_len <= hin_len <= (max_len + 1):
            filtered_data.append([eng_ids, hin_ids])

    return filtered_data


In [28]:
numerical_data = tokens_to_ids(tokenized_data, tokenizer, max_len=32)

In [34]:
print(numerical_data[0])

[[1, 1707, 487, 3275, 175, 25935, 687, 596, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 354, 5509, 80, 1837, 39, 908, 61, 16873, 83, 1801, 3259, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [35]:
# Step 3: Convert to numpy arrays (optional, for tensor feeding)
numerical_arrays = np.array(numerical_data,dtype=object)  # Shape: (num_pairs, 2, max_len)

#### Saving final data as .npy file to make it reusable

In [36]:
np.save('numerical_data.npy', numerical_arrays)

In [37]:
numerical_arrays = np.load('numerical_data.npy', allow_pickle=True)
filtered_numerical_data = numerical_arrays.tolist()

from sklearn.model_selection import train_test_split
train_data, temp_data = train_test_split(numerical_arrays, test_size=0.05, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

np.save('train_data.npy', train_data)
np.save('val_data.npy', val_data)
np.save('test_data.npy', test_data)



In [38]:
test_data = np.load('test_data.npy', allow_pickle=True).tolist()
lenCount = {}
for en, hi in test_data:
    pos = en.index(2)
    if pos not in lenCount:
        lenCount[pos] = 0
    lenCount[pos] += 1
print(lenCount)

{30: 360, 28: 418, 13: 894, 31: 2252, 12: 892, 22: 643, 7: 383, 21: 683, 25: 590, 17: 804, 29: 410, 15: 891, 20: 740, 26: 515, 18: 762, 8: 544, 14: 930, 9: 751, 11: 932, 16: 834, 24: 566, 27: 509, 19: 743, 23: 590, 10: 835, 6: 149}
