# Cleaning the data

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## While the raw data is stored in raw_data_dir, any newly created files are stored in proc_data_dir

In [2]:
cur_dir = Path.cwd()
data_dir = cur_dir / "data"
raw_data_dir = data_dir / "raw_data"
proc_data_dir = data_dir / "processed_data"
sample_file = raw_data_dir / "sample.tsv"
dev_file = raw_data_dir / "te.translit.sampled.dev.tsv"
train_file = raw_data_dir / "te.translit.sampled.train.tsv"
test_file = raw_data_dir / "te.translit.sampled.test.tsv"

In [3]:
names=["target","source", "frequency"]
types = ["string","string","int8"]
col_types = dict(zip(names,types))

## We transliterate the text from source script(roman) to target script(telugu)

In [4]:
df_train= pd.read_csv(train_file, sep='\t',header = None , names=names)
df_dev= pd.read_csv(dev_file, sep='\t',header = None , names=names)
df_test= pd.read_csv(test_file, sep='\t',header = None , names=names)
df_dev.head()

Unnamed: 0,target,source,frequency
0,అంక,amka,1
1,అంక,anka,3
2,అంకం,amkam,1
3,అంకం,ankam,2
4,అంగీకరించ,amgiikarimcha,1


In [5]:
df_train.info()
#One null object in source column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58550 entries, 0 to 58549
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   target     58550 non-null  object
 1   source     58549 non-null  object
 2   frequency  58550 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [6]:
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5683 entries, 0 to 5682
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   target     5683 non-null   object
 1   source     5683 non-null   object
 2   frequency  5683 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 133.3+ KB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5747 entries, 0 to 5746
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   target     5747 non-null   object
 1   source     5747 non-null   object
 2   frequency  5747 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 134.8+ KB


In [9]:
df_train[df_train.source.isnull()]

Unnamed: 0,target,source,frequency
26313,నాన్,,1


In [10]:
df_train.dropna(inplace =True)
# Removing the row with null value
df_train.reset_index(drop=True, inplace =True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58549 entries, 0 to 58548
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   target     58549 non-null  object
 1   source     58549 non-null  object
 2   frequency  58549 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [11]:
Counter(df_train.target.sum()).keys()
#Last letter here is half space character

dict_keys(['అ', 'ం', 'క', 'ి', 'త', 'భ', 'ా', 'వ', 'మ', 'చ', '్', 'డ', 'ు', 'ర', 'ై', 'న', 'శ', 'ె', 'ల', 'గ', 'ద', 'ష', 'య', 'ీ', 'ే', 'స', 'ో', 'ళ', 'జ', 'ట', 'ూ', 'ః', 'ప', 'ఖ', 'ధ', 'థ', 'బ', 'ణ', 'ఠ', 'ౌ', 'హ', 'ఞ', 'ఫ', 'ొ', 'ృ', 'ఘ', 'ఆ', 'ఢ', 'ఇ', 'ఈ', 'ఉ', 'ఛ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఱ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'ఝ', '\u200c'])

In [12]:
Counter(df_dev.target.sum()).keys()

dict_keys(['అ', 'ం', 'క', 'గ', 'ీ', 'ర', 'ి', 'చ', 'డ', 'ా', 'న', 'ు', 'ద', 'ట', 'త', 'మ', 'ల', 'ో', 'వ', 'ూ', 'స', '్', 'బ', 'ష', 'ె', 'య', 'జ', 'ణ', 'ఞ', 'ధ', 'ై', 'ప', 'ే', 'భ', 'హ', 'ఖ', 'ృ', 'థ', 'శ', 'ఆ', 'ళ', 'ఫ', 'ఘ', 'ఇ', 'ఈ', 'ఉ', 'ఠ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ొ', 'ఓ', 'ఔ', 'ౌ', 'ఛ', 'ఢ'])

In [13]:
Counter(df_test.target.sum()).keys()

dict_keys(['అ', 'ం', 'క', 'ల', 'ో', 'ి', 'త', 'మ', 'ై', 'ె', 'న', 'ు', 'గ', 'ీ', 'ర', 'చ', 'ా', 'డ', 'ే', 'ట', 'వ', '్', 'ద', 'ప', 'జ', 'ణ', 'ఞ', 'ొ', 'ధ', 'హ', 'య', 'ష', 'భ', 'ూ', 'శ', 'స', 'బ', 'ృ', 'థ', 'ఖ', 'ఆ', 'ఇ', 'ఛ', 'ఈ', 'ఉ', 'ఊ', 'ళ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'ఠ', 'ౌ', 'ఘ', 'ఫ', 'ఢ', 'ః'])

In [14]:
half_space = list(Counter(df_train.target.sum()).keys())[-1]
half_space

'\u200c'

In [15]:
half_space_rows = np.flatnonzero(df_train.target.apply(lambda x: half_space in x))
df_train.loc[half_space_rows]
#Words with halfspace

Unnamed: 0,target,source,frequency
36088,ఫేస్‌బుక్,facebook,2
36089,ఫేస్‌బుక్,feesbuk,1
36090,ఫేస్‌బుక్,phaesbuk,1
36091,ఫేస్‌బుక్,pheesbuk,1
46034,లోక్‌సభ,loeksabha,1
46035,లోక్‌సభ,loksabha,3


In [19]:
df_train.loc[half_space_rows].target.apply(lambda x: list(x))


36088    [ఫ, ే, స, ్, ‌, బ, ు, క, ్]
36089    [ఫ, ే, స, ్, ‌, బ, ు, క, ్]
36090    [ఫ, ే, స, ్, ‌, బ, ు, క, ్]
36091    [ఫ, ే, స, ్, ‌, బ, ు, క, ్]
46034          [ల, ో, క, ్, ‌, స, భ]
46035          [ల, ో, క, ్, ‌, స, భ]
Name: target, dtype: object

In [22]:
halfspaced_words = df_train.loc[half_space_rows,"target"].unique().tolist()
halfspaced_words

['ఫేస్\u200cబుక్', 'లోక్\u200cసభ']

In [23]:
def remove_halfspace(word,char):
    chars = list(word)
    chars.remove(char)
    return "".join(chars)
#Function to remove half space from words and rejoin them
revised_words = [remove_halfspace(word,half_space) for word in halfspaced_words]
revised_words

['ఫేస్బుక్', 'లోక్సభ']

In [24]:
correction_dict = dict(zip(halfspaced_words,revised_words))
correction_dict

{'ఫేస్\u200cబుక్': 'ఫేస్బుక్', 'లోక్\u200cసభ': 'లోక్సభ'}

In [26]:
df_train.replace(to_replace = correction_dict, inplace=True)
#Replacing half spaced words with corrected words
df_train.loc[half_space_rows]

Unnamed: 0,target,source,frequency
36088,ఫేస్బుక్,facebook,2
36089,ఫేస్బుక్,feesbuk,1
36090,ఫేస్బుక్,phaesbuk,1
36091,ఫేస్బుక్,pheesbuk,1
46034,లోక్సభ,loeksabha,1
46035,లోక్సభ,loksabha,3


In [29]:
trn_tgt_alphabets = set(Counter(df_train.target.sum()).keys())
dev_tgt_alphabets = set(Counter(df_dev.target.sum()).keys())
tst_tgt_alphabets = set(Counter(df_test.target.sum()).keys())

In [30]:
trn_tgt_alphabets^dev_tgt_alphabets,trn_tgt_alphabets - dev_tgt_alphabets
#Checking if training set characters are superset of dev and and test set charcters

({'ః', 'ఝ', 'ఱ'}, {'ః', 'ఝ', 'ఱ'})

In [31]:
trn_tgt_alphabets^tst_tgt_alphabets,trn_tgt_alphabets - tst_tgt_alphabets

({'ఝ', 'ఱ'}, {'ఝ', 'ఱ'})

### So, alphabets of training set  are superset of that of dev set and test set for targets

In [32]:
print(sorted(Counter(df_train.source.sum()).keys()))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [33]:
print(sorted(Counter(df_dev.source.sum()).keys()))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [34]:
print(sorted(Counter(df_test.source.sum()).keys()))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [35]:
cleaned_train_file = proc_data_dir / "train_clean.tsv"
df_train.to_csv(cleaned_train_file, sep='\t', header = False, index=False)