<a href="https://colab.research.google.com/github/evatsirmi/Fine-tuning-for-Machine-Translation-/blob/main/data_pre-processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#all the tsv files have two columns, 'el' and 'en' for greek and english sentences respectively
#every line contains a Greek-English sentence pair

import pandas as pd
import re
!pip install "numpy<2.0" # this version of numpy is installed in order for the fasttext-langdetect model to run

#df = pd.read_csv('/content/drive/MyDrive/NeuLab_el_en_aligned.tsv', sep='\t', encoding = 'utf-8')


#df = pd.read_csv('/content/drive/MyDrive/SciPar_el_en_aligned.tsv', sep='\t', encoding = 'utf-8')


df = pd.read_csv('/content/drive/MyDrive/Europarl_el_en_aligned.tsv', sep='\t',encoding = 'utf-8')

df.head()



Unnamed: 0,el,en
0,Κηρύσσω την επανάληψη της συνόδου του Ευρωπαϊκ...,I declare resumed the session of the European ...
1,"Όπως μπορέσατε να διαπιστώσετε, ο περίφημος ""ι...","Although, as you will have seen, the dreaded'm..."
2,Επιθυμείτε μία συζήτηση επί του θέματος τις επ...,You have requested a debate on this subject in...
3,"Επί του παρόντος θα ήθελα, όπως μου ζήτησαν ορ...","In the meantime, I should like to observe a mi..."
4,Σας καλώ να σηκωθείτε για αυτή την ενός λεπτού...,"Please rise, then, for this minute 's silence."


In [None]:
# Mount Google Drive at /content/drive; the dataset TSV files are read from /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#any unwanted character is deleted, only greek and latin letters, numbers,punctuation and some relevant symbols are kept
#repeated punctuation is also removed

# Markup such as <br> or </p>
_HTML_TAG = re.compile(r'<[^>]+>')
# Anything that is NOT a Greek/Latin letter, digit, whitespace or one of the listed punctuation marks/symbols
_UNWANTED_CHARS = re.compile(r"[^a-zA-ZΑ-Ωα-ωάέύίόήώϊϋΐΰΈΎΌΊΆΉΏΪΫ0-9\s.,!?;:\/'\"()\-–—…«»%·€$]")
# A run of one repeated punctuation mark, e.g. "!!!"; the dot is not in the list, so "..." is left as is
_REPEATED_PUNCT = re.compile(r'([!?,;:\'"()\-–—«»\/])\1+')


def remove_unwanted_char(string):
  """Clean one sentence.

  Steps, in order:
    1. delete <...> markup,
    2. delete every character outside the Greek/Latin/digit/punctuation whitelist,
    3. collapse a repeated punctuation mark into a single one,
    4. strip leading and trailing whitespace.

  Markup is removed first because the whitelist does not contain '<' or '>':
  once those are gone a tag such as "<b>" can no longer be recognised and its
  name ("b") would stay in the sentence.

  Parameters
  ----------
  string : str or missing value
      The raw sentence. Missing values (None / NaN) are treated as empty text;
      other non-string values are converted with str().

  Returns
  -------
  str
      The cleaned sentence (possibly empty).
  """
  if not isinstance(string, str):
    # pandas represents empty cells as NaN (a float), which the regex functions cannot process
    if pd.isna(string):
      return ""
    string = str(string)
  without_tags = _HTML_TAG.sub("", string)
  cleaning = _UNWANTED_CHARS.sub("", without_tags)
  clean_punct = _REPEATED_PUNCT.sub(r'\1', cleaning)
  return clean_punct.strip()

In [None]:
# Run the character/punctuation cleaner over both language columns.
# `columns` stays a global because later cells reuse it.
columns = ["el", 'en']
for lang_col in columns:
  df[lang_col] = df[lang_col].map(remove_unwanted_char)

In [None]:
#removal of html/tags
def remove_html_tags(text):
  """Return `text` with every '<...>' span (at least one character between the brackets) deleted."""
  tag_pattern = re.compile(r'<[^>]+>')
  return tag_pattern.sub('', text)


# Strip markup from both language columns.
# NOTE(review): the character-cleaning cell above runs first and its whitelist contains neither
# '<' nor '>', so no angle brackets are left in the text by the time this pass runs.
columns = ["el", 'en']
for lang_col in columns:
  df[lang_col] = df[lang_col].map(remove_html_tags)



In [None]:
#removal of duplicate pairs
#removal of duplicates in each column separately
#If there is a duplicate in one column, the entire pair is deleted


# Step 1: collapse identical (el, en) pairs, keeping the first occurrence
df = df.drop_duplicates(subset=['el', 'en'])

# Step 2: flag every row whose Greek OR English sentence still occurs more than once.
# keep=False marks all occurrences, so no copy of a repeated sentence survives.
greek_repeated = df['el'].duplicated(keep=False)
english_repeated = df['en'].duplicated(keep=False)
duplicated_rows = greek_repeated | english_repeated
df = df.loc[~duplicated_rows].reset_index(drop=True)

In [None]:
!pip install datasketch tqdm



In [None]:
#removal of near duplicates using Minhash
#Minhash is a technique that represents each sentence as a set of text shingles and generates a ‘signature’ that preserves similarity
#if two sentences have high jaccard similarity, they produce a similar Minhash signature
#If one sentence in a pair is a near-duplicate, the entire pair is removed


import pandas as pd
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm

def text_to_minhash(text, num_perm=128):
    """Build a MinHash signature from the whitespace-separated words of `text`.

    `text` is converted with str() first, each word is fed to the signature as
    UTF-8 bytes, and `num_perm` is passed through to the MinHash constructor.
    """
    signature = MinHash(num_perm=num_perm)
    tokens = str(text).split()
    for token in tokens:
        signature.update(token.encode("utf8"))
    return signature

def remove_near_duplicates_rowwise(df, columns, threshold=0.85, num_perm=128):
    """Drop rows that contain a near-duplicate sentence in any of `columns`.

    For every column a fresh MinHashLSH index is built. Rows are visited in
    order: a row whose signature matches nothing already in the index is
    inserted (it is the occurrence that is kept); a row that does match is
    marked for removal. A row marked through any single column is removed
    entirely, so the sentence pair is dropped as a whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Its index labels are used as LSH keys.
        NOTE(review): this assumes the labels are unique - confirm the index
        was reset before calling.
    columns : iterable of str
        Columns to check for near-duplicates.
    threshold : float
        Similarity threshold handed to MinHashLSH.
    num_perm : int
        Number of permutations used for both the signatures and the index.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without the marked rows and with a fresh 0..n-1 index.
    """
    all_drop_indices = set()

    for col in columns:
        lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        print(f"Processing {len(df)} rows for column '{col}'...")
        for i, text in tqdm(df[col].items(), total=len(df)):
            m = text_to_minhash(text, num_perm)
            if lsh.query(m):
                # Something similar is already indexed: this row is the later near-duplicate
                all_drop_indices.add(i)
            else:
                lsh.insert(i, m)

    # drop() is given an explicit list of labels rather than the raw set
    df_cleaned = df.drop(index=list(all_drop_indices)).reset_index(drop=True)
    print(f"Kept {len(df_cleaned)} rows out of {len(df)}")
    return df_cleaned





# Near-duplicate filtering on both language columns; a match in either column removes the whole pair
df = remove_near_duplicates_rowwise(df, columns=['el','en'], threshold=0.85)

Processing 637021 rows for column 'el'...


100%|██████████| 637021/637021 [20:35<00:00, 515.40it/s]


Processing 637021 rows for column 'en'...


100%|██████████| 637021/637021 [20:34<00:00, 515.98it/s]


Kept 635482 rows out of 637021


In [None]:
#removal of sentences with a large length difference
def split_sentences(x):
  """Return the number of whitespace-separated words in the string `x`."""
  words = x.split()
  return len(words)

# Temporary helper columns holding the word count of each side of the pair
for text_col, count_col in (('el', 'greek'), ('en', 'english')):
  df[count_col] = df[text_col].map(split_sentences)


In [None]:
# Keep a pair only when neither sentence has more than `max_length_ratio` times the words of the other.
# 'greek' and 'english' are the per-row word counts computed in the previous cell.
# The comparison is done on whole columns at once instead of looping over rows with iterrows().
# `filtered_rows` is a DataFrame that always carries the original columns, so dropping the
# helper columns afterwards also works when no row passes the filter.
max_length_ratio = 3
balanced_length = (
    (df['greek'] <= max_length_ratio * df['english'])
    & (df['english'] <= max_length_ratio * df['greek'])
)
filtered_rows = df[balanced_length]



In [None]:
# Rebuild the frame from the rows that passed the length-ratio filter and discard the helper word-count columns
df = pd.DataFrame(filtered_rows).drop(columns=['greek', 'english'])

In [None]:
#all rows in which either sentence has 100 or more words, or 3 or fewer words, are removed

def unwanted_rows(dataframe, columns, min_words=4, max_words=99):
  """Remove rows in which any of `columns` is too short or too long.

  A row is kept only if, for every listed column, the number of
  whitespace-separated words lies between `min_words` and `max_words`
  (both inclusive). With the defaults this removes every sentence of
  3 words or fewer and every sentence of 100 words or more.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame to filter; it is not modified.
  columns : iterable of str
      Text columns to measure. Values are converted with str() before splitting.
  min_words, max_words : int
      Inclusive bounds on the word count.

  Returns
  -------
  pandas.DataFrame
      The surviving rows with a fresh 0..n-1 index.
  """
  keep = None
  for column in columns:
    word_counts = dataframe[column].map(lambda value: len(str(value).split()))
    # Plain boolean array: rows are selected by position, independent of the index labels
    in_range = word_counts.between(min_words, max_words).to_numpy(dtype=bool)
    keep = in_range if keep is None else keep & in_range
  if keep is None:
    # No columns were given, so there is nothing to filter on
    return dataframe.reset_index(drop=True)
  return dataframe[keep].reset_index(drop=True)

In [None]:
# `columns` is the ["el", 'en'] list defined in the earlier cleaning cells
df = unwanted_rows(df,columns)

In [None]:
!pip install fasttext-langdetect




In [None]:
import numpy as np
import fasttext #language detection model
from huggingface_hub import hf_hub_download

# Fetch model.bin from the Hugging Face Hub repo facebook/fasttext-language-identification;
# the value returned by hf_hub_download is passed to fasttext as the path of the file to load
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
# `model` is read as a global by detect_language in the next cell
model = fasttext.load_model(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
#I detect the language of each column based on the fasttext-langdetect model

def detect_language(string):
  """Return the top language label the fastText model predicts for `string`.

  The model handles one line per call and raises ValueError when the text
  contains a newline, so newlines inside the sentence are replaced with
  spaces before predicting.

  Parameters
  ----------
  string : str
      The sentence to classify.

  Returns
  -------
  str
      The first predicted label, e.g. "__label__ell_Grek".
  """
  single_line = string.replace("\n", " ")
  # predict returns a pair (labels, probabilities); only the best label is needed
  labels, _probabilities = model.predict(single_line)
  return labels[0]

In [None]:
# Store the predicted language label of each sentence next to it
for text_col, label_col in (('el', "detected_el"), ('en', "detected_en")):
  df[label_col] = df[text_col].map(detect_language)

In [None]:
#I remove the rows where the detected language is not the intended one

def wrong_language(dataframe,column,text):
  """Keep only the rows whose value in `column` equals `text`.

  Every row whose `column` value differs from `text` is dropped by its index
  label, and the result is re-indexed from 0.
  """
  mismatched = [label for label, value in dataframe[column].items() if value != text]
  kept = dataframe.drop(mismatched)
  return kept.reset_index(drop=True)

In [None]:
# Each detected-language column must carry the label expected for its side of the pair
expected_labels = (
    ("detected_el", "__label__ell_Grek"),
    ("detected_en", "__label__eng_Latn"),
)
for detected_col, expected_label in expected_labels:
  df = wrong_language(df, detected_col, expected_label)

In [None]:
# Write the cleaned pairs to final.csv: ';' as field separator, no index column, UTF-8 with a BOM (utf-8-sig)
# NOTE(review): the helper columns detected_el / detected_en are never dropped, so they are written as well
df.to_csv("final.csv", sep=";", index=False, encoding="utf-8-sig")

In [None]:
df.shape

(619184, 4)