<a href="https://colab.research.google.com/github/ffedox/pbr/blob/main/text_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd
import nltk
import os
import re

In [2]:
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases load it
# from the 'punkt' package, newer ones (3.8.2 and later) from 'punkt_tab', so fetch
# both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# List everything in the current working directory; the corpus workbooks are
# picked out of this listing in the next cell.
path = os.getcwd()
files = os.listdir(path)
files

['.config',
 'parallel_corpus_vg_en_it_9.xlsx',
 'parallel_corpus_vg_en_it_11.xlsx',
 'parallel_corpus_vg_en_it_2.xlsx',
 'parallel_corpus_vg_en_it_3.xlsx',
 'parallel_corpus_vg_en_it_12.xlsx',
 'parallel_corpus_vg_en_it_6.xlsx',
 'parallel_corpus_vg_en_it_10.xlsx',
 'parallel_corpus_vg_en_it_7.xlsx',
 'parallel_corpus_vg_en_it_4.xlsx',
 'parallel_corpus_vg_en_it_1.xlsx',
 'parallel_corpus_vg_en_it_8.xlsx',
 'parallel_corpus_vg_en_it_5.xlsx',
 'sample_data']

In [4]:
# Select the input workbooks by their real extension (the dot included).
# The combined corpus that this notebook saves as 'parallel_corpus.xlsx' is written
# into the same directory, so it is skipped explicitly; otherwise a re-run would read
# the notebook's own output back in as if it were an input.
files_xlsx = [f for f in files if f.endswith('.xlsx') and f != 'parallel_corpus.xlsx']
files_xlsx

['parallel_corpus_vg_en_it_9.xlsx',
 'parallel_corpus_vg_en_it_11.xlsx',
 'parallel_corpus_vg_en_it_2.xlsx',
 'parallel_corpus_vg_en_it_3.xlsx',
 'parallel_corpus_vg_en_it_12.xlsx',
 'parallel_corpus_vg_en_it_6.xlsx',
 'parallel_corpus_vg_en_it_10.xlsx',
 'parallel_corpus_vg_en_it_7.xlsx',
 'parallel_corpus_vg_en_it_4.xlsx',
 'parallel_corpus_vg_en_it_1.xlsx',
 'parallel_corpus_vg_en_it_8.xlsx',
 'parallel_corpus_vg_en_it_5.xlsx']

In [5]:
# Start from an empty DataFrame; the loading cell below assigns the combined corpus to `df`.
df = pd.DataFrame()

In [6]:
# Read every workbook (its first column is used as the index) and stack them once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; collecting the pieces and calling pd.concat a single time avoids both.
frames = []
for f in files_xlsx:
    data = pd.read_excel(f, index_col=0)
    frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame when no workbook was found.
df = pd.concat(frames) if frames else pd.DataFrame()

In [7]:
# Drop rows that are exact duplicates across all columns (the first occurrence is kept).
df = df.drop_duplicates()

In [8]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [9]:
# A segment with a missing side cannot be used as a translation pair, and str() would
# turn the missing value into the literal text "nan", which later cells would tokenize
# and keep as if it were real corpus text. Drop those rows before any string handling.
# .copy() gives an independent frame so the column assignments below do not write to a slice.
df = df.dropna(subset=['en', 'it']).copy()

# Replacing newlines with whitespaces
for col in ('en', 'it'):
    df[col] = df[col].astype(str).str.replace('\n', ' ', regex=False)

In [10]:
# Remove special characters but not accented letters.
# Everything outside this whitelist is deleted: space, newline, ASCII letters and digits,
# the accented ranges À-Ö / Ø-ö / ø-ÿ, and the punctuation / - . , ; : " '
_DISALLOWED_CHARS = re.compile(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ\/\-.,;:"\']+')

for lang_col in ('en', 'it'):
    df[lang_col] = df[lang_col].map(lambda text: _DISALLOWED_CHARS.sub('', str(text)))

In [11]:
# Tokenizing the texts
# word_tokenize splits the text into sentences with a language-specific Punkt model
# before splitting words, so each column is given its own language instead of letting
# the Italian text fall back to the English default.
df['tokenized_text_en'] = df['en'].apply(lambda text: word_tokenize(text, language='english'))
df['token_count_en'] = df['tokenized_text_en'].apply(len)

df['tokenized_text_it'] = df['it'].apply(lambda text: word_tokenize(text, language='italian'))
df['token_count_it'] = df['tokenized_text_it'].apply(len)

In [12]:
# Keeping only segments whose English side has at least six tokens, i.e. removing
# those with fewer than six (segments with exactly six tokens are kept).
# Only the English token count is checked here; the Italian side is not filtered by length.
df = df[(df['token_count_en'] >= 6)]

In [14]:
# Removing segments with a token difference larger than 10
MAX_TOKEN_DIFFERENCE = 10

# .assign() returns a new frame, so the column is never written onto a filtered
# slice of an earlier `df` (which would raise SettingWithCopyWarning).
df = df.assign(difference=(df["token_count_en"] - df["token_count_it"]).abs())

# One mask drives both halves so that every row ends up in exactly one of them;
# previously rows with a difference of exactly 10 were in neither `df2` nor `df`.
too_different = df['difference'] > MAX_TOKEN_DIFFERENCE
df2 = df[too_different]   # rejected segments, kept aside for inspection
df = df[~too_different]   # segments with a difference of at most 10

In [17]:
# Save the filtered corpus, with all of its current columns, as an Excel workbook.
# NOTE: the relative path resolves to the working directory, which is the same
# directory that the input listing at the top of the notebook scans.
df.to_excel("parallel_corpus.xlsx")

In [15]:
# Display the cleaned corpus (pandas abbreviates the rendering of a long frame).
df

Unnamed: 0,en,it,tokenized_text_en,token_count_en,tokenized_text_it,token_count_it,difference
0,"Chris Charla of NextGen said, ""As much as we l...","Chris Charla di NextGen ha dichiarato: ""Per qu...","[Chris, Charla, of, NextGen, said, ,, ``, As, ...",31,"[Chris, Charla, di, NextGen, ha, dichiarato, :...",30,1
1,If you can find a few copies in the bargain bi...,Se riesci a trovare alcune copie nel cestino d...,"[If, you, can, find, a, few, copies, in, the, ...",22,"[Se, riesci, a, trovare, alcune, copie, nel, c...",21,1
2,"Aerobiz , ""Air Management: zora ni Kakeru"" is...","""Air Management: zora ni Kakeru"" è un videogio...","[Aerobiz, ,, ``, Air, Management, :, zora, ni,...",32,"[``, Air, Management, :, zora, ni, Kakeru, '',...",28,4
3,Another sequel known as Air Management '96 was...,"Air Management '96, il terzo sequel per il sol...","[Another, sequel, known, as, Air, Management, ...",19,"[Air, Management, '96, ,, il, terzo, sequel, p...",21,2
6,It revolves around building and maintaining an...,Il gioco ruota intorno alla costruzione e allo...,"[It, revolves, around, building, and, maintain...",16,"[Il, gioco, ruota, intorno, alla, costruzione,...",21,5
...,...,...,...,...,...,...,...
9918,World of Warplanes WoWp is a free-to-play aeri...,World of Warplanes è un videogioco free-to pla...,"[World, of, Warplanes, WoWp, is, a, free-to-pl...",23,"[World, of, Warplanes, è, un, videogioco, free...",30,7
9919,The game was originally released in November 2...,ll gioco venne pubblicato nel novembre 2013 ne...,"[The, game, was, originally, released, in, Nov...",17,"[ll, gioco, venne, pubblicato, nel, novembre, ...",19,2
9920,"Wurm first started its Beta stage in 2003, and...",Nonostante il gioco fu ufficialmente commercia...,"[Wurm, first, started, its, Beta, stage, in, 2...",17,"[Nonostante, il, gioco, fu, ufficialmente, com...",20,3
9921,The distant landscapes were highlighted for th...,I paesaggi sono stati evidenziati per la loro ...,"[The, distant, landscapes, were, highlighted, ...",23,"[I, paesaggi, sono, stati, evidenziati, per, l...",24,1
