In [1]:
# Prepare workspace
import pandas as pd

In [2]:
# Load the IME word list into a single-column frame named "words"
# (header=None: the file's first line is data, not a column title).
# - The relative path uses forward slashes, which Python accepts on Windows
#   and which, unlike backslashes, also resolve on Linux and macOS.
# - keep_default_na=False keeps every entry as text; otherwise pandas would
#   silently turn entries such as "null", "nan" or "NA" into missing values.
ime_database = pd.read_csv(
    "resources/palavras_ime.txt",
    header=None,
    names=["words"],
    keep_default_na=False,
)
ime_database.head()

Unnamed: 0,words
0,a
1,Aarao
2,aba
3,abacate
4,abacateiro


In [3]:
# Load the Python.pro.br word list into a single-column frame named "words"
# (header=None: the file's first line is data, not a column title).
# - The relative path uses forward slashes, which Python accepts on Windows
#   and which, unlike backslashes, also resolve on Linux and macOS.
# - keep_default_na=False keeps every entry as text; otherwise pandas would
#   silently turn entries such as "null", "nan" or "NA" into missing values.
python_database = pd.read_csv(
    "resources/palavras_python_br.txt",
    header=None,
    names=["words"],
    keep_default_na=False,
)
python_database.head()

Unnamed: 0,words
0,a
1,ª
2,à
3,á
4,ã


In [4]:
# Report the row count of each word list as loaded, before any filtering
raw_size_report = [
    "Comparison between number of entries in each database",
    f"Python.pro.br: {len(python_database)}",
    f"IME: {len(ime_database)}",
]
print("\n".join(raw_size_report))

Comparison between number of entries in each database
Python.pro.br: 320139
IME: 245366


In [5]:
# Keep only the entries that are exactly 5 characters long.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignment below is never treated as a write into a slice of the
# unfiltered frame (the pattern pandas reports as SettingWithCopyWarning).
is_five_letters = ime_database["words"].str.len() == 5
ime_database = ime_database[is_five_letters].copy()

# Make all words lower case so capitalised entries match their lower-case form
ime_database["words"] = ime_database["words"].str.lower()

ime_database.head()

Unnamed: 0,words
1,aarao
11,abaco
13,abade
19,abafa
69,abafe


In [6]:
# Keep only the entries that are exactly 5 characters long.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignment below is never treated as a write into a slice of the
# unfiltered frame (the pattern pandas reports as SettingWithCopyWarning).
is_five_letters = python_database["words"].str.len() == 5
python_database = python_database[is_five_letters].copy()

# Make all words lower case so capitalised entries match their lower-case form
python_database["words"] = python_database["words"].str.lower()

python_database.head()

Unnamed: 0,words
10,à-toa
18,aaiún
25,aarão
57,ababa
64,ababé


In [7]:
# Discard every entry that contains a hyphen
hyphenated = python_database["words"].str.contains("-")
python_database = python_database.drop(index=python_database.index[hyphenated])

# Accent stripping: each regex character class is rewritten to its plain letter
accent_rules = [
    ("[áÁâÂãÃàÀ]", "a"),
    ("[éÉêÊ]", "e"),
    ("[íÍ]", "i"),
    ("[óÓôÔõÕ]", "o"),
    ("[úÚüÜ]", "u"),
]
plain_words = python_database["words"]
for accent_pattern, plain_letter in accent_rules:
    plain_words = plain_words.str.replace(accent_pattern, plain_letter, regex=True)

# The cedilla is a single character, so it is replaced on its own
python_database["words"] = plain_words.str.replace("ç","c")

python_database.head()

Unnamed: 0,words
18,aaiun
25,aarao
57,ababa
64,ababe
77,abaca


In [8]:
# Remove duplicated entries (lower-casing and accent stripping can collapse
# distinct spellings into the same word). The first occurrence is kept.
# Reassigning the result instead of using inplace=True keeps the cell a plain
# "new value from old value" step and avoids mutating a frame that was derived
# from a filtered selection.
python_database = python_database.drop_duplicates()
ime_database = ime_database.drop_duplicates()

In [9]:
# Preview the first five rows of the cleaned Python.pro.br list
python_database.head(n=5)

Unnamed: 0,words
18,aaiun
25,aarao
57,ababa
64,ababe
77,abaca


In [10]:
# Preview the first five rows of the cleaned IME list
ime_database.head(n=5)

Unnamed: 0,words
1,aarao
11,abaco
13,abade
19,abafa
69,abafe


In [11]:
# Report the row count of each word list once filtering and de-duplication are done
clean_size_report = [
    "Comparison between number of entries in each database after removing unwanted entries",
    f"Python.pro.br: {len(python_database)}",
    f"IME: {len(ime_database)}",
]
print("\n".join(clean_size_report))

Comparison between number of entries in each database after removing unwanted entries
Python.pro.br: 10418
IME: 5427
