In [93]:
import pandas as pd
from words_dataset_treatment import treat_dataset
from collections import Counter
from string import ascii_lowercase

from IPython.core.interactiveshell import InteractiveShell

In [94]:
# Import words dataset
words_dataset = pd.read_csv("resources\\palavras_python_br.txt", header=None, names=["words"])
words_dataset.head()

Unnamed: 0,words
0,a
1,ª
2,à
3,á
4,ã


In [95]:
# Treat dataset
words_dataset = treat_dataset(words_dataset)

# Preview dataset
words_dataset.head()

Unnamed: 0,words
18,aaiun
25,aarao
57,ababa
64,ababe
77,abaca


In [96]:
# Display size of the dataset
number_of_words = len(words_dataset)
print(f"Number of words in the dataset: {number_of_words}")

Number of words in the dataset: 9974


In [97]:
words_dataset[words_dataset['words'].str.contains('ª', regex = False)]

Unnamed: 0,words


In [98]:
# Count the number of words in which each letter appears at least once
occurence_of_letter_at_least_once = {letter: 0 for letter in ascii_lowercase}
occurence_of_letter_at_least_once = pd.Series(occurence_of_letter_at_least_once)

for letter in occurence_of_letter_at_least_once.index:
    for word in words_dataset['words']:
        if letter in word:
            occurence_of_letter_at_least_once[letter] += 1

occurence_of_letter_at_least_once.sort_values(ascending=False, inplace=True)
print(occurence_of_letter_at_least_once.to_string())

a    6132
o    4399
i    4048
e    3505
r    3210
u    2532
l    2444
c    2137
s    2106
m    2002
n    1902
t    1744
b    1361
d    1268
p    1244
g    1099
v     759
f     725
h     571
x     556
j     468
z     416
q     148
y      27
k      23
w       8


In [100]:
# Count the occurences of each letter for each position in the words 
occurence_of_letters_in_each_position = pd.DataFrame([Counter(word) for word in words_dataset['words'].str]).fillna(0)

print(occurence_of_letters_in_each_position.to_string())


      a    b    c    d     e    f    g    h     i    j   k    l    m    n     o    p     q    r    s    t     u    v    w    x   y    z
0  1022  635  949  375   318  437  402  243   336  257  12  475  880  364   285  719  64.0  395  553  600   202  269  3.0   75   2  102
1  2098  134  277  168  1262   53   93  107  1275   26   1  385  172  242  1351   96  12.0  607  131  131  1136   66  2.0  101   5   43
2   841  336  606  351   535  163  362   39   908   93   2  730  461  853   408  250  72.0  971  452  505   516  189  2.0  223   8   98
3  1696  324  506  392   804   80  261  179  1309   93   6  373  294  406   674  210   0.0  623  286  576   486  168  1.0  162   2   63
4  2257    8   17   12  1100    6    7    5   671    5   3  556  335  108  2380   13   0.0  863  904   19   378   85  0.0  117  10  115


  occurence_of_letters_in_each_position = pd.DataFrame([Counter(word) for word in words_dataset['words'].str]).fillna(0)


In [101]:
percentage_occurence_of_letters_in_each_position = round(occurence_of_letters_in_each_position/number_of_words*100, 1)

print(percentage_occurence_of_letters_in_each_position.to_string())

      a    b    c    d     e    f    g    h     i    j    k    l    m    n     o    p    q    r    s    t     u    v    w    x    y    z
0  10.2  6.4  9.5  3.8   3.2  4.4  4.0  2.4   3.4  2.6  0.1  4.8  8.8  3.6   2.9  7.2  0.6  4.0  5.5  6.0   2.0  2.7  0.0  0.8  0.0  1.0
1  21.0  1.3  2.8  1.7  12.7  0.5  0.9  1.1  12.8  0.3  0.0  3.9  1.7  2.4  13.5  1.0  0.1  6.1  1.3  1.3  11.4  0.7  0.0  1.0  0.1  0.4
2   8.4  3.4  6.1  3.5   5.4  1.6  3.6  0.4   9.1  0.9  0.0  7.3  4.6  8.6   4.1  2.5  0.7  9.7  4.5  5.1   5.2  1.9  0.0  2.2  0.1  1.0
3  17.0  3.2  5.1  3.9   8.1  0.8  2.6  1.8  13.1  0.9  0.1  3.7  2.9  4.1   6.8  2.1  0.0  6.2  2.9  5.8   4.9  1.7  0.0  1.6  0.0  0.6
4  22.6  0.1  0.2  0.1  11.0  0.1  0.1  0.1   6.7  0.1  0.0  5.6  3.4  1.1  23.9  0.1  0.0  8.7  9.1  0.2   3.8  0.9  0.0  1.2  0.1  1.2


In [102]:
# Count the occurences of each letter in the words 
occurence_of_letters = occurence_of_letters_in_each_position.sum(axis='index')
occurence_of_letters.sort_values(ascending=False, inplace=True)

print(occurence_of_letters.to_string())

a    7914.0
o    5098.0
i    4499.0
e    4019.0
r    3459.0
u    2718.0
l    2519.0
c    2355.0
s    2326.0
m    2142.0
n    1973.0
t    1831.0
b    1437.0
d    1298.0
p    1288.0
g    1125.0
v     777.0
f     739.0
x     678.0
h     573.0
j     474.0
z     421.0
q     148.0
y      27.0
k      24.0
w       8.0


In [103]:
possible_words = []
desired_letters = ['r', 'e', 'i', 'u', 'a']
for word in words_dataset['words']:
    word_is_accepted = True
    for letter in desired_letters:
        if letter not in word:
            word_is_accepted = False
    
    if word_is_accepted:    
        possible_words.append(word)

print(possible_words)        

['euria', 'ureia']
