### Collecting Data

Collects data from wikipedia pages, saves them and extract unique words and their's frequencies to uses on word structure analysis. It also checks if the language corpora it's already collected to skip calling the API multiple times.

In [1]:
# Import Libs
import wikipedia
import re
import pandas as pd
from collections import Counter
import time
from pathlib import Path

In [2]:
# Set up directory and file paths
corpus_dir = Path("language_corpora")
corpus_dir.mkdir(exist_ok=True)
corpus_file = corpus_dir / "portuguese_corpus.txt"

# Set Wikipedia language to Portuguese
wikipedia.set_lang("pt")

# List of common Portuguese Wikipedia page topics
search_terms = ["Brasil", "Econômia", "Futebol", "História", "Cultura"]

# Collect text from Wikipedia pages
def collect_wikipedia_text(search_terms, results_limit=2):
    collected_texts = []
    total_words = 0
    min_content_length = 500
    target_words = 20000

    for term in search_terms:
        if total_words >= target_words:
            break
        try:
            search_results = wikipedia.search(term, results=results_limit)
            for title in search_results:
                if total_words >= target_words:
                    break
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    content = page.content
                    word_count = len(content.split())
                    if word_count > min_content_length:
                        collected_texts.append(content)
                        total_words += word_count
                    time.sleep(1)
                except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
                    pass
                except Exception:
                    pass
        except Exception:
            pass

    return '\n'.join(collected_texts)

# Check if corpus exists and has sufficient words
min_words = 20000
if corpus_file.exists():
    with open(corpus_file, 'r', encoding='utf-8') as f:
        text = f.read()
    word_count = len(text.split())
    if word_count >= min_words:
        pass  # Use existing corpus
    else:
        text = collect_wikipedia_text(search_terms)
        if text and len(text.split()) >= min_words:
            with open(corpus_file, 'w', encoding='utf-8') as f:
                f.write(text)
else:
    text = collect_wikipedia_text(search_terms)
    if text and len(text.split()) >= min_words:
        with open(corpus_file, 'w', encoding='utf-8') as f:
            f.write(text)

# Check if text was retrieved or loaded
if not text:
    exit()

# Clean the text: keep letters and diacritics, remove numbers, punctuation, spaces, etc.
words = re.findall(r'[a-záéíóúâêîôûãõç]+', text.lower())

# Count word frequencies
word_counts = Counter(words)

# Create DataFrame with words and their frequencies
df = pd.DataFrame(list(word_counts.items()), columns=['Palavra', 'Frequencia'])

# Sort by frequency and sample 10 random words
df = df.sort_values(by='Frequencia', ascending=False).sample(10)

#### Encontrando Sílabas

In [3]:
# Cria uma lista de vogais
vogais = ['a', 'e', 'i', 'o', 'u']

# Cria uma lista de consoantes que aceitam CA
consoantes_ca = ['b', 'c', 'd', 'f', 'g', 'p', 't']

# Cria uma lista de consoantes auxiliares (CA)
consoantes_auxiliares = ['l', 'r']

# Cria uma lista de consoantes de coda (CC)
consoantes_coda = ['m', 'n', 'r', 'l', 'z', 's', 'x']
codas = consoantes_coda

# Cria uma lista de consoantes que não aceitam CA
consoantes_nao_ca = [
    'h', 'j', 'm', 'n', 'r','l', 's', 'v', 'x', 'z', 'lh', 'nh', 'qu', 'gu', 'cl',
    'ch']

In [4]:
def gerar_silabas(vogais, consoantes_ca, consoantes_auxiliares, consoantes_coda, consoantes_nao_ca):
    silabas = []
    
    # Using only vogais
    for vogal in vogais:
        silabas.append(vogal)
        for cc in consoantes_coda:
            silabas.append(vogal + cc)
            if cc == 'n':
                silabas.append(vogal + cc + 's')
                
    # Using consoantes_ca
    for consoante in consoantes_ca:
        for vogal in vogais:
            silabas.append(consoante + vogal)
            for cc in consoantes_coda:
                silabas.append(consoante + vogal + cc)
                if cc == 'n':
                    silabas.append(consoante + vogal + cc + 's')
                    
    # Using consoantes_auxiliares
    for consoante in consoantes_auxiliares:
        for vogal in vogais:
            silabas.append(consoante + vogal)
            for cc in consoantes_coda:
                silabas.append(consoante + vogal + cc)
                if cc == 'n':
                    silabas.append(consoante + vogal + cc + 's')
                    
    # Using consoantes_nao_ca
    for consoante in consoantes_nao_ca:
        for vogal in vogais:
            silabas.append(consoante + vogal)
            for cc in consoantes_coda:
                silabas.append(consoante + vogal + cc)
                if cc == 'n':
                    silabas.append(consoante + vogal + cc + 's')
    
    return silabas

# Chama a função com os parâmetros especificados
resultado = gerar_silabas(vogais, consoantes_ca, consoantes_auxiliares, consoantes_coda, consoantes_nao_ca)

# Converte em um dataframe
silabas = pd.DataFrame(resultado, columns=['silabas'])

silabas

Unnamed: 0,silabas
0,a
1,am
2,an
3,ans
4,ar
...,...
1165,chur
1166,chul
1167,chuz
1168,chus


In [5]:
import chime

In [6]:
def match_syllable_from_start(word, patterns, next_char_cond=None):
    """Return the longest syllable from the start of the word that matches a pattern."""
    max_syllable = ''
    for pattern in patterns:
        if word.startswith(pattern) and len(pattern) > len(max_syllable):
            # If there's a condition to check the character after the matched pattern
            next_char_index = len(pattern)
            if next_char_cond:
                # Either the word ends after the coda, or the next character satisfies the condition
                if next_char_index == len(word) or (next_char_index < len(word) and next_char_cond(word[next_char_index])):
                    max_syllable = pattern
            else:
                max_syllable = pattern
    return max_syllable

In [7]:
def identify_syllables_in_word(word, vogais, consoantes_ca, consoantes_auxiliares, consoantes_coda, consoantes_nao_ca):
    identified_syllables = []
    i = 0

    # Patterns list comprehension
    ca_vc_coda = [con + ca + v + co for con in consoantes_ca for ca in consoantes_auxiliares for v in vogais for co in consoantes_coda]
    c_v_coda = [con + v + co for con in (consoantes_ca + consoantes_nao_ca) for v in vogais for co in consoantes_coda]
    ca_v = [con + ca + v for con in consoantes_ca for ca in consoantes_auxiliares for v in vogais]
    c_v = [con + v for con in (consoantes_ca + consoantes_nao_ca) for v in vogais]
    v_coda = [v + co for v in vogais for co in consoantes_coda]
    
    while i < len(word):
        patterns = [ca_vc_coda, c_v_coda, ca_v, c_v, v_coda, vogais]
        
        matched = False
        for pattern_list in patterns:
            syllable = match_syllable_from_start(word[i:], pattern_list)
            
            # For codas, check if it should be retained as a coda or act as the starting consonant of the next syllable
            if syllable and syllable[-1] in consoantes_coda:
                if i + len(syllable) < len(word) and word[i + len(syllable)] in vogais:
                    # If coda is followed by a vowel, then treat it as the beginning of the next syllable
                    syllable = syllable[:-1]
            
            if syllable:
                identified_syllables.append(syllable)
                i += len(syllable)
                matched = True
                break

        if not matched:
            i += 1  # If no syllable match, just move to the next character

    return identified_syllables

df['Identified_Syllables'] = df['Palavra'].apply(lambda word: identify_syllables_in_word(word, vogais, consoantes_ca, consoantes_auxiliares, consoantes_coda, consoantes_nao_ca))

df

Unnamed: 0,Palavra,Frequencia,Identified_Syllables
3173,típicos,1,"[pi, cos]"
2902,jabor,1,"[ja, bor]"
5375,arbitrariamente,1,"[ar, bi, tra, ri, a, men, te]"
3005,almeida,1,"[al, me, i, da]"
3858,tratar,2,"[tra, tar]"
3569,desmembrado,1,"[des, mem, bra, do]"
1311,eleita,1,"[e, le, i, ta]"
1278,aprovaram,1,"[a, pro, va, ram]"
2129,carcerária,1,"[car, cer, ri, a]"
212,lisboa,4,"[lis, bo, a]"


In [8]:
chime.success()

In [9]:
#word = str(input('Digite a palavra: ')).lower()
word0 = word.removesuffix('s')
word1 = word0[:-2]
word2 = word0[:-3]

# Listas de Terminações
vogox = ['á', 'é', 'ê', 'i', 'í', 'ó', 'ô', 'u', 'ú', 'ã',
         'ão', 'õe', 'ãe', 'ém']
conox = ['r', 'l', 'z', 'x', 'om', 'im', 'um']
vogpro = ['á', 'â', 'é', 'ê', 'í', 'ó', 'ô', 'ú']
vogsim = ['a', 'e', 'o']
excecao = ['ã']

# Grupos de Palavras

grupo_a = word0.endswith(tuple(vogox)) #Termina em elementos oxitonos
grupo_b = word0.endswith(tuple(conox)) #Termina em consoante
grupo_c = bool(set(vogpro) & set(word0)) #Contém vogal acentuada
grupo_d = word0.endswith(tuple(vogsim)) #Termina em vogal não-acentuada
grupo_e = bool(set(vogpro) & set(word1)) #Contém vogal acentuada (não terminal -2)
grupo_f = bool(set(excecao) & set(word0)) #Contém ã
grupo_g = bool(set(vogpro) & set(word2)) #Contém vogal acentuada (não terminal -3)

# Respostas - Vogais
if (grupo_c and grupo_d and grupo_g == True) and grupo_f == False:
    print('Essa palavra é proparoxítona')
elif grupo_a == True and grupo_e == False:
    print('Essa palavra é oxítona')

# Respostas - Consoantes

elif grupo_b == True and grupo_c == False:
    print('Essa palavra é oxítona')

elif grupo_b and grupo_c == True:
    print('Essa palavra é paroxítona')

else:
    print('Essa palavra é paroxítona')

NameError: name 'word' is not defined