In [1]:
# Install necessary packages
!pip install fasttext nltk langcodes

# Download the FastText language identification model (lid.176.bin)
!wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313472 sha256=bb900d05fbb3612518ba70da0306b833cd25fc91dd0aaff60841f3e1b41a0565
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [4]:
#
import os
import fasttext
import nltk
from nltk.tokenize import TweetTokenizer
import collections
import string
from langcodes import *

# Download NLTK's Punkt tokenizer models
nltk.download('punkt')
nltk.download('punkt_tab')

# Mount Google Drive to access your text files
from google.colab import drive
drive.mount('/content/drive')

# Set the path to your folder in Google Drive (update the path as needed)
folder_path = '/content/drive/MyDrive/Research/BID-GuaranIA/TXT COREGUAPA'

# Read all .txt files from the folder and combine their contents
all_text = ""
file_names = []
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            all_text += text + "\n"
            file_names.append(filename)

print("Processed files:", file_names)
print("Total Processed files:", len(file_names))
# Tokenize the combined text into sentences and words.
# Using the Spanish sentence tokenizer since punctuation is similar;
# adjust if necessary for your specific text.
sentences = nltk.sent_tokenize(all_text, language='spanish')

def clean_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    return text
all_text = clean_text(all_text)

tokens_a = nltk.word_tokenize(all_text)
tt = TweetTokenizer()
tokens_b = tt.tokenize(all_text)
tokens_c = all_text.replace('\n', ' ').strip().split()

total_tokens_a = len(tokens_a)
total_tokens_b = len(tokens_b)
total_tokens_c = len(tokens_c)
total_sentences = len(sentences)

print(f"Total tokens method A: {total_tokens_a}")
print(f"Total tokens method B: {total_tokens_b}")
print(f"Total tokens method C: {total_tokens_c}")
print(f"Total sentences: {total_sentences}")

# Compute additional statistics
token_freq = collections.Counter(tokens_c)
vocab_size = len(token_freq)
avg_sentence_length = total_tokens_c / total_sentences if total_sentences > 0 else 0

print(f"Vocabulary size (unique tokens): {vocab_size}")
print(f"Average sentence length (in tokens): {avg_sentence_length:.2f}")

# Load FastText language identification model
ft_model = fasttext.load_model('lid.176.bin')

# Initialize counters for language-based word counts
spanish_word_count = 0
guarani_word_count = 0
other_word_count = {}
spanish_sentence_count = 0
guarani_sentence_count = 0
other_sentence_count = {}

def return_gn(ft_prediction):
    ft_prediction = ft_prediction[0]
    # Example label format: '__label__es' for Spanish.
    if '__label__gn' in ft_prediction:
        return 'gn'
    elif '__label__es' in ft_prediction:
        return 'es'
    else:
        return None

# Process each sentence:
# - Predict the language using FastText.
# - Tokenize the sentence into words.
# - Aggregate counts based on the predicted language.
for sentence in sentences:
    # Clean sentence (remove extra newlines/spaces)
    sentence_clean = clean_text(sentence).replace('\n', ' ').strip()
    if not sentence_clean:
        continue
    # FastText returns a tuple with predicted label(s) and probabilities.
    prediction = ft_model.predict(sentence_clean, k=5)

    lang_code = return_gn(prediction)
    #words_in_sentence = nltk.word_tokenize(sentence_clean)
    #words_in_sentence = tt.tokenize(sentence_clean)
    words_in_sentence = sentence_clean.split()
    if lang_code == 'es':
        spanish_word_count += len(words_in_sentence)
        spanish_sentence_count += 1
    elif lang_code == 'gn':  # Assuming 'gn' is returned for Guarani
        guarani_word_count += len(words_in_sentence)
        guarani_sentence_count += 1
    else:
        lang_label = prediction[0][0]
        lang_code = lang_label.replace('__label__', '')
        other_word_count[lang_code] = other_word_count.get(lang_code, 0) + len(words_in_sentence)
        other_sentence_count[lang_code] = other_sentence_count.get(lang_code, 0) + 1

print("\nLanguage-based token counts (by sentence prediction):")
print(f"Spanish -> tokens: {spanish_word_count}, sentences: {spanish_sentence_count}")
print(f"Guarani -> tokens: {guarani_word_count}, sentences: {guarani_sentence_count}")
print(f"Other/unknown languages -> tokens: {sum(other_word_count.values())}, sentences: {sum(other_sentence_count.values())}")
print("Top 10 most common other languages:")
for language, freq in  collections.Counter(other_sentence_count).most_common(10):
    lang_name =  Language.make(language=language).display_name()
    print(f"{lang_name}: {freq}")

# Additional insight: Display the 20 most common tokens
print("\nTop 10 most common tokens:")
for token, freq in token_freq.most_common(10):
    print(f"{token}: {freq}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed files: ['Paraguái Léi Guasu ary 1992.txt', 'LÉI PY 5446 TETÃ REMBIAPORÃITE KUÑA OKARAYGUÁPE G̃UARÃ.txt', 'TEMBIAPOUKAPY PY 2991 MBOJOAPY.txt', 'LÉI Ppy 6530.txt', 'LÉI PY 5777.txt', 'LÉI PY 5016-14.txt', 'LÉI PAPAPY 1334-98.txt', 'Léi 4251-10 Ñe’enguéra Rehegua.txt', 'PARAGUÁI CÓDIGO ELECTORAL.txt', 'Ñañangareko hag̃ua ñande rekoha rehe Ciencias Naturales 6º Grado EEB.txt', 'Ñane ñe’ẽtee Lengua Materna 5° Grado EEB.txt', 'Ñandekatupyry hag̃ua papapykuérape Matematica 6º Grado EEB.txt', 'Jaguerojera hag̃ua mba’e porã Educación Artística 7º Grado EEB.txt', 'Ñañangareko porãve hag̃ua ñande rete rehe Educación Física 7º grado EEB.txt', 'Ñañangareko porãve hag̃ua ñande rete rehe Educación Física 7º grado EEB (1).txt', 'Teratee ojeporúva Paraguái Retäme.txt', 'Jaguerojera hag̃ua mba’e porã Educ. Artística 6º Gr

In [3]:
# '/content/drive/MyDrive/Research/BID-GuaranIA/TXT COREGUAPA'