In [86]:
import whisper
import os
import ssl
import certifi

# Build a TLS context whose trusted CA bundle is the one shipped with certifi.
# NOTE(review): `ssl_context` is not referenced anywhere in the visible code, so
# as written it does not influence any network request — confirm whether it was
# meant to be installed as the default HTTPS context or can be dropped.
ssl_context = ssl.create_default_context(cafile=certifi.where())

def transcribe_audio(file_path, model_size="medium", language="es"):
    """Transcribe an audio file with Whisper and return the recognised text.

    Args:
        file_path: Path of the audio file handed to the model.
        model_size: Model name passed to ``whisper.load_model``.
        language: Language code passed to ``transcribe``.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.
    """
    # The model is loaded afresh on every call.
    recognizer = whisper.load_model(model_size)
    return recognizer.transcribe(file_path, language=language)["text"]

def save_transcript(text, output_file):
    """Write ``text`` to ``output_file`` as UTF-8, replacing any existing file.

    Args:
        text: The string to store.
        output_file: Destination path; it is opened in ``"w"`` mode.
    """
    destination = open(output_file, mode="w", encoding="utf-8")
    try:
        destination.write(text)
    finally:
        # Always release the handle, even if the write fails.
        destination.close()

if __name__ == "__main__":
    # Interactive entry point: ask for a local audio file, transcribe it and
    # store the text next to the audio under the same base name with ".txt".
    audio_path = input("–ü—É—Ç—å –∫ –ª–æ–∫–∞–ª—å–Ω–æ–º—É –∞—É–¥–∏–æ—Ñ–∞–π–ª—É (.mp3, .wav –∏ –¥—Ä.): ").strip()

    if not os.path.isfile(audio_path):
        print(f"–§–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω: {audio_path}")
        # `exit` is an interactive convenience injected by `site`/IPython and is
        # not a dependable way to stop a program; raising SystemExit ends
        # execution right here with status 1.
        raise SystemExit(1)

    print("üß† –†–∞—Å–ø–æ–∑–Ω–∞—ë–º —Ç–µ–∫—Å—Ç (whisper)...")
    transcript = transcribe_audio(audio_path, model_size="medium", language="es")

    # splitext only strips an extension from the last path component, so a dot
    # in a directory name (e.g. "/data/v1.2/audio") can no longer truncate the
    # path the way rsplit(".", 1) did.
    output_file = os.path.splitext(audio_path)[0] + ".txt"
    save_transcript(transcript, output_file)

    print(f"\n‚úÖ –ì–æ—Ç–æ–≤–æ! –¢—Ä–∞–Ω—Å–∫—Ä–∏–ø—Ç —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ {output_file}")
    print("üîé –ü–µ—Ä–≤—ã–µ 300 —Å–∏–º–≤–æ–ª–æ–≤:\n")
    # Preview only; the complete transcript is in the file written above.
    print(transcript[:300])

üß† –†–∞—Å–ø–æ–∑–Ω–∞—ë–º —Ç–µ–∫—Å—Ç (whisper)...





‚úÖ –ì–æ—Ç–æ–≤–æ! –¢—Ä–∞–Ω—Å–∫—Ä–∏–ø—Ç —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ /Users/elizaveta/Documents/–ü—Ä–æ–µ–∫—Ç_2/7septimo.txt
üîé –ü–µ—Ä–≤—ã–µ 300 —Å–∏–º–≤–æ–ª–æ–≤:

 Esta es la historia de Lola y Ana, quien comparten un piso en Barcelona, y de sus vecinos de en frente Pablo y Sam el americano. Lola quiere ser rica, Pablo quiere ser actor y Sam quiere invitar a Ana al cine. Prepárate para el próximo episodio de Extra. No puedo creer que vuelvas de Argentina. ¿Re


In [3]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk import word_tokenize, bigrams

# Load the "es_core_news_sm" spaCy pipeline once at module level; the helper
# functions below call this shared `nlp` object instead of reloading it.
nlp = spacy.load("es_core_news_sm")

def load_and_split_sentences(path):
    """Read a UTF-8 text file and split its contents into sentences with spaCy.

    Args:
        path: Location of the text file to read.

    Returns:
        A ``(sentences, text)`` tuple: the whitespace-stripped, non-empty
        sentence strings in document order, and the unmodified file contents.
    """
    with open(path, encoding="utf-8") as source:
        raw_text = source.read()

    sentences = []
    for span in nlp(raw_text).sents:
        cleaned = span.text.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences, raw_text

def extract_pos_filtered_words(text, allowed_pos={"NOUN", "VERB", "ADJ", "ADV"}):
    """Return the lower-cased words of ``text`` that pass the POS filter.

    A token is kept when its ``pos_`` tag is in ``allowed_pos``, it is not
    flagged as a stop word, and it consists of alphabetic characters only.

    Args:
        text: Raw text to analyse with the shared ``nlp`` pipeline.
        allowed_pos: Collection of POS tags to keep.

    Returns:
        The surviving token texts, lower-cased, in document order.
    """
    # The default set is shared between calls but is only read, never mutated.
    selected = []
    for token in nlp(text):
        if token.pos_ not in allowed_pos:
            continue
        if token.is_stop or not token.is_alpha:
            continue
        selected.append(token.text.lower())
    return selected

def extract_keywords_by_pos(text, allowed_pos={"NOUN", "VERB", "ADJ", "ADV", "PREP"}, top_n=20):
    """Rank the POS-filtered words of ``text`` by their TF-IDF weight.

    The words returned by ``extract_pos_filtered_words`` are joined into one
    document and scored with ``TfidfVectorizer``. Because only a single
    document is fitted, every term has the same IDF, so the ranking is
    effectively by term frequency; ties keep the vectorizer's feature order.

    Args:
        text: Raw text to analyse.
        allowed_pos: POS tags forwarded to ``extract_pos_filtered_words``.
        top_n: Maximum number of keywords to return.

    Returns:
        Up to ``top_n`` words, highest score first. An empty list is returned
        when no scorable word survives the filtering.
    """
    # NOTE(review): "PREP" is not a Universal POS tag (adpositions are "ADP"),
    # so it presumably never matches a spaCy `pos_` value. It is kept so the
    # default stays unchanged — confirm the intent.
    words = extract_pos_filtered_words(text, allowed_pos)

    # TfidfVectorizer's default token pattern keeps only tokens of two or more
    # characters and raises ValueError ("empty vocabulary") when none remain.
    # Return an empty result instead of crashing on empty/stop-word-only text.
    if not any(len(word) > 1 for word in words):
        return []

    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform([" ".join(words)]).toarray()[0]
    ranked = sorted(
        zip(vectorizer.get_feature_names_out(), weights),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [word for word, _ in ranked[:top_n]]

def extract_top_bigrams(text, top_n=20, exclude_words=None):
    """Return the most frequent adjacent word pairs of ``text``.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphabetic are dropped *before* pairing, so a pair may span
    punctuation that stood between two words.

    Args:
        text: Raw text to analyse.
        top_n: Maximum number of bigrams to return.
        exclude_words: Lower-case words; any pair containing one of them is
            discarded. ``None`` means nothing is excluded.

    Returns:
        Up to ``top_n`` strings of the form ``"first second"``, most frequent
        first; equally frequent pairs keep first-occurrence order.
    """
    if exclude_words is None:
        exclude_words = set()

    # Keep alphabetic tokens only.
    tokens = [token for token in word_tokenize(text.lower()) if token.isalpha()]

    # Pair neighbours with zip rather than the module-level `bigrams` helper, so
    # this function keeps working even if a notebook cell rebinds the global
    # name `bigrams` (e.g. to hold a result list).
    pair_counts = Counter(
        pair
        for pair in zip(tokens, tokens[1:])
        # Drop pairs that contain an excluded word.
        if pair[0] not in exclude_words and pair[1] not in exclude_words
    )
    return [" ".join(pair) for pair, _ in pair_counts.most_common(top_n)]


# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–µ–∫—Å—Ç–∞
sentences, text = load_and_split_sentences("1primero.txt")

# Keywords restricted by part of speech.
keywords = extract_keywords_by_pos(text, top_n=50)

# Most frequent bigrams, ignoring pairs that contain a character name.
# The result is stored as `top_bigrams`: assigning it to `bigrams` would
# overwrite the `bigrams` function imported from nltk at the top of this cell.
exclude_names = {"ana", "lola", "sam", "pablo", "antonio"}
top_bigrams = extract_top_bigrams(text, top_n=50, exclude_words=exclude_names)

# Report both lists.
print("–ö–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ (TF-IDF –ø–æ NOUN/VERB/ADJ/ADV):")
print(keywords)

print("\n–ß–∞—Å—Ç–æ—Ç–Ω—ã–µ –±–∏–≥—Ä–∞–º–º—ã:")
print(top_bigrams)

–ö–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ (TF-IDF –ø–æ NOUN/VERB/ADJ/ADV):
['horno', 'español', 'quieres', 'llamo', 'perro', 'museo', 'americano', 'coches', 'chicas', 'vivo', 'correo', 'dormitorio', 'ducha', 'estupendo', 'estás', 'quedar', 'tienes', 'cama', 'carta', 'chica', 'chico', 'compras', 'dije', 'duerme', 'factura', 'gracias', 'hablar', 'queda', 'rápido', 'tomar', 'abajo', 'acabó', 'amigos', 'años', 'baño', 'bici', 'bicicleta', 'celebrarlo', 'corresponsal', 'dices', 'digas', 'dormir', 'echo', 'favor', 'febrero', 'fuertes', 'gusta', 'hablo', 'hombres', 'importa']

–ß–∞—Å—Ç–æ—Ç–Ω—ã–µ –±–∏–≥—Ä–∞–º–º—ã:
['el horno', 'en el', 'el perro', 'me llamo', 'un museo', 'est√° en', 'de am√©rica', 'no no', 'en un', 'perro est√°', 'esto es', 'muy bien', 'es la', 'el correo', 'la factura', 'bien el', 'tomar algo', 'es un', 'quiere decir', 'con coches', 'es el', 'se vaya', 'un poco', 'perro de', 'duerme en', 'de compras', 'compras para', 'en espa√±a', 'le gusta', 'est√°n las', 'una carta', 'te lo', 'se a