In [None]:
# install SpaCy
# !pip install spacy
# download the model es_dep_news_trf.
!python3 -m spacy download es_dep_news_trf

In [None]:
'''
Q1 & Q2
Install SpaCy and let it download the model es_dep_news_trf.
Create a new Jupyter notebook and import spacy. 
Load the contents of the file azuela1920_los-de-abajo.txt into a single string, and run the model on it. 
Extract the sentence spans, and store their contents into a text file.
'''

# import spacy
import spacy

# Location of the novel's plain-text file (also used by later cells).
file_path = "/Users/ychen/DS_Assign/azuela1920_los-de-abajo.txt"

# Spanish transformer pipeline shared by the cells below.
nlp = spacy.load("es_dep_news_trf")

# Read the entire file into one string; newlines are kept as they are.
with open(file_path, "r", encoding="utf-8") as source:
    text = source.read()

# Parse the raw text and materialise the sentence spans as a list.
doc = nlp(text)
sentence_spans = [sentence for sentence in doc.sents]

# Write every sentence's text to disk, each followed by a newline character.
with open("sentence_spans.txt", "w", encoding="utf-8") as sink:
    sink.writelines(sentence.text + "\n" for sentence in sentence_spans)

In [None]:
'''
Q3
Repeat the process, but this time, replace all newline characters in the text by a space before running it through the pipeline. 
Store the sentence spans in a file again, and investigate the differences. 
What do you notice? Use machine translation in case you don’t understand enough. 
For the following tasks, we will stick to the version without newlines.
'''

# Re-read the file, then turn every newline character into a single space
# so the pipeline sees one continuous line of text.
with open(file_path, "r", encoding="utf-8") as source:
    novel_text = source.read()
text_modified = novel_text.replace("\n", " ")

# Parse the flattened text and materialise its sentence spans as a list.
doc_modified = nlp(text_modified)
sentence_spans_modified = [sentence for sentence in doc_modified.sents]

# Write the new segmentation to its own file, one sentence text plus "\n" each,
# so it can be compared with the output of the newline-preserving run.
with open("sentence_spans_modified.txt", "w", encoding="utf-8") as sink:
    sink.writelines(sentence.text + "\n" for sentence in sentence_spans_modified)
'''
Differences:
'''

In [None]:
'''
Q4
Familiarise yourself with the different properties of the Token object, 
and extract all pairs of full verbs (UD tag VERB) and their subjects (relation nsubj) in lemmatised form, 
storing them for later processing. In my solution, the first pair is (’decir’,’parte’) (which is actually wrong).
'''
def _extract_verb_subject_pairs(parsed_doc):
    """Collect verb/subject pairs, in lemmatised form, from a parsed document.

    Args:
        parsed_doc: An iterable of spaCy tokens (a ``Doc`` or ``Span``).

    Returns:
        A list of ``(verb_lemma, subject_lemmas)`` tuples in document order,
        one per token tagged ``VERB`` that has at least one child attached
        with the ``nsubj`` relation. ``subject_lemmas`` is a list because a
        verb can have more than one ``nsubj`` child.
    """
    pairs = []
    for token in parsed_doc:
        if token.pos_ != "VERB":
            continue
        # Scan the children once; the task asks for lemmas, not surface forms.
        subject_lemmas = [
            child.lemma_ for child in token.children if child.dep_ == "nsubj"
        ]
        if subject_lemmas:
            pairs.append((token.lemma_, subject_lemmas))
    return pairs


# Use the parse of the newline-free text: the Q3 instructions state that all
# following tasks stick to the version without newlines.
verb_subject_pairs = _extract_verb_subject_pairs(doc_modified)

# Print the first few pairs for verification
print(verb_subject_pairs[:5])

# Store the verb-subject pairs in a file for later processing:
# one line per verb, "<verb>, <subject>[, <subject>...]".
with open("verb_subject_pairs.txt", "w", encoding="utf-8") as output_file:
    for verb_lemma, subject_lemmas in verb_subject_pairs:
        output_file.write(f"{verb_lemma}, {', '.join(subject_lemmas)}\n")

In [None]:
'''
Q5
Extract the ten most common verbs occurring in your pairs, 
and the three most common subjects for the following verbs: 
gritar “to shout”, preguntar “to ask”, and responder “to answer”. 
Who shouts the most, who asks the most questions, and who appears to answer them?
'''
from collections import Counter

# Every verb lemma, once per extracted pair, so frequencies can be counted.
all_verbs = [verb for verb, _ in verb_subject_pairs]

# Get the ten most common verbs
common_verbs = [verb for verb, _ in Counter(all_verbs).most_common(10)]

# One subject counter per verb of interest
subjects_for_gritar = Counter()
subjects_for_preguntar = Counter()
subjects_for_responder = Counter()
subject_counters = {
    "gritar": subjects_for_gritar,
    "preguntar": subjects_for_preguntar,
    "responder": subjects_for_responder,
}

# Count subjects for the three verbs regardless of whether the verb is among
# the ten most common ones; gating on `common_verbs` would silently leave a
# counter empty whenever its verb falls outside the top ten.
for verb, subjects in verb_subject_pairs:
    counter = subject_counters.get(verb)
    if counter is not None:
        # `subjects` is the list of subjects attached to this verb occurrence.
        counter.update(subjects)

# Get the three most common subjects for each specified verb
top_subjects_gritar = subjects_for_gritar.most_common(3)
top_subjects_preguntar = subjects_for_preguntar.most_common(3)
top_subjects_responder = subjects_for_responder.most_common(3)

# Print results
print("Ten most common verbs:", common_verbs)
print("\nTop subjects for 'gritar':", top_subjects_gritar)
print("Top subjects for 'preguntar':", top_subjects_preguntar)
print("Top subjects for 'responder':", top_subjects_responder)


In [None]:
'''
We are now going to analyse the difference between foreground and background events, 
using the distinction between the two tenses indefinido (which is said to be used for events which advance the storyline) 
and the imperfecto (used for background circumstances and events).

6) Repeat the extraction of verb-subject pairs for the verb forms in both tenses (they are distinguished in UD by the values Past and Imp of the feature Tense). Which verbs occur more than five times in this novel, but exclusively denote foreground or background events? Based on their translations, do the results make sense? What does the plot appear to be centered on?
7) Can you conclude from your data about the most frequent subjects who the protagonists of the plot are? Are there conspicuous differences in the rankings of people who are the agents in foreground and background events?
'''