### Install the necessary libraries

In [None]:
!pip install pandas seaborn transformers matplotlib  nltk




### Import the Libraries
The NLTK resources `punkt` and `stopwords` are also downloaded in this cell.

In [None]:

from collections import Counter
import string

import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize, sent_tokenize
import pandas as pd
import seaborn as sns
from transformers import pipeline

# Download the 'punkt' tokenizer resource
nltk.download('punkt')
# Recent NLTK releases load the tokenizers from 'punkt_tab' instead of 'punkt'.
# Requesting both keeps the notebook working on old and new versions
# (an unknown package name is reported by nltk.download, not raised).
nltk.download('punkt_tab')
# Download stop words
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Read the files

In [None]:
def read_book(filename):
    """
    Load the complete content of a book from a UTF-8 encoded text file.

    Args: filename(str): The path of the file that needs to be read

    Returns: text(str): A string containing the whole text of the file
    """
    # The context manager closes the file as soon as the content is read
    with open(filename, mode="r", encoding="utf-8") as book_file:
        return book_file.read()

### Sentence tokenization
This function is a helper for the sentiment analysis function.

In [None]:
def sentence_fragmentation(text):
    """
    Split a text into sentences with nltk.sent_tokenize.

    The function is a helper for the sentiment analysis, which only needs
    the first sentence of each text.

    Args: text(str): The text that needs to be broken into sentences

    Returns: sentences(lst): A list containing all the sentences
    """
    return nltk.sent_tokenize(text)

### Word Tokenization using nltk.word_tokenize

In [None]:

def word_fragmentation(text):
    """
    Tokenize a text into words with nltk.word_tokenize.
    Part of the Lexical Analysis in NLP.

    Args: text(str): The text that will be tokenized

    Returns: words(lst): A list of all the tokens in the text
    """
    return nltk.word_tokenize(text)


### Text Normalization
The text normalization section starts here. The tokenized words will be lower-cased, and punctuation and stop words will be removed.

***Remove Punctuation***

In [None]:
def remove_punctuation(text):
    """
    Tokenize a text and drop every token that consists only of punctuation.

    string.punctuation only covers ASCII characters, so it is extended with
    language-specific quotation marks, inverted marks and dashes.

    Args: text(str): The original text from where punctuation will be removed

    Returns: words_without_punctuation(lst): A list of words without punctuation
    """
    # Call the function that tokenizes the text into words
    words = word_fragmentation(text)
    # Language-specific characters that string.punctuation does not contain
    custom_punctuation = "’“”¿¡„ ” — ''``"
    # Test each character against a set. A substring test against the
    # concatenated string would let multi-character punctuation tokens such
    # as "..." or "--" slip through and be counted as words.
    punctuation_chars = set(string.punctuation + custom_punctuation)
    words_without_punctuation = [
        word
        for word in words
        if not all(char in punctuation_chars for char in word)
    ]

    # Return the words without punctuation
    return words_without_punctuation


***Lower Case Words***

In [None]:
def small_letters(text):
    """
    Lower-case all the words of a text. Part of the Normalization process in NLP.

    The text is tokenized and stripped of punctuation by remove_punctuation
    before the remaining words are lower-cased.

    Args: text(str): The text that will be lower-cased

    Returns: lowered_cased_words(lst): A list of all words lower-cased
    """
    return [token.lower() for token in remove_punctuation(text)]

***Remove Stop Words. Language Specific function.***

In [None]:
def remove_stop_words(text, language):
    """
    Remove the stop words of a text. Part of the Lexical Analysis/Normalization in NLP.

    Args: text(str): The original text that needs to be analyzed
          language(str): The language in which the text is written
                         (english, spanish or romanian, in any letter case)

    Returns: filtered_words(lst): The lower-cased words of the text without
                                  punctuation and without stop words

    Raises: ValueError: If the given language is not accepted
    """
    # Stop-word matching does not need to be case sensitive,
    # so the lower-cased, punctuation-free words are used
    lowered_words = small_letters(text)

    # The accepted language names are also the names of the NLTK stop-word lists
    language_key = language.lower()
    if language_key not in ("english", "spanish", "romanian"):
        raise ValueError("Language can only be: english, spanish or romanian")

    # A set keeps the membership test cheap for every word of the book
    stop_words = set(nltk.corpus.stopwords.words(language_key))

    return [word for word in lowered_words if word not in stop_words]

### Vocabulary richness.
The richness of the vocabulary is calculated with the type-token ratio (TTR): TTR = number of types / number of tokens. The code reports the ratio as a percentage (multiplied by 100).

***Extract types/unique words***

In [None]:
def unique_words(text):
    """
    Collect the distinct words (types) of a text.

    The words are lower-cased first, so "Hello" and "hello" count as the same word.

    Args:   text(str): The text that is analyzed

    Returns: unique_words(set): A set of unique words
    """
    return set(small_letters(text))

***Calculate the vocabulary richness***

In [None]:
def vocabulary_richness(text):
    """
    Determine the vocabulary richness of a book with the Type-Token Ratio formula:
    number of unique words (types) / number of words (tokens), as a percentage.

    Args: text(str): The text that will be analyzed

    Returns: vocab_richness(float): The type-token ratio as a percentage,
                                    or 0.0 when the text contains no words
    """
    # Tokenize only once; the types are derived from the same list of tokens
    all_words = small_letters(text)
    # A text without words would otherwise cause a division by zero
    if not all_words:
        return 0.0
    # The unique words (types) of the text
    set_words = set(all_words)
    # Calculate TTR using the formula
    vocab_richness = len(set_words) / len(all_words) * 100
    return vocab_richness

### Visualization
This project aims to extract the most used words and visualize them in a bar chart using Seaborn and Pandas. First the top 10 words will be extracted and saved in a DataFrame, then they will be visualized.


***Extract Top 10 Words***

In [None]:

def word_frequency(text, language):
    """
    Find the 10 most used words of a text, ignoring stop words.

    Args: text(str): The text that needs to be analyzed
          language(str): The language in which the text is written

    Returns: df(dataFrame): A dataframe with the columns "Word" and "Frequency"
    """
    # Lower-cased words without punctuation and without stop words
    content_words = remove_stop_words(text, language)

    # (word, count) pairs of the 10 most frequent words
    top_pairs = Counter(content_words).most_common(10)

    # One row per word, with named columns
    return pd.DataFrame(top_pairs, columns=["Word", "Frequency"])

***Plot the Visualization, save the image and show it.***

In [None]:
def _plot_top_words(axis, top_words, title):
    """Draw the word-frequency bar chart of one text on the given axis."""
    sns.barplot(ax=axis, data=top_words, x="Word", y="Frequency")
    # Rotate the word labels for better visibility
    axis.tick_params(axis="x", labelrotation=90)
    axis.set_title(title)
    axis.set_xlabel("Word")
    axis.set_ylabel("Frequency")


def visualization_bar_chart(top_ten_original, top_ten_translation, figure_name, language):
    """
    Creates a bar chart visualization using seaborn containing the top 10 most used words
    in the Original Version and the Romanian Translation side by side.
    The figure is saved to disk and then shown.

    Args: top_ten_original(Dataframe): The dataframe containing the words in the original version
          top_ten_translation(Dataframe): The dataframe containing the words in the romanian translation
          figure_name(str): The name under which the figure will be saved
          language(str): The language of the original book

    """
    # One row with two charts: original on the left, translation on the right
    figure, axes = plt.subplots(1, 2)

    _plot_top_words(axes[0], top_ten_original, f"{language} Original")
    _plot_top_words(axes[1], top_ten_translation, " Romanian")

    # Make room for the rotated labels, which may otherwise be clipped in the saved image
    figure.tight_layout()

    # Save the figure in a png format with increased quality
    figure.savefig(figure_name, dpi=300)
    # Show the figure
    plt.show()

### Sentiment/Emotion Analysis
Credits to [Daveni](https://huggingface.co/daveni/twitter-xlm-roberta-emotion-es) for the Spanish Emotion Model, [Bhadresh-Savani](https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion) for the English Emotion Model, [Alexandra Ciobanu](https://github.com/Alegzandra) for the data sets provided, and to [Venelin Valkov](https://www.youtube.com/channel/UCoW_WzQNJVAjxo4osNAxd_g) for the wonderful and easy to follow tutorial and explanations.

In [None]:

def emotion_analysis_first_sentence(text, language):
    """
    Function that showcases the emotion analysis for the first sentence of the text.
    Each language uses a different emotion detection model.
    For Spanish credits are given to Daveni, and the sentiments are: "sadness", "joy", "anger", "surprise",
    "disgust", "fear", "others".
    For English credits are due to Bhadresh-Savani, and the sentiments are: "sadness", "joy", "love", "anger",
    "fear", "surprise".
    For Romanian I have developed my own model using Venelin Valkov's tutorial and Alexandra's Ciobanu datasets.
    Please see more details in the report of the code. The sentiments are: "Neutral", "Joy", "Anger", "Fear",
    "Sadness".

    Args: text(str): The text that needs to be analyzed
          language(str): The language of the text that needs to be analyzed
                         (english, spanish or romanian, in any letter case)

    Returns: emotion(lst): The pipeline output for the first sentence. The pipeline is called
                           with a list, so the result is presumably a list holding one
                           label/score entry.

    Raises: ValueError: If the given language is not accepted

    """
    # The emotion model used for each supported language
    models = {
        "spanish": "daveni/twitter-xlm-roberta-emotion-es",
        "english": "bhadresh-savani/distilbert-base-uncased-emotion",
        "romanian": "cristinaale/romanianlastemotion1",
    }

    # Reject unsupported languages before any model is loaded; without this
    # check the function failed later with an unbound `pipe` variable
    model_name = models.get(language.lower())
    if model_name is None:
        raise ValueError("Language can only be: english, spanish or romanian")

    pipe = pipeline("text-classification", model=model_name)

    # Call the function that returns a list of sentences from the text
    sentences = sentence_fragmentation(text)
    # Keep the first sentence as a one-element list (empty list for an empty text)
    first_sentence = sentences[:1]

    # Apply emotion analysis to the first sentence
    emotion = pipe(first_sentence)
    return emotion


### Main Function of the program.
The printing takes place in `main`, and each language/book pair is analyzed together in its own chunk of code.
  

In [None]:
def _analyze_book_pair(
    original_path, translation_path, original_language, translation_language, figure_name
):
    """Print word counts, vocabulary richness and first-sentence emotion for an original/translation pair and plot their top 10 words."""
    # Labels used in the printed messages, e.g. "Spanish" / "Romanian"
    original_label = original_language.capitalize()
    translation_label = translation_language.capitalize()

    # Read the original book and its translation
    original_text = read_book(original_path)
    translation_text = read_book(translation_path)

    # Number of words of each text, counted without punctuation
    print(f"The {original_label} Book has:", len(remove_punctuation(original_text)), "words")
    print(f"The {translation_label} book has:", len(remove_punctuation(translation_text)), "words")
    # Blank line for aesthetics
    print()

    # Vocabulary richness (type-token ratio) of both texts
    original_richness = vocabulary_richness(original_text)
    translation_richness = vocabulary_richness(translation_text)
    print(f"The {original_label} text has a vocabulary richness of", original_richness)
    print()
    print(f"The {translation_label} text has a vocabulary richness of", translation_richness)
    print()

    # DataFrames with the top 10 most used words, plotted side by side
    original_top_10 = word_frequency(original_text, original_language)
    translation_top_10 = word_frequency(translation_text, translation_language)
    visualization_bar_chart(original_top_10, translation_top_10, figure_name, original_language)

    # Emotion analysis of the first sentence of each text
    original_sentiment = emotion_analysis_first_sentence(original_text, original_language)
    translation_sentiment = emotion_analysis_first_sentence(
        translation_text, translation_language
    )
    print(f"The {original_label} Sentiment Analysis:", original_sentiment)
    print(f"The {translation_label} Sentiment Anaalysis:", translation_sentiment)


def main():
    """
    Run the analysis for both book pairs: the Spanish book with its Romanian
    translation, then the English book with its Romanian translation.
    """
    # Define variables where to save the language for easier usage
    language1 = "spanish"
    language2 = "romanian"
    language3 = "english"

    # Spanish-Romanian comparison
    _analyze_book_pair(
        "/content/drive/MyDrive/Python_Final/spanish_text.txt",
        "/content/drive/MyDrive/Python_Final/romanian_spanish.txt",
        language1,
        language2,
        "romanian_spanish.png",
    )

    # English-Romanian comparison.
    # The English emotion model now receives the English text; it was
    # previously given the Spanish text by mistake.
    _analyze_book_pair(
        "/content/drive/MyDrive/Python_Final/english_text.txt",
        "/content/drive/MyDrive/Python_Final/romanian_english.txt",
        language3,
        language2,
        "romanian_english.png",
    )



### Calling the Main

In [None]:
# Run the analysis only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()
