<a href="https://www.kaggle.com/code/diaconumadalina/summarization-reviews?scriptVersionId=158488335" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## 2 Setup

### 2.1 Import and configure libraries 

In [None]:
# Data manipulation libraries
import pandas as pd
import spacy
from spacy import displacy # is used for visualizing the dependency parse tree and named entity recognition (NER) annotations.

# General Imports
import time

# Data modeling libraries
from sklearn.model_selection import train_test_split

# text processing and cleaning
import re # This line imports the regular expression (regex) module, which provides functions for working with regular expressions. 
import nltk
from nltk.stem import WordNetLemmatizer #  is used for lemmatization, which is the process of reducing words to their base or root form: from running to run
from nltk.corpus import stopwords #The stopwords corpus from NLTK contains common words that are often removed from text during text preprocessing/ These words (like 'and', 'the', 'is', etc.) are considered as noise in many natural language processing tasks 

# Display in jupyter
# from IPython.core.display import display, HTML
# # Set the width of the output cell
# display(HTML("<style>.container { width:80% !important; }</style>"))


### 2.2 Constants and helper functions 

#### Constants

#### Helper functions

In [None]:
def load_dataset_from_json(json_file_path):
    """
    Read a JSON document into a pandas DataFrame.

    :param json_file_path (str): Path to the JSON file.
    :return: pd.DataFrame: DataFrame containing the loaded data.
    """
    return pd.read_json(json_file_path)


### 2.3 Set dataframe 

In [None]:
# Load the raw reviews from the Kaggle input directory and preview the first three rows.
df = load_dataset_from_json("/kaggle/input/amazon-one-plus-reviews/amazon_one_plus_reviews.json")
df.head(3)

In [None]:
# conda update --all

In [None]:
# Print the column names, dtypes and non-null counts of the raw dataset.
df.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# The dataset contains reviews for 3 products; to start, our application needs just one of them.
# NOTE(review): the original comment attributed the dataset to "worlbank" (World Bank?), but the
# input path points to an Amazon reviews file - confirm the actual source.
df['product'].value_counts()

In [None]:
# Keep only the reviews of a single product, then narrow the frame to the three
# columns used later on and give them friendlier names.
mask = df['product'] == 'Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage)'
column_names = {'reviewed_at': 'Date', 'review_text': 'Review', 'review_title': 'Summary'}
df = (
    df[mask]
    .reset_index(drop=True)[list(column_names)]
    .rename(columns=column_names)
)

In [None]:
# Format the date as "30 August 2021"; missing dates become an empty string.
def _format_review_date(value):
    """Return *value* rendered like "30 August 2021", or '' when it is null."""
    if pd.isnull(value):
        return ''
    return value.strftime('%d %B %Y')


df['Date'] = df['Date'].apply(_format_review_date)

### 3 EDA

##### For this analysis I will use the product named Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage) (13934 reviews)


In [None]:
# Preview the first three rows after filtering and renaming.
df.head(3)

In [None]:
# Re-check dtypes and non-null counts on the filtered frame.
df.info()

### 4 Training Setup

### 4.1 Train, Test split

##### Train, Test split

In [None]:
# Drop the review titles, then hold out 20% of the rows as a test set.
# random_state is fixed so the split is reproducible between runs.
X = df.drop(columns=["Summary"])
X_train, X_test = train_test_split(X, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

##### Sentence Tokenization


In [3]:
def split_review_custom_delimiters(text):
    """
    Split a review into smaller fragments on custom delimiters.

    The delimiters are the full stop and the whole words "but", "and" and
    "also". Word delimiters are matched on word boundaries only, so words
    that merely contain them ("android", "button", "brand") stay intact.
    Matching is case-sensitive.

    Args:
        text (str): The input text to be split. Non-string values (for
            example NaN coming from a DataFrame) yield an empty list.
    Returns:
        list: The non-empty fragments, each stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        return []

    word_delimiters = ("but", "and", "also")
    # '\.' splits on sentence ends; '\b(?:...)\b' splits on whole words only.
    regex_pattern = r"\.|\b(?:" + "|".join(map(re.escape, word_delimiters)) + r")\b"

    fragments = (fragment.strip() for fragment in re.split(regex_pattern, text))
    # Consecutive delimiters leave empty fragments behind; drop them.
    return [fragment for fragment in fragments if fragment]

##### Data cleaning

In [None]:
# Fetch the NLTK stop-word corpus used by the cleaning step below.
nltk.download('stopwords')

In [None]:
lemma = WordNetLemmatizer()
all_stopwords = set(stopwords.words('english'))

# Despite the name, these are the words to KEEP in the text: they are taken out
# of the stop-word set below (presumably because negations, comparisons and
# direction words matter for sentiment - confirm with the author).
custom_stopwords = ['not', 'but', 'because', 'against', 'between', 'up', 'down', 'in', 'out', 'once', 'before','after', 'few', 'more', 'most', 'no', 'nor', 'same', 'some']

# difference_update() silently skips words that are absent from the installed
# NLTK list, whereas set.remove() would raise KeyError on the first miss.
all_stopwords.difference_update(custom_stopwords)

def clean_aspect_spacy(reviews):
    """
    Normalise a piece of review text for downstream NLP.

    The text is lower-cased, common English contractions are expanded,
    every non-letter character is replaced by a space, stop words are
    dropped and the remaining words are lemmatized.

    Relies on two module-level objects defined in this notebook:
    ``lemma`` (a WordNetLemmatizer) and ``all_stopwords`` (the set of
    stop words to remove).

    Args:
        reviews (str): The text to be cleaned.

    Returns:
        str: The cleaned text, words separated by single spaces.
    """
    text = reviews.lower()

    # Typographic apostrophes would otherwise bypass the contraction table
    # (e.g. "don’t" would lose its negation once punctuation is stripped).
    text = text.replace("\u2019", "'")

    # Order matters: the specific forms ("won't", "can't") must be expanded
    # before the generic "n't" suffix.
    contractions = {
        "won't": "will not",
        "cannot": "can not",
        "can't": "can not",
        "n't": " not",
        "what's": "what is",
        "it's": "it is",
        "'ve": " have",
        "i'm": "i am",
        "'re": " are",
        "he's": "he is",
        "she's": "she is",
        "*****": " ",
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)

    # Replace special characters and digits with spaces; split() below
    # collapses any run of whitespace, so no separate squeeze step is needed.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # all_stopwords is already a set, so membership tests are O(1) without
    # rebuilding it for every word.
    cleaned_words = [lemma.lemmatize(word) for word in text.split() if word not in all_stopwords]

    return ' '.join(cleaned_words)

##### Reconstruct the DataFrame

In [None]:
# Fetch the WordNet data (and the Open Multilingual WordNet package) for the WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# pip install --upgrade nltk

In [None]:
def split_and_clean_reviews(df):
    """
    Split every review into fragments, drop the short ones and clean the rest.

    Each review is split with ``split_review_custom_delimiters``; fragments
    with fewer than three words are discarded; the surviving fragments are
    cleaned with ``clean_aspect_spacy``. The date of the originating review
    is repeated (as a string) for each of its fragments. Rows whose review
    is not a string (for example NaN) are skipped.

    Parameters:
    - df (DataFrame): Input DataFrame with 'Review' and 'Date' columns.

    Returns:
    DataFrame: A new DataFrame with 'Date' and 'Review' columns, one row per kept fragment.
    """
    reviews = []
    dates = []

    # Materialise both columns once instead of re-reading df["Date"].values
    # for every fragment.
    review_values = df["Review"].values
    date_values = df["Date"].values

    for review_text, review_date in zip(review_values, date_values):
        if not isinstance(review_text, str):
            # Missing review text: nothing to split.
            continue

        # Filter out components with fewer than three words
        fragments = [
            fragment
            for fragment in split_review_custom_delimiters(review_text)
            if len(fragment.split()) >= 3
        ]

        reviews.extend(fragments)
        # Repeat the date so that both lists stay aligned row by row.
        dates.extend([str(review_date)] * len(fragments))

    # Apply the text cleaning function to each kept fragment
    cleaned_reviews = [clean_aspect_spacy(text) for text in reviews]

    return pd.DataFrame({"Date": dates, "Review": cleaned_reviews})

In [None]:
# pip install --upgrade nltk

In [None]:
# Train set

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
train_data = split_and_clean_reviews(X_train)
elapsed_time = time.perf_counter() - start_time

print(f"The time difference is: {elapsed_time} seconds")

In [None]:
# Test set

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
test_data = split_and_clean_reviews(X_test)
elapsed_time = time.perf_counter() - start_time

print(f"The time difference is: {elapsed_time} seconds")

In [None]:
# Preview the first three cleaned fragments of the training set.
train_data.head(3)

In [None]:
# Preview the first three cleaned fragments of the test set.
test_data.head(3)

##### Aspect extraction -  process of identifying and extracting specific aspects, features, or attributes from textual data

In [None]:
def apply_extraction(row, nlp):
    """
    Collect aspect/description pairs from one review by running each of the
    seven dependency-based extraction rules on every token.

    Args:
        row (pd.Series): A row from a DataFrame containing the 'Review' column.
        nlp (spacy.Language): The spaCy NLP pipeline.

    Returns:
        dict: ``{"aspect_pairs": [...]}`` holding the pairs in token order and,
        within one token, in rule order (1 to 7).

    """
    prod_pronouns = ['it', 'this', 'they', 'these']
    parsed_review = nlp(row['Review'])

    extracted = []
    for token in parsed_review:
        # Every rule returns a (possibly empty) list of pairs for this token.
        for rule in (rule1, rule2, rule3, rule4, rule5, rule6, rule7):
            extracted.extend(rule(token, prod_pronouns))

    return {"aspect_pairs": extracted}

## The rules below extract aspects (A) and sentiment modifiers (M) based on the specified dependency relationships

The notes below explain the dependency-parsing terms used by the rules and illustrate them, including the head of each token's children, on the sentence "She quickly ate the delicious cake."

**A. Dependency Parsing Concepts:**

In simple terms:

**1. Child: a word attached to another word (its head) through a labelled relation such as nsubj, advmod or dobj**
- Think of words in a sentence like members of a team. Each word has a specific role, and it's connected to another word that guides or influences it.
- The connection between words has a special label that tells us the job or role of each word in relation to the other.

**2. Head of a Token: the "boss" word that determines what role each word plays in the sentence**
- Imagine words in a sentence as a group, and each word has a leader. This leader (head) determines what role each word plays in the sentence.
- The leader word is like the boss, directing others and making sure everything fits together properly.

In short, a child is a team member with a specific role, connected to another word that guides it. The head is the leader that directs and organizes the roles of all the words in the sentence.

So, in simpler terms, a child is like a team member with a specific role, and the head is like the boss, guiding and influencing the roles of others in the sentence.

**3. Head of Children:**
   - Refers to the word that governs the grammatical relationship with the children of a token.

**B. Example Application on "She quickly ate the delicious cake":**

1. **"She":**
   - Child: None (no dependents)
   - Head: "ate"
   - Head of Children: None (no children)

2. **"Ate":**
   - Children: "She" (nsubj), "quickly" (advmod), "cake" (dobj)
   - Head: None (root of the dependency tree)
   - Head of Children: "ate" itself (it is the head of "She", "quickly" and "cake")

3. **"Quickly":**
   - Child: None (no dependents)
   - Head: "ate"
   - Head of Children: None (no children)

4. **"The":**
   - Child: None (no dependents)
   - Head: "cake"
   - Head of Children: None (no children)

5. **"Delicious":**
   - Child: None (no dependents)
   - Head: "cake"
   - Head of Children: None (no children)

6. **"Cake":**
   - Children: "The" (det), "delicious" (amod)
   - Head: "ate"
   - Head of Children: "cake" itself (it is the head of "The" and "delicious")

**Summary of Relationships:**
- "She" has "ate" as its head. No children.
- "Ate" has "She" (nsubj), "quickly" (advmod), and "cake" (dobj) as its children and has no head (it is the root). The head of those children is "ate" itself.
- "Quickly" has "ate" as its head. No children.
- "The" and "delicious" both have "cake" as their head. No children.
- "Cake" has "The" (det) and "delicious" (amod) as its children and "ate" as its head. The head of those children is "cake" itself.

This breakdown provides a more detailed view of the relationships between each token, its head, and the children of each token, including the head of the children.

In [None]:
def rule1(token, prod_pronouns):
    """
    Apply Rule 1: Extract aspect and its corresponding description from the review.

    **Adjectival Modifier Rule:**

    Fires when ``token`` is an adjectival modifier (``amod``) that is not a
    stop word. The aspect (A) is the text of the token's head and the
    sentiment modifier (M) is the token's own text, e.g. in "good sound"
    A = "sound" and M = "good".

    Two refinements are applied to M:
    * adverbial modifiers (``advmod``) of the adjective are placed in front
      of it, in their original order ("most comfortable");
    * if the head noun has the determiner "no" ("no interesting characters"),
      M is prefixed with "not".

    Args:
        token (spacy.Token): The input token.
        prod_pronouns (list): Pronouns to be replaced with "product" when they are the aspect.

    Returns:
        list: A one-element list with a dict holding the aspect ('noun'), the
        sentiment modifier ('adj') and the rule number (1), or an empty list
        when the rule does not apply.

    """
    # Only non-stop-word adjectival modifiers can yield a pair.
    if token.dep_ != "amod" or token.is_stop:
        return []

    aspect = token.head.text

    # Adverbs go BEFORE the adjective so the phrase reads naturally
    # ("most comfortable", not "comfortable most").
    adverbs = [child.text for child in token.children if child.dep_ == "advmod"]
    modifier = " ".join(adverbs + [token.text])

    # Negation expressed through the determiner "no" on the head noun.
    negated = any(
        sibling.dep_ == "det" and sibling.text == "no"
        for sibling in token.head.children
    )
    if negated:
        modifier = f"not {modifier}"

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": modifier, "rule": 1}]

In [None]:
def rule2(token, prod_pronouns=None):
    """
    Apply Rule 2: Extract aspect and its corresponding description from the review.

    **Direct Object Rule:**

    Looks at the children of ``token`` (typically a verb): the nominal
    subject (``nsubj``) is the aspect (A) and an adjective acting as direct
    object (``dobj`` with part-of-speech ADJ) is the sentiment modifier (M).
    Stop words are ignored for both. If the token also has a negation child
    (``neg``), its text is prefixed to M.
    It assumes that a verb has only one nominal subject and one direct object;
    if there are several, the last one wins.

    Args:
        token (spacy.Token): The input token.
        prod_pronouns (list, optional): Pronouns to be replaced with "product"
            when they are the aspect. Defaults to it/this/they/these.

    Returns:
        list: A one-element list with a dict holding the aspect ('noun'), the
        sentiment modifier ('adj') and the rule number (2), or an empty list
        when no valid pair is found.
    """
    if prod_pronouns is None:
        prod_pronouns = ("it", "this", "they", "these")

    aspect = None
    modifier = None
    neg_prefix = None

    # Scan ALL children first; the pair is only assembled after the loop.
    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text

        if child.dep_ == "dobj" and child.pos_ == "ADJ" and not child.is_stop:
            modifier = child.text

        if child.dep_ == "neg":
            neg_prefix = child.text

    if aspect is None or modifier is None:
        return []

    # Apply the negation exactly once, after the modifier is final.
    if neg_prefix is not None:
        modifier = f"{neg_prefix} {modifier}"

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": modifier, "rule": 2}]

In [None]:
def rule3(token, prod_pronouns):
    """
    Apply Rule 3: Extract aspect and its corresponding description from the review.

    **Adjectival Complement Rule:**

    Among the children of ``token``, the nominal subject (``nsubj``) is taken
    as the aspect (A) and the adjectival complement (``acomp``) as the
    sentiment modifier (M); stop words are ignored for both. A modal
    auxiliary child (``aux`` tagged ``MD``, as in "The sound of the speakers
    could be better") adds the prefix "not" to M, and a negation child
    (``neg``) adds its own text as prefix. When both occur, the one that
    comes later among the children decides the prefix.

    Args:
        token (spacy.tokens.Token): The token to analyze.
        prod_pronouns (list): List of product pronouns.

    Returns:
        list: A list of dictionaries containing aspect, sentiment modifier, and rule number (3).
    """
    unset = "999999"  # placeholder meaning "nothing found yet"
    aspect = modifier = unset
    prefix = None

    for dependent in token.children:
        relation = dependent.dep_

        if relation == "nsubj" and not dependent.is_stop:
            aspect = dependent.text

        if relation == "acomp" and not dependent.is_stop:
            modifier = dependent.text

        # Example: 'this could have been better' -> (this, not better)
        if relation == "aux" and dependent.tag_ == "MD":
            prefix = "not"

        if relation == "neg":
            prefix = dependent.text

    # Without both halves of the pair there is nothing to report.
    if aspect == unset or modifier == unset:
        return []

    if prefix is not None:
        modifier = f"{prefix} {modifier}"

    noun = "product" if aspect in prod_pronouns else aspect
    return [{"noun": noun, "adj": modifier, "rule": 3}]


In [None]:
def rule4(token, prod_pronouns=None):
    """
    Apply Rule 4: Extract aspect and its corresponding description from the review.

    **Passive Adverbial Modifier Rule:**

    Among the children of ``token``, the nominal subject (``nsubj``) or
    passive nominal subject (``nsubjpass``) is the aspect (A) and the
    adverbial modifier (``advmod``) is the sentiment modifier (M); stop words
    are ignored for both. If that adverb is itself modified by another
    adverb, the first such adverb is placed in front of it ("very fast").
    A negation child (``neg``) adds its text as a prefix to M.
    It assumes a verb has a single nominal subject; if several children
    match, the last one wins.

    Args:
    - token (spacy.Token): A token from a processed Spacy document.
    - prod_pronouns (list, optional): Pronouns to be replaced with "product"
      when they are the aspect. Defaults to it/this/they/these.

    Returns:
    - list: A list containing dictionaries with extracted aspect-sentiment pairs based on Rule 4. Each dictionary has keys 'noun', 'adj', and 'rule'.
            If no valid aspect-sentiment pair is found, an empty list is returned.
    """
    if prod_pronouns is None:
        prod_pronouns = ("it", "this", "they", "these")

    aspect = None
    modifier = None
    neg_prefix = None

    for child in token.children:
        if child.dep_ in ("nsubjpass", "nsubj") and not child.is_stop:
            aspect = child.text

        if child.dep_ == "advmod" and not child.is_stop:
            modifier = child.text
            # An adverb modifying the main adverb goes in front of it.
            intensifier = next(
                (grandchild.text for grandchild in child.children if grandchild.dep_ == "advmod"),
                None,
            )
            if intensifier is not None:
                modifier = f"{intensifier} {child.text}"

        if child.dep_ == "neg":
            neg_prefix = child.text

    if aspect is None or modifier is None:
        return []

    if neg_prefix is not None:
        modifier = f"{neg_prefix} {modifier}"

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": modifier, "rule": 4}]


In [None]:
def rule5(token, prod_pronouns=None):
    """
    Apply Rule 5: Extract aspect and its corresponding description from the review.

    **Copular Verb Complement Rule:**

    Fires when ``token`` has both a nominal subject child (``nsubj``) and a
    copula child (``cop``), neither of which is a stop word. The subject is
    the aspect (A) and ``token`` itself, the complement of the copula, is
    the sentiment modifier (M).
    It assumes that a verb has only one nominal subject; if there are
    several, the last one wins.

    For example, in the sentence "The product is durable," the copula is "is,"
    and the nominal subject is "The product."

    Args:
    - token (spacy.Token): A token from a processed Spacy document.
    - prod_pronouns (list, optional): Pronouns to be replaced with "product"
      when they are the aspect. Defaults to it/this/they/these.

    Returns:
    - list: A list containing dictionaries with extracted aspect-sentiment pairs based on Rule 5.
            Each dictionary has keys 'noun', 'adj', and 'rule'.
            If no valid aspect-sentiment pair is found, an empty list is returned.
    """
    if prod_pronouns is None:
        prod_pronouns = ("it", "this", "they", "these")

    aspect = None
    has_copula = False

    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text

        # NOTE(review): common copulas such as "is" are usually flagged as
        # stop words, in which case this condition never holds - confirm
        # that the stop-word filter is intended here.
        if child.dep_ == "cop" and not child.is_stop:
            has_copula = True

    if aspect is None or not has_copula:
        return []

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": token.text, "rule": 5}]


In [None]:
def rule6(token, prod_pronouns=None):
    """
    Apply Rule 6: Extract aspect and its corresponding description from the review.

    **Interjection Rule:**

    Fires when ``token`` is an interjection (part-of-speech ``INTJ``) that is
    not a stop word and has a nominal subject child (``nsubj``) that is not a
    stop word either. The subject is the aspect (A) and the interjection is
    the sentiment modifier (M), as in "It's ok" when "ok" is tagged INTJ.

    Args:
        token (spacy.Token): The token to analyze.
        prod_pronouns (list, optional): Pronouns to be replaced with "product"
            when they are the aspect. Defaults to it/this/they/these.

    Returns:
        list: A one-element list with a dict holding the aspect ('noun'), the
        sentiment modifier ('adj') and the rule number (6), or an empty list
        when the rule does not apply.
    """
    if prod_pronouns is None:
        prod_pronouns = ("it", "this", "they", "these")

    # Only non-stop-word interjections are considered.
    if token.pos_ != "INTJ" or token.is_stop:
        return []

    aspect = None
    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text

    if aspect is None:
        return []

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": token.text, "rule": 6}]


In [None]:
def rule7(token, prod_pronouns=None):
    """
    Apply Rule 7: Extract aspect and its corresponding description from the review.

    **Complement Rule:**

    Among the children of ``token`` (a verb like 'be/seem/appear'), the
    nominal subject (``nsubj``) is the aspect (A) and the attribute
    (``attr``) is the sentiment modifier (M); stop words are ignored for
    both. A negation child (``neg``) adds its text as a prefix to M.
    For example, 'this is garbage' links the subject 'this' to the
    attribute 'garbage'.

    Args:
        token (spacy.Token): The token to analyze.
        prod_pronouns (list, optional): Pronouns to be replaced with "product"
            when they are the aspect. Defaults to it/this/they/these.

    Returns:
        list: A one-element list with a dict holding the aspect ('noun'), the
        sentiment modifier ('adj') and the rule number (7), or an empty list
        when no valid pair is found.
    """
    if prod_pronouns is None:
        prod_pronouns = ("it", "this", "they", "these")

    aspect = None
    modifier = None
    neg_prefix = None

    for child in token.children:
        if child.dep_ == "nsubj" and not child.is_stop:
            aspect = child.text

        if child.dep_ == "attr" and not child.is_stop:
            modifier = child.text

        if child.dep_ == "neg":
            neg_prefix = child.text

    if aspect is None or modifier is None:
        return []

    if neg_prefix is not None:
        modifier = f"{neg_prefix} {modifier}"

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": modifier, "rule": 7}]
