<a href="https://www.kaggle.com/code/diaconumadalina/summarization-reviews?scriptVersionId=158474899" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## 2 Setup

### 2.1 Import and configure libraries 

In [None]:
# Data manipulation libraries
import pandas as pd
import spacy
from spacy import displacy # is used for visualizing the dependency parse tree and named entity recognition (NER) annotations.

# General Imports
import time

# Data modeling libraries
from sklearn.model_selection import train_test_split

# text processing and cleaning
import re # This line imports the regular expression (regex) module, which provides functions for working with regular expressions. 
import nltk
from nltk.stem import WordNetLemmatizer #  is used for lemmatization, which is the process of reducing words to their base or root form: from running to run
from nltk.corpus import stopwords #The stopwords corpus from NLTK contains common words that are often removed from text during text preprocessing/ These words (like 'and', 'the', 'is', etc.) are considered as noise in many natural language processing tasks 

# Display in jupyter
# from IPython.core.display import display, HTML
# # Set the width of the output cell
# display(HTML("<style>.container { width:80% !important; }</style>"))


### 2.2 Constants and helper functions 

#### Constants

#### Helper functions

In [None]:
def load_dataset_from_json(json_file_path):
    """
    Read a JSON file into a pandas DataFrame.

    :param json_file_path (str): Path to the JSON file.
    :return: pd.DataFrame: DataFrame built by ``pd.read_json`` from that file.
    """
    return pd.read_json(json_file_path)


### 2.3 Set dataframe 

In [None]:
# Load the raw reviews JSON from the Kaggle input directory into `df`,
# then preview the first three rows.
df = load_dataset_from_json("/kaggle/input/amazon-one-plus-reviews/amazon_one_plus_reviews.json")
df.head(3)

In [None]:
# conda update --all

In [None]:
# Print the column names, dtypes and non-null counts of the raw dataset.
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike).
df.describe(include='all')

In [None]:
# The dataset (source noted by the author as World Bank; TODO confirm) contains
# information about 3 types of product. To start with, our application needs just
# one product, so count the rows available for each product.
df['product'].value_counts()

In [None]:
# Keep only the rows for the single product analysed in this notebook.
mask = df['product'] == 'Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage)'
# Drop the old index so the filtered frame is numbered from 0 again.
df = df[mask].reset_index(drop=True)
# Only the review date, body and title are needed from here on.
df = df[['reviewed_at', 'review_text', 'review_title']]
# Use short, readable column names: Date / Review / Summary.
df = df.rename(columns = {'review_title' : 'Summary', 'review_text' : 'Review', 'reviewed_at' : 'Date'})

In [None]:
# Format the date as "30 August 2021"; missing dates become an empty string.
# NOTE(review): assumes non-null 'Date' values are datetime-like (expose .strftime) — confirm.
df['Date'] = df['Date'].apply(lambda x: x.strftime('%d %B %Y') if not pd.isnull(x) else '')

### 3 EDA

##### For this analysis I will use the product named Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage) 13934


In [None]:
# Preview the first three rows of the filtered, renamed frame.
df.head(3)

In [None]:
# Column dtypes and non-null counts after filtering to a single product.
df.info()

### 4 Training Setup

### 4.1 Train, Test split

##### Train, Test split

In [None]:
# The 'Summary' column is dropped; only 'Date' and 'Review' are split.
X = df.drop(["Summary"], axis =1 )
# 80/20 split; random_state is fixed so the split is reproducible.
X_train, X_test = train_test_split(X , test_size = 0.2, random_state = 0)
# Display the (rows, columns) shape of each partition.
X_train.shape, X_test.shape

##### Sentence Tokenization


In [3]:
def split_review_custom_delimiters(text):
    """
    Split a review into sentence-like fragments on custom delimiters.

    The delimiters are a full stop and the connectives "but", "and" and "also".
    Word delimiters are matched as whole words only (using ``\\b`` word
    boundaries), so words that merely contain them, such as "android",
    "brand" or "button", are left intact.

    Args:
        text (str): The input text to be split.
    Returns:
        list: The non-empty fragments, each stripped of surrounding whitespace.
    """
    delimiters = (".", "but", "and", "also")

    alternatives = []
    for delimiter in delimiters:
        escaped = re.escape(delimiter)
        # A bare 'and' would also match inside 'android'; anchor alphabetic
        # delimiters on word boundaries. Punctuation is matched as-is.
        if delimiter.isalpha():
            escaped = rf"\b{escaped}\b"
        alternatives.append(escaped)
    regex_pattern = "|".join(alternatives)  # '\.|\bbut\b|\band\b|\balso\b'

    fragments = (fragment.strip() for fragment in re.split(regex_pattern, text))
    # Adjacent delimiters ("and also") produce empty fragments; drop them.
    return [fragment for fragment in fragments if fragment]

##### Data cleaning

In [None]:
# Fetch the NLTK stopwords corpus used by stopwords.words('english') below.
nltk.download('stopwords')

In [None]:
# Shared lemmatizer and stopword set, read as globals by clean_aspect_spacy.
lemma = WordNetLemmatizer()
all_stopwords = set(stopwords.words('english'))

# Despite the name, these are words to KEEP in the text: they are taken out of
# the stopword set below so the cleaning step does not delete them.
custom_stopwords = ['not', 'but', 'because', 'against', 'between', 'up', 'down', 'in', 'out', 'once', 'before','after', 'few', 'more', 'most', 'no', 'nor', 'same', 'some']

# set.remove raises KeyError if a word is not in the stopword set.
for stopword in custom_stopwords:
    all_stopwords.remove(stopword)

def clean_aspect_spacy(reviews):
    """
    Clean a piece of review text for downstream NLP processing.

    Steps, in order: lower-case the text, expand common English contractions,
    replace every non-letter character with a space, drop stopwords and
    lemmatize the remaining words.

    This function reads two module-level names rather than taking them as
    arguments: ``lemma`` (the WordNetLemmatizer instance) and ``all_stopwords``
    (the collection of words to drop).

    Args:
        reviews (str): The text to be cleaned.

    Returns:
        str: The cleaned words joined by single spaces.
    """
    text = reviews.lower()

    # Reviews often contain the typographic apostrophe (’); normalise it so the
    # contraction table below applies and negations such as "don’t" survive.
    text = text.replace("\u2019", "'")

    # Insertion order matters: whole-word forms ("won't", "can't") must be
    # replaced before the generic "n't" suffix rule.
    contractions = {
        "won't": "will not",
        "cannot": "can not",
        "can't": "can not",
        "n't": " not",
        "what's": "what is",
        "it's": "it is",
        "'ve": " have",
        "i'm": "i am",
        "'re": " are",
        "he's": "he is",
        "she's": "she is",
        "*****": " ",
    }

    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)

    # Replace special characters and digits with spaces. Runs of whitespace
    # need no separate collapsing because str.split() already treats any
    # whitespace run as a single separator.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Stopword removal and lemmatization; `all_stopwords` is used directly
    # instead of being copied into a new set for every word.
    cleaned_words = [lemma.lemmatize(word) for word in text.split() if word not in all_stopwords]

    # Join the cleaned words back into a sentence
    return ' '.join(cleaned_words)

##### Reconstruct the DataFrame

In [None]:
# Fetch the WordNet data (and the Open Multilingual WordNet package) for WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# pip install --upgrade nltk

In [None]:
def split_and_clean_reviews(df):
    """
    Break every review into fragments, keep the fragments with at least three
    words, clean them, and return them alongside the date of their source review.

    Parameters:
    - df (DataFrame): Input DataFrame with 'Review' and 'Date' columns.

    Returns:
    DataFrame: A new DataFrame with 'Date' and 'Review' columns, one row per
    retained fragment. 'Date' holds the source review's date as a string and
    'Review' holds the cleaned fragment.
    """
    fragments = []
    fragment_dates = []

    for row_idx, raw_review in enumerate(df["Review"].values):
        # Fragments shorter than three words are discarded.
        kept = [
            piece
            for piece in split_review_custom_delimiters(raw_review)
            if len(piece.split()) >= 3
        ]
        if not kept:
            continue

        # Every fragment inherits the (stringified) date of its review.
        stamp = str(df["Date"].values[row_idx])
        fragments.extend(kept)
        fragment_dates.extend([stamp] * len(kept))

    # Cleaning happens after splitting, so the delimiters are still present
    # when the reviews are split.
    cleaned = list(map(clean_aspect_spacy, fragments))

    return pd.DataFrame({"Date": fragment_dates, "Review": cleaned})

In [None]:
# pip install --upgrade nltk

In [None]:
# Train set

# Split and clean the training reviews, timing how long the preprocessing takes.
start_time = time.time()
train_data = split_and_clean_reviews(X_train)
elapsed_time = time.time() - start_time

print(f"The time difference is: {elapsed_time} seconds")

In [None]:
# Test set

# Apply the same split-and-clean preprocessing to the test reviews, timing it as well.
start_time = time.time()
test_data = split_and_clean_reviews(X_test)
elapsed_time = time.time() - start_time

print(f"The time difference is: {elapsed_time} seconds")

In [None]:
# Preview the first three cleaned training fragments.
train_data.head(3)

In [None]:
# Preview the first three cleaned test fragments.
test_data.head(3)

##### Aspect extraction -  process of identifying and extracting specific aspects, features, or attributes from textual data

In [None]:
def apply_extraction(row, nlp):
    """
    Collect aspect/description pairs from one review.

    The review text is parsed with ``nlp`` and each of the seven extraction
    rules (rule1 ... rule7) is applied, in order, to every token of the parse.

    Args:
        row (pd.Series): A row from a DataFrame containing the 'Review' column.
        nlp (spacy.Language): The spaCy NLP pipeline.

    Returns:
        dict: ``{"aspect_pairs": [...]}``, holding the pairs produced by all
        rules in token order and, for each token, in rule order.
    """
    # Pronouns that the rules treat as referring to the product itself.
    prod_pronouns = ['it', 'this', 'they', 'these']
    parsed = nlp(row['Review'])

    aspect_pairs = [
        pair
        for token in parsed
        for rule in (rule1, rule2, rule3, rule4, rule5, rule6, rule7)
        for pair in rule(token, prod_pronouns)
    ]

    return {"aspect_pairs": aspect_pairs}

## The rules below extract aspects (A) and sentiment modifiers (M) based on the specified dependency relationships

**Child**: In the syntactic structure of a sentence, a "child" refers to a word that is directly connected to another word (the "parent") in a hierarchical tree structure. The relationship between a child and its parent is defined by a specific dependency label, such as nsubj (nominal subject), acomp (adjectival complement), aux (auxiliary), etc. Children are elements that depend on another word to form a grammatically correct sentence.

The term "**head of a token**" in the context of dependency parsing refers to the word that governs the grammatical relationship with the token in question. Every word in a sentence has a syntactic relationship with another word, and the head of a token is the word that determines this relationship.

**A. For the sentence "The cat chased the mouse," let's break down the relationship between each word and its children and head in a simple dependency structure:**

1. **The:**
   - Child: None (it has no dependents)
   - Head: cat

2. **cat:**
   - Child: The (det)
   - Head: chased

3. **chased:**
   - Children: cat (nsubj), mouse (dobj)
   - Head: None (it is the root of the dependency tree)

4. **the:**
   - Child: None (it has no dependents)
   - Head: mouse

5. **mouse:**
   - Child: the (det)
   - Head: chased

So, in summary:

- "The" has "cat" as its head.
- "Cat" has "The" as its child and "chased" as its head.
- "Chased" has "cat" and "mouse" as its children and has no head (it is the root).
- "The" has "mouse" as its head.
- "Mouse" has "the" as its child and "chased" as its head.

In this way, each token has relationships with other tokens, and the head of a token is the word that governs the grammatical relationship with that token. The children of a token are the words that depend on it in the sentence structure.


**B. In the next example, each token is processed, and we list its children, its head, and the children of its head.**

To provide examples of `token.children`, `token.head`, and `token.head.children`, I'll use the sentence "The cat chased the mouse" and assume that the sentence has been parsed into a dependency tree. Let's use a simple example where "chased" is the root of the dependency tree.

```plaintext
The (det) cat (nsubj) chased (root) the (det) mouse (dobj)
```

In this example:
- `token` refers to each word in the sentence.
- `token.children` refers to the immediate dependents of the token.
- `token.head` refers to the token's syntactic parent in the dependency tree.
- `token.head.children` refers to the immediate dependents of the token's syntactic parent.

Now, let's go through the examples:

1. For the word "cat":
   - `token.children`: The (det)
   - `token.head`: chased
   - `token.head.children`: cat (nsubj), mouse (dobj)

2. For the word "chased":
   - `token.children`: cat (nsubj), mouse (dobj)
   - `token.head`: chased (it is the root of the dependency tree; in spaCy the root token is its own head)
   - `token.head.children`: cat (nsubj), mouse (dobj)

3. For the word "mouse":
   - `token.children`: the (det)
   - `token.head`: chased
   - `token.head.children`: cat (nsubj), mouse (dobj)


In [None]:
def rule1(token, prod_pronouns):
    """
    Apply Rule 1: adjectival modifier.

    A - the aspect being described. In "good sound quality" the aspect is the noun the adjective attaches to.
    M - the sentiment modifier. In "good sound quality," "good" is the sentiment modifier.

    The rule fires only when the token's dependency relation is "amod" and the token is not a stop word.
    M starts as the token's text and A is the text of the token's head.
    Any "advmod" children of the token are added to M, and M is prefixed with "not" when the head
    has a determiner child whose text is 'no' (e.g. "no interesting characters").

    If A is one of the given pronouns it is reported as "product".

    Args:
        token (spacy.Token): The input token.
        prod_pronouns (list): List of pronouns to be replaced with "product."

    Returns:
        list: A one-element list holding a dict with keys "noun", "adj" and "rule" (1),
        or an empty list when the rule does not apply.
    """
    # Guard clause: only non-stop adjectival modifiers are of interest.
    if token.dep_ != "amod" or token.is_stop:
        return []

    aspect = token.head.text
    modifier = token.text

    # Adverbial modifiers of the adjective (e.g. 'most comfortable headphones').
    # NOTE(review): the adverbs are appended AFTER the adjective ("comfortable most"),
    # not before it — confirm this order is what downstream code expects.
    adverbs = [dependent.text for dependent in token.children if dependent.dep_ == "advmod"]
    if adverbs:
        modifier = " ".join([modifier, *adverbs])

    # Negation through the determiner 'no' on the described noun.
    if any(sibling.dep_ == "det" and sibling.text == 'no' for sibling in token.head.children):
        modifier = f"not {modifier}"

    # "999999" is the placeholder value meaning "nothing found"; a pair is never
    # reported when either side equals it.
    if "999999" in (aspect, modifier):
        return []

    if aspect in prod_pronouns:
        aspect = "product"
    return [{"noun": aspect, "adj": modifier, "rule": 1}]

In [None]:
def rule2(token, prod_pronouns=()):
    """
    Apply Rule 2: Extract aspect and its corresponding description from the review.

    Rule 2 looks at the children of ``token`` (typically a verb):

    - A, the aspect, is a child with dependency "nsubj" that is not a stop word.
    - M, the sentiment modifier, is a child with dependency "dobj" whose
      part-of-speech is ADJ and that is not a stop word.
    - If a child with dependency "neg" is present, its text is prefixed to M.

    All children are inspected before a result is produced; when several
    children match the same role, the last one wins.

    Args:
        token (spacy.Token): The input token.
        prod_pronouns (collection of str): Pronouns to be replaced with "product".
            Optional so that one-argument calls keep working; the default
            replaces nothing.

    Returns:
        list: A one-element list holding a dict with keys "noun", "adj" and
        "rule" (2), or an empty list when no valid pair is found. The result
        is always a list, so it is safe to pass to ``list.extend``.
    """
    A, M = "999999", "999999"  # placeholder meaning "not found yet"
    add_neg_pfx = False
    neg_prefix = ""

    # Inspect every child first; the decision is made only after the loop.
    for child in token.children:
        # Nominal subject (nsubj) that is not a stop word -> aspect
        if child.dep_ == "nsubj" and not child.is_stop:
            A = child.text

        # Adjectival direct object (dobj + ADJ) that is not a stop word -> modifier
        if child.dep_ == "dobj" and child.pos_ == "ADJ" and not child.is_stop:
            M = child.text

        # Negation word attached to the same token
        if child.dep_ == "neg":
            neg_prefix = child.text
            add_neg_pfx = True

    # If negation is present, add the negation prefix to the sentiment modifier (M)
    if add_neg_pfx and M != "999999":
        M = f"{neg_prefix} {M}"

    # If both aspect (A) and sentiment modifier (M) are valid, format the result
    if A != "999999" and M != "999999":
        # If aspect is one of the specified pronouns, replace it with "product"
        if A in prod_pronouns:
            A = "product"
        return [{"noun": A, "adj": M, "rule": 2}]

    # If no valid aspect-sentiment pair is found, return an empty list
    return []

In [None]:
def rule3(token, prod_pronouns):
    """
    Apply Rule 3: adjectival complement.

    Among the children of ``token``, A is a child with relation nsubj and M is a
    child with relation acomp (stop words are skipped for both). M gets a
    prefix when the token also has a negation child (the negation's own text)
    or a modal auxiliary child, i.e. aux tagged "MD" (the word "not"), e.g.
    'this could have been better' -> (this, not better).

    Args:
        token (spacy.tokens.Token): The token to analyze.
        prod_pronouns (list): List of product pronouns.

    Returns:
        list: A one-element list holding a dict with keys "noun", "adj" and
        "rule" (3), or an empty list when no valid pair is found.
    """
    unset = "999999"  # placeholder meaning "not found"
    aspect = modifier = unset
    prefix = None

    # The four relations are mutually exclusive, so one elif chain per child
    # suffices; later matches overwrite earlier ones.
    for dependent in token.children:
        relation = dependent.dep_
        if relation == "nsubj":
            if not dependent.is_stop:
                aspect = dependent.text
        elif relation == "acomp":
            if not dependent.is_stop:
                modifier = dependent.text
        elif relation == "neg":
            prefix = dependent.text
        elif relation == "aux" and dependent.tag_ == "MD":
            # Modal auxiliaries ("can," "could," "will," "would," "shall,"
            # "should," "may," "might," "must") are treated like a negation.
            prefix = "not"

    if prefix is not None and modifier != unset:
        modifier = f"{prefix} {modifier}"

    if unset in (aspect, modifier):
        return []

    noun = "product" if aspect in prod_pronouns else aspect
    return [{"noun": noun, "adj": modifier, "rule": 3}]
