## 2 Setup

### 2.1 Import and configure libraries 

In [62]:
# Data manipulation libraries
import pandas as pd
import spacy
from spacy import displacy # is used for visualizing the dependency parse tree and named entity recognition (NER) annotations.

# General Imports
import time

# Data modeling libraries
from sklearn.model_selection import train_test_split

# text processing and cleaning
import re # This line imports the regular expression (regex) module, which provides functions for working with regular expressions. 
import nltk
from nltk.stem import WordNetLemmatizer #  is used for lemmatization, which is the process of reducing words to their base or root form: from running to run
from nltk.corpus import stopwords #The stopwords corpus from NLTK contains common words that are often removed from text during text preprocessing/ These words (like 'and', 'the', 'is', etc.) are considered as noise in many natural language processing tasks 

# Display in jupyter
# from IPython.core.display import display, HTML
# # Set the width of the output cell
# display(HTML("<style>.container { width:80% !important; }</style>"))


### 2.2 Constants and helper functions 

#### Constants

#### Helper functions

In [63]:
def load_dataset_from_json(json_file_path):
    """
    :param json_file_path (str) :Path to the JSON file.
    :return: pd.DataFrame: DataFrame containing the loaded data.
    """
    df = pd.read_json(json_file_path)
    return df


### 2.3 Set dataframe 

In [64]:
df = load_dataset_from_json("/kaggle/input/amazon-one-plus-reviews/amazon_one_plus_reviews.json")
df.head(3)

Unnamed: 0,product,product_company,profile_name,review_title,review_rating,review_text,helpful_count,total_comments,review_country,reviewed_at,url,crawled_at,_id,verified_purchase,color,style_name,size_name,category,sub_category,images
0,"OnePlus Nord 5G (Gray Onyx, 8GB RAM, 128GB Sto...",OnePlus,Nikhil,*Read before you buy!!*,5.0 out of 5 stars,"\n Yea..pre-ordered on 28 July, got it on 4 A...",721 people found this helpful,3,India,2020-08-04,https://www.amazon.in/product-reviews/B08695ZS...,2020-10-18 11:23:47,45ca015a-2e39-5650-a174-ba966dd1e51f,Verified Purchase,Marble Blue,8GB RAM + 128GB Storage,na,electronics,mobiles,[https://images-na.ssl-images-amazon.com/image...
1,"OnePlus Nord 5G (Gray Onyx, 8GB RAM, 128GB Sto...",OnePlus,Amit,Near to mid range Perfection,5.0 out of 5 stars,"\n Got it delivered yesterday , used for abou...",436 people found this helpful,1,India,2020-08-03,https://www.amazon.in/product-reviews/B08695ZS...,2020-10-18 11:23:47,6820f8ae-f3cd-5783-b826-5e5805376047,Verified Purchase,na,na,na,electronics,mobiles,[]
2,"OnePlus Nord 5G (Gray Onyx, 8GB RAM, 128GB Sto...",OnePlus,aishwarya,Great price!,5.0 out of 5 stars,\n An amazing phone!,322 people found this helpful,1,India,2020-08-04,https://www.amazon.in/product-reviews/B08695ZS...,2020-10-18 11:23:47,bea91c43-8d65-5ef5-ab04-595a5f5cc542,Verified Purchase,na,na,na,electronics,mobiles,[https://images-na.ssl-images-amazon.com/image...


In [65]:
# conda update --all

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30612 entries, 0 to 30611
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   product            30612 non-null  object        
 1   product_company    30612 non-null  object        
 2   profile_name       30612 non-null  object        
 3   review_title       30612 non-null  object        
 4   review_rating      30612 non-null  object        
 5   review_text        30612 non-null  object        
 6   helpful_count      30612 non-null  object        
 7   total_comments     30612 non-null  int64         
 8   review_country     30612 non-null  object        
 9   reviewed_at        30612 non-null  datetime64[ns]
 10  url                30612 non-null  object        
 11  crawled_at         30612 non-null  datetime64[ns]
 12  _id                30612 non-null  object        
 13  verified_purchase  30612 non-null  object        
 14  color 

In [67]:
df.describe(include='all')

Unnamed: 0,product,product_company,profile_name,review_title,review_rating,review_text,helpful_count,total_comments,review_country,reviewed_at,url,crawled_at,_id,verified_purchase,color,style_name,size_name,category,sub_category,images
count,30612,30612,30612,30612,30612,30612,30612.0,30612.0,30612,30612,30612,30612,30612,30612,30612,30612,30612,30612,30612,30612
unique,3,2,23355,17655,5,25010,126.0,,4,,3067,,24764,2,11,6,2,1,1,4390
top,"Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Stor...",Redmi,Amazon Customer,Good,5.0 out of 5 stars,\n Good\n,0.0,,India,,https://www.amazon.in/product-reviews/B08695ZS...,,a9082ff4-03d5-52d9-ae5e-72c772c6e892,Verified Purchase,na,na,na,electronics,mobiles,[]
freq,13934,21143,2040,1621,13967,1072,26507.0,,30609,,10,,1083,29999,9931,20930,19613,30612,30612,26170
mean,,,,,,,,0.072325,,2020-05-30 02:35:19.639357184,,2020-10-18 15:01:52.123643904,,,,,,,,
min,,,,,,,,0.0,,2019-11-06 00:00:00,,2020-10-18 11:23:47,,,,,,,,
25%,,,,,,,,0.0,,2020-03-02 00:00:00,,2020-10-18 11:37:10,,,,,,,,
50%,,,,,,,,0.0,,2020-07-02 00:00:00,,2020-10-18 16:13:53,,,,,,,,
75%,,,,,,,,0.0,,2020-08-29 00:00:00,,2020-10-18 16:26:17,,,,,,,,
max,,,,,,,,24.0,,2020-10-18 00:00:00,,2020-10-18 17:38:56,,,,,,,,


In [68]:
# The dataset from worlbank contains information about 3 types of produc in our application we need for startjust one product
df['product'].value_counts()

product
Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage)                                                                              13934
OnePlus Nord 5G (Gray Onyx, 8GB RAM, 128GB Storage)                                                                              9469
Redmi Note 9 Pro (Aurora Blue, 4GB RAM, 64GB Storage) - Latest 8nm Snapdragon 720G & Alexa Hands-Free | 6 Months No Cost EMI     7209
Name: count, dtype: int64

In [69]:
mask = df['product'] == 'Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage)'
df = df[mask].reset_index(drop=True)
df = df[['reviewed_at', 'review_text', 'review_title']]
df = df.rename(columns = {'review_title' : 'Summary', 'review_text' : 'Review', 'reviewed_at' : 'Date'})

In [70]:
# Format the date as "30 August 2021"
df['Date'] = df['Date'].apply(lambda x: x.strftime('%d %B %Y') if not pd.isnull(x) else '')

### 3 EDA

##### For this analyse I will use the product named Redmi Note 8 (Neptune Blue, 4GB RAM, 64GB Storage) 13934


In [71]:
df.head(3)

Unnamed: 0,Date,Review,Summary
0,06 November 2019,\n Febulas performance Redmi Note 8 ...I love...,Superb ...😘
1,06 November 2019,\n Loving the phone....Purchased with bank di...,Worth the price....A must buy for everyone wit...
2,10 November 2019,\n best mobile under 10000\n,moonlight is love


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13934 entries, 0 to 13933
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     13934 non-null  object
 1   Review   13934 non-null  object
 2   Summary  13934 non-null  object
dtypes: object(3)
memory usage: 326.7+ KB


### 4 Training Setup

### 4.1 Train, Test split

##### Train, Test split

In [73]:
X = df.drop(["Summary"], axis =1 )
X_train, X_test = train_test_split(X , test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

((11147, 2), (2787, 2))

##### Sentence Tokenization


In [74]:
def split_review_custom_delimiters(text):
    """
    This function splits the review into multiple sentences based on custom delimiters.
    """
    delimiters = ".", "but", "and", "also"
    escaped_delimiters = map(re.escape, delimiters) # Result: ['\\.', 'but', 'and', 'also']
    regex_pattern = '|'.join(escaped_delimiters) # Applying the custom delimiters # Result: '\\.|but|and|also'
    splitted = re.split(regex_pattern, text) # Splitting the review function from the re module to split the input text into a list of substrings based on the specified regular expression pattern.
    return[sentence.strip() for sentence in splitted if sentence.strip()] #this line ensures that only non-empty sentences (after stripping whitespaces) are included in the final result.  sentence.strip(): Strips any leading or trailing whitespaces from the sentence.

##### Data cleaning

In [75]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [76]:
lemma = WordNetLemmatizer()
all_stopwords = set(stopwords.words('english'))

custom_stopwords = ['not', 'but', 'because', 'against', 'between', 'up', 'down', 'in', 'out', 'once', 'before','after', 'few', 'more', 'most', 'no', 'nor', 'same', 'some']

for stopword in custom_stopwords:
    all_stopwords.remove(stopword)

def clean_aspect_spacy(reviews):
    """
    this function prepares text for analysis by cleaning it, making it more consistent, 
    and removing elements that may not carry substantial meaning for downstream tasks in natural language processing as punctuations, stopwords, and other non-alphanumeric characters.
    It expands contractions and replaces some words with an empty string.
    """
    text = reviews.lower()
    
    contractions = {
        "won't": "will not",
        "cannot": "can not",
        "can't": "can not",
        "n't": " not",
        "what's": "what is",
        "it's": "it is",
        "'ve": " have",
        "i'm": "i am",
        "'re": " are",
        "he's": "he is",
        "she's": "she is",
        "*****": " ",        
    }
    
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
        
    # Remove special characters, numbers, and extra spaces.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(' +', ' ', text)
    
    # Lemmatization and removing stopwords
    words = text.split()
    cleaned_words = [lemma.lemmatize(word) for word in words if word not in set(all_stopwords)]
    
    # Join the cleaned words back into a sentence
    cleaned_text = ' '.join(cleaned_words) 
    
    return cleaned_text

##### Reconstruct the DataFrame

In [77]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [85]:
# pip install --upgrade nltk

In [79]:
def split_and_clean_reviews(df):
    """
    This function takes a DataFrame 'df' with 'Review' and 'Date' columns, splits each review into smaller components,
    filters out components with fewer than three words, and applies a text cleaning function to each split and cleaned review.
    
    Parameters:
    - df (DataFrame): Input DataFrame with 'Review' and 'Date' columns.

    Returns:
    DataFrame: A new DataFrame with 'Date' and 'Review' columns, where each review has been split, filtered, and cleaned.
    """
    reviews = []
    dates = []
    
    for i, review_text in enumerate(df["Review"].values):
        review_split = split_review_custom_delimiters(review_text)
        
        # Filter out components with fewer than three words
        review_split_filtered = [split for split in review_split if len(split.split()) >= 3]
    
        # Duplicate dates as string for the corresponding split reviews
        duplicate_dates = [str(df["Date"].values[i]) for _ in range(len(review_split_filtered))]

        # Extend the lists with split and duplicated reviews and dates
        reviews.extend(review_split_filtered)
        dates.extend(duplicate_dates)

    # Apply the text cleaning function to each split and cleaned review    
    cleaned_reviews = [clean_aspect_spacy(text) for text in reviews]
    
    # Create a new DataFrame with 'Date' and 'Review' columns
    data = pd.DataFrame({"Date": dates, "Review": cleaned_reviews})
    
    return data

In [80]:
# pip install --upgrade nltk

In [81]:
# Train set

start_time = time.time()
train_data = split_and_clean_reviews(X_train)
elapsed_time = time.time() - start_time

print(f"The time difference is: {elapsed_time} seconds")

The time difference is: 1.4473881721496582 seconds


In [82]:
# Test set

start_time = time.time()
test_data = split_and_clean_reviews(X_test)
elapsed_time = time.time() - start_time

print(f"The time difference is: {elapsed_time} seconds")

The time difference is: 0.3462095260620117 seconds


In [83]:
train_data.head(3)

Unnamed: 0,Date,Review
0,07 January 2020,casual user cell phone note best even camers q...
1,07 January 2020,price point give u technology in single h
2,07 January 2020,according casual need face unlock work well in...


In [84]:
test_data.head(3)

Unnamed: 0,Date,Review
0,09 March 2020,camera quality not good
1,25 December 2019,great mobile affordable price
2,25 December 2019,highly recommending buy without thinking twice


##### Aspect extraction -  process of identifying and extracting specific aspects, features, or attributes from textual data

In [None]:
def apply_extraction(row, nlp):
    """
    This function extracts aspect and its corresponding description from the review by 
    applying 7 different rules of POS tagging.
    """
    prod_pronouns = ['it', 'this', 'they', 'these']
    review_body = row['Review']
    doc = nlp(review_body)
    
    aspect_pairs = []
    
    for token in doc:
        # Rule 1
        aspect_pairs.extend(rule1(token, prod_pronouns))
        
        # Rule 2
        aspect_pairs.extend(rule2(token, prod_pronouns))
        
        # Rule 3
        aspect_pairs.extend(rule3(token, prod_pronouns))
        
        # Rule 4
        aspect_pairs.extend(rule4(token, prod_pronouns))
        
        # Rule 5
        aspect_pairs.extend(rule5(token, prod_pronouns))
        
        # Rule 6
        aspect_pairs.extend(rule6(token, prod_pronouns))
        
        # Rule 7
        aspect_pairs.extend(rule7(token, prod_pronouns))
        
    return {"aspect_pairs": aspect_pairs}

In [None]:
def rule1(token, prod_pronouns):
    """
        Apply Rule 1: Extract aspect and its corresponding description from the review.
    """
    A, M = "999999", "999999" # A - the aspect or feature being described in the sentence, in the phrase "sound quality," "sound" would be the aspect. M - Sentiment Modifier In the phrase "good sound quality," "good" would be the sentiment modifier.
    
    77
    