In [1]:
import os
import json
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from textstat import flesch_reading_ease
import nltk
from nltk.util import pairwise 
from vaderSentiment.vaderSentiment import NEGATE, BOOSTER_DICT
import re
import nltk.data
import editdistance

In [2]:
# Set Working directory
os.chdir('C:/Users/asus/Documents/GitHub/CMSC-197-Miniproject')

# Load The JSON file
with open('data/amazon_data.json', 'r') as f:
    reviews = json.load(f)

# DataFrame
reviews_df = pd.DataFrame(reviews)
reviews_df.head()

Unnamed: 0,Label,Review_Text,no_contract,Review_Text_str,tag_removed,number_removed,tokenized,lower,no_spec_char,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,0,"useful When least you think so, this product w...","[useful, When, least, you, think, so,, this, p...","useful When least you think so, this product w...","useful When least you think so, this product w...","useful When least you think so, this product w...","[useful, When, least, you, think, so, ,, this,...","[useful, when, least, you, think, so, ,, this,...","[useful, when, least, you, think, so, this, pr...","[useful, least, think, product, save, day, kee...","[[useful, JJ], [least, JJS], [think, JJ], [pro...","[[useful, a], [least, a], [think, a], [product...","[useful, least, think, product, save, day, kee..."
1,0,New era for batteries Lithium batteries are so...,"[New, era, for, batteries, Lithium, batteries,...",New era for batteries Lithium batteries are so...,New era for batteries Lithium batteries are so...,New era for batteries Lithium batteries are so...,"[New, era, for, batteries, Lithium, batteries,...","[new, era, for, batteries, lithium, batteries,...","[new, era, for, lithium, are, something, new, ...","[new, era, lithium, something, new, market, av...","[[new, JJ], [era, NN], [lithium, NN], [somethi...","[[new, a], [era, n], [lithium, n], [something,...","[new, era, lithium, something, new, market, av..."
2,0,doesn't swing very well. I purchased this swin...,"[does not, swing, very, well., I, purchased, t...",does not swing very well. I purchased this swi...,does not swing very well. I purchased this swi...,does not swing very well. I purchased this swi...,"[does, not, swing, very, well, ., I, purchased...","[does, not, swing, very, well, ., i, purchased...","[does, not, swing, very, well, i, this, swing,...","[swing, well, swing, baby, pretty, much, grown...","[[swing, VBG], [well, RB], [swing, VBG], [baby...","[[swing, v], [well, r], [swing, v], [baby, n],...","[swing, well, swing, baby, pretty, much, grow,..."
3,0,Great computing! I was looking for an inexpens...,"[Great, computing!, I, was, looking, for, an, ...",Great computing! I was looking for an inexpens...,Great computing! I was looking for an inexpens...,Great computing! I was looking for an inexpens...,"[Great, computing, !, I, was, looking, for, an...","[great, computing, !, i, was, looking, for, an...","[great, i, was, looking, for, an, inexpensive,...","[great, looking, inexpensive, desk, works, eve...","[[great, JJ], [looking, VBG], [inexpensive, JJ...","[[great, a], [looking, v], [inexpensive, a], [...","[great, look, inexpensive, desk, work, everyth..."
4,0,Only use twice a week I only use it twice a we...,"[Only, use, twice, a, week, I, only, use, it, ...",Only use twice a week I only use it twice a we...,Only use twice a week I only use it twice a we...,Only use twice a week I only use it twice a we...,"[Only, use, twice, a, week, I, only, use, it, ...","[only, use, twice, a, week, i, only, use, it, ...","[only, use, twice, a, week, i, only, use, it, ...","[use, twice, week, use, twice, week, great, us...","[[use, NN], [twice, RB], [week, NN], [use, NN]...","[[use, n], [twice, r], [week, n], [use, n], [t...","[use, twice, week, use, twice, week, great, us..."


In [3]:
# Quantity
result_quantity = []

for review in reviews:
    text = review['Review_Text']
    # Number of words
    words = text.split()
    num_words = len(words)
    # Number of sentences
    sentences = text.split('.')
    num_sentences = len(sentences)
    # Number of caps
    num_caps = sum(1 for c in text if c.isupper())
    # Number of punctuation
    num_punctuation = sum(text.count(p) for p in string.punctuation)
     # Part of speech
    pos_tags = nltk.pos_tag(words)
    noun_count = len([word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']])
    verb_count = len([word for word, pos in pos_tags if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']])
    adj_count = len([word for word, pos in pos_tags if pos in ['JJ', 'JJR', 'JJS']])
    adv_count = len([word for word, pos in pos_tags if pos in ['RB', 'RBR', 'RBS']])


    # Linguistic features results
    result_quantity.append({
        'Number_of_words': num_words,
        'Number_of_sentences': num_sentences,
        'Number_of_caps': num_caps,
        'Number_of_punctuation': num_punctuation,
        'Number_of_nouns': noun_count,
        'Number_of_verbs': verb_count,
        'Number_of_adjectives': adj_count,
        'Number_of_adverbs': adv_count
    })

# Result dataframe
VADER_quantity_df = pd.DataFrame(result_quantity)

# Dataframe to Json file
VADER_quantity_df.to_json('data/VADER_quantity.json', orient='records')

In [4]:
# Complexity

# Redundancy function
def calculate_redundancy(text):
    words = text.split()
    
    # Sum of Levenshtein distances between all pairs of words
    total_distance = sum(editdistance.eval(w1, w2) for i, w1 in enumerate(words) for j, w2 in enumerate(words) if i < j)
    
    # Average Levenshtein distance
    n = len(words)
    if n > 1:
        average_distance = total_distance / (n * (n - 1) / 2)
    else:
        average_distance = 0
    
    # Return the redundancy
    return 1 - average_distance / len(max(words, key=len))

results_complexity = []

for review in reviews:
    text = review['Review_Text']
    # Number of words
    words = text.split()
    num_words = len(words)
    # Number of sentences
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    # Average word length
    total_word_length = sum(len(word) for word in words)
    avg_word_length = total_word_length / num_words
    # Average sentence length
    total_sentence_length = sum(len(sent) for sent in sentences)
    avg_sentence_length = total_sentence_length / num_sentences
    # Redundance score
    redundancy = calculate_redundancy(text)
    # Readability score
    readability_score = flesch_reading_ease(text)

    # linguistic features results
    results_complexity.append({
        'Average_word_length': avg_word_length,
        'Average_sentence_length': avg_sentence_length,
        'Redundancy_score': redundancy,
        'Readability_score': readability_score,
})

# Result dataframe
VADER_complexity_df = pd.DataFrame(results_complexity)

# Dataframe to Json file
VADER_complexity_df.to_json('data/VADER_complexity.json', orient='records')

In [5]:
# Diversity
results_diversity = []

for review in reviews:
    text = review['lemmatized']
    words = text
    num_words = len(words)
    # Lexical diversity
    unique_words = set(words)
    lexical_diversity = len(unique_words) / num_words

    # Linguistic features results
    results_diversity.append({
        'Lexical_diversity': lexical_diversity
    })


# Results dataframe
VADER_diversity_df = pd.DataFrame(results_diversity)

# Dataframe to Json file
VADER_diversity_df.to_json('data/VADER_diversity.json', orient='records')

In [6]:
# Emotion

# VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Sentiment score function
def get_sentiment_scores(text):
    scores = sia.polarity_scores(text)
    return scores['compound'], scores['pos'], scores['neg'], scores['neu']

# Polarity function
def get_polarity_categories(text):
    words = text.split()
    polarities = {'pos': 0, 'neg': 0, 'neu': 0}
    for word in words:
        scores = sia.polarity_scores(word)
        for key in polarities.keys():
            if scores[key] > 0:
                polarities[key] += 1
    return polarities

# Polarity shifters function
def count_polarity_shifters(text):
    shifters = ['but', 'however', 'although', 'yet', 'nevertheless']
    count = 0
    for word in text:
        if word.lower() in shifters:
            count += 1
    return count

# Intensity modifiers function
intensity_modifier_words = BOOSTER_DICT
def count_intensity_modifiers(text):
    modifiers = intensity_modifier_words
    count = 0
    for word in text:
        if word.lower() in modifiers:
            count += 1
    return count

# Negations function
negation_words = NEGATE
def count_negations(text):
    negations = negation_words 
    count = 0
    for word in text:
        if word.lower() in negations:
            count += 1
    return count

def count_emoticons(text):
    sid = SentimentIntensityAnalyzer()
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    scores = sid.polarity_scores(text)
    num_emoticons = len(emoticons)
    return num_emoticons

# Dataframe
columns = ['Sentiment_score', 'Positive_score', 'Negative_score', 'Neutral_score',            
           'Positive_words', 'Negative_words', 'Neutral_words',           
           'Polarity_shifters', 'Intensity_modifiers', 'Negations', 'Emoticons']
VADER_emotion_df = pd.DataFrame(columns=columns)

# Extract features
for index, row in reviews_df.iterrows():
    text = row['lemmatized']
    text2 = row['Review_Text']
    sentiment_score, positive_score, negative_score, neutral_score = get_sentiment_scores(text2)
    polarities = get_polarity_categories(text2)
    polarity_shifters = count_polarity_shifters(text)
    intensity_modifiers = count_intensity_modifiers(text)
    negations = count_negations(text)
    emoticons = count_emoticons(text2)
    row_results = [sentiment_score, positive_score, negative_score, neutral_score, 
                   polarities['pos'], polarities['neg'], polarities['neu'],
                   polarity_shifters, intensity_modifiers, negations, emoticons]
    VADER_emotion_df.loc[index] = row_results

# Dataframe to Json file
VADER_emotion_df.to_json('data/VADER_emotion.json', orient='records')

In [7]:
# Set the working directory
os.chdir('C:/Users/asus/Documents/GitHub/CMSC-197-Miniproject/data')

# Load dataframes
VADER_quantity_df = pd.read_json('VADER_quantity.json')
VADER_complexity_df = pd.read_json('VADER_complexity.json')
VADER_diversity_df = pd.read_json('VADER_diversity.json')
VADER_emotion_df = pd.read_json('VADER_emotion.json')

# Concatenate dataframes
VADER_df = pd.concat([reviews_df[['Label']], VADER_quantity_df, VADER_complexity_df, VADER_diversity_df, VADER_emotion_df], axis=1)

# Dataframe to Json file
VADER_df.to_json('VADER.json', orient='records')

In [8]:
# Rename the labels
VADER_df['Label'] = VADER_df['Label'].replace({'0': 'fake', '1': 'real'})

# Group the data by label and calculate statistics
VADER_statistics = VADER_df.groupby('Label').describe().transpose()

# Print statistics
print(VADER_statistics)

VADER_statistics.to_excel('VADER_statistics.xlsx')

Label                             0             1
Number_of_words count  10500.000000  10500.000000
                mean      63.690190     84.211714
                std       58.985292    106.423050
                min       16.000000     16.000000
                25%       35.000000     32.000000
...                             ...           ...
Emoticons       min        0.000000      0.000000
                25%        0.000000      0.000000
                50%        0.000000      0.000000
                75%        0.000000      0.000000
                max        4.000000      6.000000

[192 rows x 2 columns]


In [9]:
from sklearn.preprocessing import MinMaxScaler
 
# Select the columns to normalize
columns_to_normalize = ['Positive_words', 'Negative_words', 'Neutral_words',           
                        'Polarity_shifters', 'Intensity_modifiers', 'Negations', 'Emoticons',
                        'Average_word_length', 'Average_sentence_length', 'Number_of_words', 'Number_of_sentences',
                        'Number_of_caps', 'Number_of_punctuation', 'Number_of_nouns',
                        'Number_of_verbs', 'Number_of_adjectives', 'Number_of_adverbs']

scaler = MinMaxScaler()
# Normalize
VADER_df[columns_to_normalize] = scaler.fit_transform(VADER_df[columns_to_normalize])

# Dataframe to Json file
VADER_df.to_json('VADER_normalized.json', orient='records')
print(VADER_df.head())

   Label  Number_of_words  Number_of_sentences  Number_of_caps  \
0      0         0.002822             0.010152        0.001144   
1      0         0.020106             0.015228        0.002859   
2      0         0.014109             0.030457        0.003431   
3      0         0.010582             0.020305        0.004002   
4      0         0.019400             0.015228        0.002859   

   Number_of_punctuation  Number_of_nouns  Number_of_verbs  \
0               0.005357         0.005908         0.009615   
1               0.012500         0.026588         0.025000   
2               0.017857         0.013294         0.023077   
3               0.008929         0.013294         0.019231   
4               0.010714         0.023634         0.019231   

   Number_of_adjectives  Number_of_adverbs  Average_word_length  ...  \
0              0.007663           0.009174             0.123009  ...   
1              0.038314           0.022936             0.200208  ...   
2             

In [10]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

lemmatized_texts_as_strings = [' '.join(doc) for doc in reviews_df['lemmatized']]

# Fit and transform the text data
bow_matrix = vectorizer.fit_transform(lemmatized_texts_as_strings)

# Vocabulary (words and their indices)
print(vectorizer.vocabulary_)  # Dictionary: word -> column index

# Shape of the BoW matrix
print(bow_matrix.shape)  # number of documents, vocabulary size

(21000, 14131)


In [11]:
# List of words in the vocabulary
words = vectorizer.get_feature_names_out()
print(words)

['aa' 'aback' 'abalone' ... 'zoology' 'zoom' 'zucchini']


In [12]:
# Term Frequency-Inverse Document Frequency (TF-IDF)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Step 1: Create BoW matrix
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(lemmatized_texts_as_strings)

# Step 2: Transform BoW to TF-IDF
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
tfidf_matrix = tfidf_transformer.fit_transform(bow_matrix)

# Print the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Step 3: Convert TF-IDF to dense format for inspection
dense_tfidf = tfidf_matrix.toarray()

# Step 4: Display results
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)
print("TF-IDF Dense Matrix:\n", dense_tfidf)

TF-IDF matrix shape: (21000, 14131)
Feature Names: ['aa' 'aback' 'abalone' ... 'zoology' 'zoom' 'zucchini']
TF-IDF Dense Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
# Convert the sparse TF-IDF matrix to a dense array
dense_tfidf = tfidf_matrix.toarray()

# Create a DataFrame with the feature names as columns
tfidf_df = pd.DataFrame(dense_tfidf, columns=vectorizer.get_feature_names_out())

# Save the DataFrame as a CSV (optional)
tfidf_df.to_csv('tfidf.csv', index=False)

# Display the DataFrame (optional)
print(tfidf_df.head())

    aa  aback  abalone  abandon  abbey  abdomen  abduction  aberrant  \
0  0.0    0.0      0.0      0.0    0.0      0.0        0.0       0.0   
1  0.0    0.0      0.0      0.0    0.0      0.0        0.0       0.0   
2  0.0    0.0      0.0      0.0    0.0      0.0        0.0       0.0   
3  0.0    0.0      0.0      0.0    0.0      0.0        0.0       0.0   
4  0.0    0.0      0.0      0.0    0.0      0.0        0.0       0.0   

   aberration  abhor  ...  zircon  zirconia  zodiac  zombie  zone  zoo  \
0         0.0    0.0  ...     0.0       0.0     0.0     0.0   0.0  0.0   
1         0.0    0.0  ...     0.0       0.0     0.0     0.0   0.0  0.0   
2         0.0    0.0  ...     0.0       0.0     0.0     0.0   0.0  0.0   
3         0.0    0.0  ...     0.0       0.0     0.0     0.0   0.0  0.0   
4         0.0    0.0  ...     0.0       0.0     0.0     0.0   0.0  0.0   

   zoologist  zoology  zoom  zucchini  
0        0.0      0.0   0.0       0.0  
1        0.0      0.0   0.0       0.0  
2 

In [16]:
# Combine the features in one file
combined_df = pd.concat([VADER_df, tfidf_df], axis=1)
combined_df.to_csv("finalized_data.csv", index=False)
print(combined_df.head())

   Label  Number_of_words  Number_of_sentences  Number_of_caps  \
0      0         0.002822             0.010152        0.001144   
1      0         0.020106             0.015228        0.002859   
2      0         0.014109             0.030457        0.003431   
3      0         0.010582             0.020305        0.004002   
4      0         0.019400             0.015228        0.002859   

   Number_of_punctuation  Number_of_nouns  Number_of_verbs  \
0               0.005357         0.005908         0.009615   
1               0.012500         0.026588         0.025000   
2               0.017857         0.013294         0.023077   
3               0.008929         0.013294         0.019231   
4               0.010714         0.023634         0.019231   

   Number_of_adjectives  Number_of_adverbs  Average_word_length  ...  zircon  \
0              0.007663           0.009174             0.123009  ...     0.0   
1              0.038314           0.022936             0.200208  ...  