# Sentiment Analysis

## Reading Datafile

In [2]:
import pandas as pd   
df = pd.read_csv('Lego Data.csv') 
df.sample(10) 

Unnamed: 0,Set_ID,Name,Year,Theme,Theme_Group,Subtheme,Category,Packaging,Num_Instructions,Availability,Pieces,Minifigures,Owned,Rating,USD_MSRP,Total_Quantity,Current_Price
2508,70150-1,Flaming Claws,2014,Legends of Chima,Action/Adventure,Speedorz,Normal,Box with backing card,2,Retail,74.0,1.0,705,0.0,12.99,3,10.0
603,7311-1,Red Planet Cruiser,2001,Space,Action/Adventure,Life On Mars,Normal,Box,1,Retail,73.0,1.0,2947,3.5,,3,5.0
1579,8195-1,Turbo Tow,2010,Racers,Racing,Tiny Turbos,Normal,Canister,1,Retail,43.0,,1881,3.6,5.99,1,7.3361
1711,7067-1,Jet-Copter Encounter,2011,Space,Action/Adventure,Alien Conquest,Normal,Box,2,Retail,375.0,3.0,2691,4.0,39.99,5,58.99
4855,41923-1,Monster Bracelets,2021,Dots,Art and crafts,Bracelets,Normal,Foil pack,0,Retail,34.0,,608,0.0,5.99,3,2.98
3043,41120-1,Adventure Camp Archery,2016,Friends,Modern day,Adventure Camp,Normal,Box,2,Retail,114.0,1.0,2030,3.4,9.99,2,14.99
2123,41001-1,Mia's Magic Tricks,2013,Friends,Modern day,Pets,Normal,Box,2,Retail,90.0,1.0,3982,3.4,9.99,3,7.99
2456,45570-1,Space Challenge Set,2014,Education,Educational,Mindstorms,Normal,{Not specified},0,{Not specified},1417.0,,50,0.0,,1,450.0
934,7016-1,Viking Boat against the Wyvern Dragon,2005,Vikings,Historical,,Normal,Box,2,Retail - limited,112.0,2.0,3376,4.1,,10,49.9288
4321,10273-1,Haunted House,2020,Icons,Model making,Fairground Collection,Normal,Box,8,LEGO exclusive,3231.0,10.0,10378,4.4,299.99,6,250.0


## Exploring Opinion Lexicon in NLTK Library

In [3]:
from sklearn import preprocessing #Importing preprocessing module from sklearn
import nltk #Importing Natural Language Toolkit library 
nltk.download('opinion_lexicon')#Downloading the opinion lexicon dataset from NLTK
from nltk.corpus import opinion_lexicon#Importing the opinion_lexicon corpus, which contains positive&negative opinion words
from nltk.tokenize import word_tokenize#Importing word_tokenize function, which is used for tokenizing words

print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))#Printing total no of words in opinion lexicon
print('Examples of positive words in opinion lexicon',
      opinion_lexicon.positive()[:10]) # Printing examples of positive words in the opinion lexicon
print('Examples of negative words in opinion lexicon',
      opinion_lexicon.negative()[:10]) # Printing examples of negative words in the opinion lexicon

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


## Creation of Dictionary for Sentiment Analysis

In [4]:
# Let's create a dictionary which we can use for scoring our review text

# Downloading punkt from NLTK library
nltk.download('punkt')

# Renaming the column 'reviewText' to 'Modules' in the DataFrame
df.rename(columns={"reviewText": "Modules"}, inplace=True)

# Assigning positive and negative scores
pos_score = 1
neg_score = -1

# Initializing an empty dictionary
word_dict = {}
 
# Adding the positive words to the dictionary
for word in opinion_lexicon.positive():
        word_dict[word] = pos_score
      
# Adding the negative words to the dictionary
for word in opinion_lexicon.negative():
        word_dict[word] = neg_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Calculating Sentiment Score with Bing Liu Lexicon

In [5]:
#Creating a fuunction text
def bing_liu_score(Modules):
    #Initializing the sentiment score
    sentiment_score = 0
    #Tokenizing the input text into words and convert them to lowercase
    bag_of_words = word_tokenize(Modules.lower())
     # creating loop to check each word in the bag of words
    for word in bag_of_words:
        #Checking if the word exists in the sentiment dictionary
        if word in word_dict:
            # If the word exist, adding its sentiment score to the sentiment score
            sentiment_score += word_dict[word]
    return sentiment_score  #Returning the sentiment score for the text

In [18]:
import pandas as pd

# Read the dataset into a DataFrame
df = pd.read_csv('Lego Data.csv')

# Display the first few rows of the DataFrame
print(df.head())

# Displaying the first 5 rows of the DataFrame with selected columns
selected_columns = ['Schedule', 'Rating', 'Modules', 'Bing_Liu_Score']

# Check if all selected columns exist in the DataFrame
if all(col in df.columns for col in selected_columns):
    # Display the first 5 rows with the selected columns
    df_selected = df[selected_columns].head(5)
    print(df_selected)
else:
    # Print a message if one or more selected columns are not found
    print("One or more selected columns not found in the DataFrame.")



  Set_ID                 Name  Year     Theme Theme_Group         Subtheme  \
0  659-1        Police Patrol  1975  LEGOLAND     Vintage              NaN   
1  314-1        Police Launch  1976  LEGOLAND     Vintage            Boats   
2  369-1  Coast Guard Station  1976  LEGOLAND     Vintage         Building   
3  485-1           Fire Truck  1976  LEGOLAND     Vintage              NaN   
4  787-1        Storage Cloth  1977     Basic       Basic  Supplementaries   

  Category        Packaging  Num_Instructions     Availability  Pieces  \
0   Normal  {Not specified}                 0  {Not specified}    49.0   
1   Normal  {Not specified}                 0  {Not specified}    53.0   
2   Normal  {Not specified}                 0  {Not specified}   275.0   
3   Normal  {Not specified}                 0  {Not specified}    72.0   
4   Normal  {Not specified}                 0  {Not specified}     1.0   

   Minifigures  Owned  Rating  USD_MSRP  Total_Quantity  Current_Price  
0          2.

## Calculating Mean Sentiment Score

In [10]:
import pandas as pd

# Read the dataset into a DataFrame
df = pd.read_csv('Lego Data.csv')

# Display a random sample of 10 rows from the dataset
print(df.sample(10))

# Check if 'Modules' column exists in the DataFrame
if 'Modules' in df.columns:
    # Fill NaN values in the 'Modules' column with 'no review'
    df['Modules'].fillna('no review', inplace=True)

    # Assuming you have defined the bing_liu_score function elsewhere in your code
    # Create a new column 'Bing_Liu_Score' to store sentiment scores by applying bing_liu_score to the 'Modules' column
    df['Bing_Liu_Score'] = df['Modules'].apply(bing_liu_score)
else:
    print("The 'Modules' column does not exist in the DataFrame.")



       Set_ID                            Name  Year         Theme  \
2384  40182-1         Bricktober Fire Station  2014   Promotional   
1928   9478-1             Francesco Bernoulli  2012          Cars   
213    6666-1                       Ambulance  1994          Town   
3838  60195-1  Arctic Mobile Exploration Base  2018          City   
249    1747-1               Treasure Surprise  1996       Pirates   
330    8437-1                      Future Car  1997       Technic   
4691  10945-1     Garbage Truck and Recycling  2021         Duplo   
2949  21031-1                    Burj Khalifa  2016  Architecture   
289    2181-1                      Infomaniac  1997          Town   
2410  41054-1     Rapunzel's Creativity Tower  2014        Disney   

        Theme_Group         Subtheme Category Packaging  Num_Instructions  \
2384  Miscellaneous        Toys R Us   Normal       Box                 1   
1928       Licensed           Cars 2   Normal       Box                 2   
213      

In [37]:
df.columns

Index(['Set_ID', 'Name', 'Year', 'Theme', 'Theme_Group', 'Subtheme',
       'Category', 'Packaging', 'Num_Instructions', 'Availability', 'Pieces',
       'Minifigures', 'Owned', 'Rating', 'USD_MSRP', 'Total_Quantity',
       'Current_Price'],
      dtype='object')

In [39]:
# Assuming you have a column named 'Description' containing textual data related to LEGO themes
# You need to preprocess the text data, remove stopwords, and perform sentiment analysis
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.util import mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download NLTK resources if not already downloaded
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Preprocess text and perform sentiment analysis
stop_words = set(stopwords.words('english'))
sid = SentimentIntensityAnalyzer()

# Function to calculate sentiment score for a given text
def calculate_sentiment(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    tokenized_text = mark_negation(tokens)
    # Calculate sentiment score using VADER sentiment analyzer
    sentiment_score = sid.polarity_scores(' '.join(tokenized_text))['compound']
    return sentiment_score

# Apply sentiment analysis to each theme description
df['Sentiment_Score'] = df['Theme'].apply(calculate_sentiment)

# Aggregate sentiment scores by theme
theme_sentiment_scores = df.groupby('Theme')['Sentiment_Score'].mean()
print(theme_sentiment_scores)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Theme
4 Juniors          0.0000
Advanced models    0.2500
Adventurers        0.2263
Agents             0.0000
Alpha Team         0.0000
                    ...  
Western            0.0000
World City         0.0000
World Racers       0.0000
Xtra               0.0000
Znap               0.0000
Name: Sentiment_Score, Length: 130, dtype: float64


In [41]:
# Define a function to calculate sentiment score using Bing Liu Lexicon
def calculate_bing_liu_score(text):
    # Implement your logic to calculate the sentiment score using Bing Liu Lexicon
    # This function should return the sentiment score for the given text
    # Replace this placeholder code with your actual implementation
    return 0  # Placeholder value, replace it with your actual calculation

# Apply sentiment analysis to each row of the 'Name' column
df['Bing_Liu_Score'] = df['Name'].apply(calculate_bing_liu_score)

# Print the DataFrame with the calculated Bing Liu Score
print(df[['Name', 'Bing_Liu_Score']])


                                  Name  Bing_Liu_Score
0                        Police Patrol               0
1                        Police Launch               0
2                  Coast Guard Station               0
3                           Fire Truck               0
4                        Storage Cloth               0
...                                ...             ...
5437         Temple of the Golden Idol               0
5438           Monkie Kid's Combi Mech               0
5439        Master Wu vs. Ghost Archer               0
5440  Cave Explorer, Creeper and Slime               0
5441                     Miles Morales               0

[5442 rows x 2 columns]


In [42]:
df[['Theme','Rating',"Name", 'Bing_Liu_Score']].head(5)

Unnamed: 0,Theme,Rating,Name,Bing_Liu_Score
0,LEGOLAND,3.7,Police Patrol,0
1,LEGOLAND,3.2,Police Launch,0
2,LEGOLAND,0.0,Coast Guard Station,0
3,LEGOLAND,0.0,Fire Truck,0
4,Basic,0.0,Storage Cloth,0


In [43]:
df.groupby('Theme').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
Theme,Unnamed: 1_level_1
4 Juniors,0.0
Advanced models,0.0
Adventurers,0.0
Agents,0.0
Alpha Team,0.0
...,...
Western,0.0
World City,0.0
World Racers,0.0
Xtra,0.0


In [44]:
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

# Load the Bing Liu Lexicon
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# Sample LEGO theme data (replace this with your dataset)
theme_data = {
    'Theme': ['4 Juniors', 'Advanced models', 'Adventurers', 'Agents', 'Alpha Team', 'Western', 'World City', 'World Racers', 'Xtra', 'Znap']
}

# Create a DataFrame from the theme data
df_themes = pd.DataFrame(theme_data)

# Function to calculate sentiment score using Bing Liu Lexicon
def calculate_sentiment(theme_name):
    tokens = word_tokenize(theme_name.lower())
    positive_count = sum(1 for word in tokens if word in positive_words)
    negative_count = sum(1 for word in tokens if word in negative_words)
    return positive_count - negative_count

# Apply sentiment analysis to each LEGO theme
df_themes['Bing_Liu_Score'] = df_themes['Theme'].apply(calculate_sentiment)

# Print the DataFrame with sentiment scores
print(df_themes)


             Theme  Bing_Liu_Score
0        4 Juniors               0
1  Advanced models               1
2      Adventurers               0
3           Agents               0
4       Alpha Team               0
5          Western               0
6       World City               0
7     World Racers               0
8             Xtra               0
9             Znap               0


In [49]:
# Calculate the mean of the 'Bing_Liu_Score' column
bing_liu_score_mean = df['Bing_Liu_Score'].mean()

# Print the mean score
print("Mean Bing Liu Score:", bing_liu_score_mean)



Mean Bing Liu Score: 0.0


In [54]:
df.groupby('Theme').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
Theme,Unnamed: 1_level_1
4 Juniors,0.0
Advanced models,0.0
Adventurers,0.0
Agents,0.0
Alpha Team,0.0
...,...
Western,0.0
World City,0.0
World Racers,0.0
Xtra,0.0


In [55]:
# Calculate the mean sentiment score for the entire dataset
mean_sentiment_score = df['Bing_Liu_Score'].mean()

print("Mean Sentiment Score:", mean_sentiment_score)

Mean Sentiment Score: 0.0
