# Sentiment Analysis

## Reading Datafile 

In [1]:
import pandas as pd

# File path
file = ""C:\Users\mohan\Downloads\Special_Topics_DataScience/Statistics.csv"

# Attempt to read the file using different encodings
encodings = ['utf-8', 'utf-8-sig', 'latin1', 'ISO-8859-1']
for encoding in encodings:
    try:
        df = pd.read_csv(file, encoding=encoding)
        print("File read successfully with encoding:", encoding)
        break
    except UnicodeDecodeError:
        print("UnicodeDecodeError encountered with encoding:", encoding)
        continue

# Display the first 20 rows of the DataFrame
df.head(20)


UnicodeDecodeError encountered with encoding: utf-8
UnicodeDecodeError encountered with encoding: utf-8-sig
File read successfully with encoding: latin1


Unnamed: 0,wt_class,gender,genderN,year,event,rank,athlete,YOB,BirthYear,age,...,cj2,cj3,best_cj,t,tdiff,total_lifted,sinclair,snatch_sinclair,cj_sinclair,cluster
0,94,M,1,2015,45.2015,,adam wright,,,,...,,,,#VALUE!,#VALUE!,0.0,,,,1180
1,105,M,1,2015,45.2015,,michael ramage,,,,...,156.0,160.0,160.0,160,160,0.0,,,176.957229,1217
2,85,M,1,2015,45.2015,,marcus abiles,,,,...,-152.0,-155.0,,#VALUE!,#VALUE!,0.0,0.0,,,1139
3,85,M,1,2015,45.2015,,marcus abilies,,,,...,-152.0,-155.0,,#VALUE!,#VALUE!,0.0,,,,1139
4,105,M,1,2015,45.2015,,anthony sannella,,,,...,167.0,-172.0,167.0,167,167,0.0,,,184.296244,1219
5,94,M,1,2015,45.2015,,justin decker,,,,...,150.0,-166.0,150.0,150,150,0.0,,,171.313107,1183
6,105,M,1,2015,45.2015,,justin meyer,,,,...,-195.0,198.0,198.0,198,198,0.0,,,220.661214,1216
7,105,M,1,2015,45.2015,,david garcia,,,,...,,,,#VALUE!,#VALUE!,0.0,,,,1224
8,105,M,1,2015,45.2015,,colin burns,,,,...,,,,#VALUE!,#VALUE!,0.0,,,,1215
9,69,W,2,2015,45.2015,25.0,sesely omli,,,,...,-82.0,-82.0,78.0,141,0,141.0,178.413905,79.716851,98.697054,1338


In [2]:
from sklearn import preprocessing #Importing preprocessing module from sklearn
import nltk #Importing Natural Language Toolkit library 
nltk.download('opinion_lexicon')#Downloading the opinion lexicon dataset from NLTK
from nltk.corpus import opinion_lexicon#Importing the opinion_lexicon corpus, which contains positive&negative opinion words
from nltk.tokenize import word_tokenize#Importing word_tokenize function, which is used for tokenizing words

print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))#Printing total no of words in opinion lexicon
print('Examples of positive words in opinion lexicon',
      opinion_lexicon.positive()[:10]) # Printing examples of positive words in the opinion lexicon
print('Examples of negative words in opinion lexicon',
      opinion_lexicon.negative()[:10]) # Printing examples of negative words in the opinion lexicon

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\geeth\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [3]:
# Let's create a dictionary which we can use for scoring our review text

# Downloading punkt from NLTK library
nltk.download('punkt')

# Renaming the column 'reviewText' to 'Modules' in the DataFrame
df.rename(columns={"reviewText": "Modules"}, inplace=True)

# Assigning positive and negative scores
pos_score = 1
neg_score = -1

# Initializing an empty dictionary
word_dict = {}
 
# Adding the positive words to the dictionary
for word in opinion_lexicon.positive():
        word_dict[word] = pos_score
      
# Adding the negative words to the dictionary
for word in opinion_lexicon.negative():
        word_dict[word] = neg_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\geeth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#Creating a fuunction text
def bing_liu_score(Modules):
    #Initializing the sentiment score
    sentiment_score = 0
    #Tokenizing the input text into words and convert them to lowercase
    bag_of_words = word_tokenize(Modules.lower())
     # creating loop to check each word in the bag of words
    for word in bag_of_words:
        #Checking if the word exists in the sentiment dictionary
        if word in word_dict:
            # If the word exist, adding its sentiment score to the sentiment score
            sentiment_score += word_dict[word]
    return sentiment_score  #Returning the sentiment score for the text

In [6]:
# Fill NaN values in the 'text' column
df['athlete'].fillna('no review', inplace=True)
#creating new column 'Bing_Liu_Score' to store the scores by applying  bing_liu_score to calculate sentiment scores for Module column
df['Bing_Liu_Score'] = df['athlete'].apply(bing_liu_score)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['athlete'].fillna('no review', inplace=True)


In [8]:
import pandas as pd
import nltk
nltk.download('opinion_lexicon')
from nltk.tokenize import word_tokenize

# Function to calculate sentiment score using Bing Liu lexicon
def bing_liu_score(text):
    sentiment_score = 0
    bag_of_words = word_tokenize(str(text).lower())
    for word in bag_of_words:
        if word in word_dict:
            sentiment_score += word_dict[word]
    return sentiment_score

# File path
file_path = "C:/Users/geeth/OneDrive/Desktop/Special_Topics_DataScience/Statistics.csv"

# Attempt to read the file using different encodings
encodings = ['utf-8', 'utf-8-sig', 'latin1', 'ISO-8859-1']
for encoding in encodings:
    try:
        df = pd.read_csv(file_path, encoding=encoding)
        print("File read successfully with encoding:", encoding)
        break
    except UnicodeDecodeError:
        print("UnicodeDecodeError encountered with encoding:", encoding)
        continue

# Renaming the columns
df.rename(columns={
    'athlete': 'Athlete',
}, inplace=True)

# Fill NaN values in the 'Athlete' column without using inplace=True
df['Athlete'] = df['Athlete'].fillna('no review')

# Importing opinion lexicon and initializing sentiment dictionary
from nltk.corpus import opinion_lexicon
word_dict = {}
for word in opinion_lexicon.positive():
    word_dict[word] = 1
for word in opinion_lexicon.negative():
    word_dict[word] = -1

# Apply the function to calculate sentiment score for each athlete and create the 'Bing_Liu_Score' column
df['Bing_Liu_Score'] = df['Athlete'].apply(bing_liu_score)

# Display the first 5 rows of the DataFrame with 'Athlete' and 'Bing_Liu_Score' columns
print(df[['Athlete', 'Bing_Liu_Score']].head(15))


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\geeth\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


UnicodeDecodeError encountered with encoding: utf-8
UnicodeDecodeError encountered with encoding: utf-8-sig
File read successfully with encoding: latin1
                 Athlete  Bing_Liu_Score
0            adam wright               0
1         michael ramage               0
2          marcus abiles               0
3         marcus abilies               0
4       anthony sannella               0
5          justin decker               0
6           justin meyer               0
7           david garcia               0
8            colin burns              -1
9            sesely omli               0
10            mia hannah               0
11        roxy rodriguez               0
12           abby barron               0
13             kaia kong               0
14  amanda jo brensinger               0


In [10]:
# Displaying the first 5 rows of the DataFrame with few columns
df[['rank','snatch_sinclair','event',"Athlete", 'Bing_Liu_Score']].head(15)

Unnamed: 0,rank,snatch_sinclair,event,Athlete,Bing_Liu_Score
0,,,45.2015,adam wright,0
1,,,45.2015,michael ramage,0
2,,,45.2015,marcus abiles,0
3,,,45.2015,marcus abilies,0
4,,,45.2015,anthony sannella,0
5,,,45.2015,justin decker,0
6,,,45.2015,justin meyer,0
7,,,45.2015,david garcia,0
8,,,45.2015,colin burns,-1
9,25.0,79.716851,45.2015,sesely omli,0
