In [21]:
from functools import lru_cache

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [22]:
# Load the pre-cleaned IMDB reviews and preview the first ten rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index being re-read as data -- confirm whether it should be dropped.
cleaned_imdb_datasets = pd.read_csv(
    filepath_or_buffer='../data/processed/Cleaned IMDB Dataset.csv',
)
cleaned_imdb_datasets.head(n=10)

Unnamed: 0,review,sentiment,cleaned_review
0,one of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,a wonderful little production. <br /><br />the...,positive,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"petter mattei's ""love in the time of money"" is...",positive,petter matteis love time money visually stunni...
5,"probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,i sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"this show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,encouraged by the positive comments about this...,negative,encouraged positive comment film looking forwa...
9,if you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


In [None]:
# Fit the encoder on the string labels and obtain one integer code per row.
# Run this before the 'sentiment' column is overwritten with the codes,
# otherwise the encoder is fitted on integers and the label names are lost.
encoder = LabelEncoder()
encoded_sentiment = encoder.fit_transform(cleaned_imdb_datasets['sentiment'])
# Show the label -> code mapping once per class instead of printing one
# (code, label) tuple for every row of the dataset. LabelEncoder assigns each
# label its position in `encoder.classes_`, so enumerate() recovers the codes.
print({str(label): code for code, label in enumerate(encoder.classes_)})

In [24]:
# Swap the textual sentiment labels for their integer codes, then preview.
cleaned_imdb_datasets = cleaned_imdb_datasets.assign(sentiment=encoded_sentiment)
cleaned_imdb_datasets.head(n=10)

Unnamed: 0,review,sentiment,cleaned_review
0,one of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode you...
1,a wonderful little production. <br /><br />the...,1,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,0,basically there family little boy jake think t...
4,"petter mattei's ""love in the time of money"" is...",1,petter matteis love time money visually stunni...
5,"probably my all-time favorite movie, a story o...",1,probably alltime favorite movie story selfless...
6,i sure would like to see a resurrection of a u...,1,sure would like see resurrection dated seahunt...
7,"this show was an amazing, fresh & innovative i...",0,show amazing fresh innovative idea first aired...
8,encouraged by the positive comments about this...,0,encouraged positive comment film looking forwa...
9,if you like original gut wrenching laughter yo...,1,like original gut wrenching laughter like movi...


## Keep only words with non-neutral sentiment

In [25]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the 'vader_lexicon' NLTK package is available locally; the call
# reports "already up-to-date" when it has been downloaded before.
nltk.download('vader_lexicon')
# Single analyzer instance shared by the cells below (used by
# keep_sentiment_words to score individual words).
sentiment_analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [26]:
@lru_cache(maxsize=None)
def _compound_polarity(analyzer, word):
    """Return the analyzer's 'compound' score for one word, memoized per (analyzer, word)."""
    return analyzer.polarity_scores(word)['compound']


def keep_sentiment_words(text, threshold=0.1, analyzer=None):
    """Keep only the words of ``text`` whose sentiment polarity is non-neutral.

    Each whitespace-separated word is scored on its own and kept when the
    absolute value of its 'compound' score is strictly greater than
    ``threshold``. Word order and duplicates are preserved.

    Parameters
    ----------
    text : str
        The text to filter. Any non-string value (e.g. ``None`` or the NaN
        that pandas produces for an empty CSV field) is treated as empty.
    threshold : float, default 0.1
        Minimum absolute compound score (exclusive) for a word to be kept.
    analyzer : object, optional
        Object exposing ``polarity_scores(word) -> {'compound': float, ...}``.
        Defaults to the notebook-level ``sentiment_analyzer``.

    Returns
    -------
    str
        The kept words joined by single spaces; ``''`` when nothing is kept.
    """
    # Missing reviews arrive as float NaN from read_csv; `.split()` would raise.
    if not isinstance(text, str):
        return ''
    scorer = sentiment_analyzer if analyzer is None else analyzer
    # Scores are cached per word: the same vocabulary recurs across reviews,
    # so each distinct word is run through the analyzer only once.
    return ' '.join(
        word
        for word in text.split()
        if abs(_compound_polarity(scorer, word)) > threshold
    )


In [27]:
# Derive a column holding only the polarity-bearing words of each cleaned review.
cleaned_imdb_datasets['sentiment_text'] = (
    cleaned_imdb_datasets['cleaned_review'].map(keep_sentiment_words)
)
cleaned_imdb_datasets.head(n=10)

Unnamed: 0,review,sentiment,cleaned_review,sentiment_text
0,one of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode you...,struck brutality violence trust timid violence...
1,a wonderful little production. <br /><br />the...,1,wonderful little production filming technique ...,wonderful comforting discomforting well truly ...
2,i thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...,wonderful lighthearted comedy well suspected k...
3,basically there's a family where a little boy ...,0,basically there family little boy jake think t...,fighting kill thriller arguing like ruin meani...
4,"petter mattei's ""love in the time of money"" is...",1,petter matteis love time money visually stunni...,love stunning success play sophisticated lonel...
5,"probably my all-time favorite movie, a story o...",1,probably alltime favorite movie story selfless...,favorite noble boring truly sympathetic deligh...
6,i sure would like to see a resurrection of a u...,1,sure would like see resurrection dated seahunt...,sure like excitement hero thank like nice doubt
7,"this show was an amazing, fresh & innovative i...",0,show amazing fresh innovative idea first aired...,amazing fresh innovative brilliant funny waste...
8,encouraged by the positive comments about this...,0,encouraged positive comment film looking forwa...,encouraged positive bad mistake truly worst aw...
9,if you like original gut wrenching laughter yo...,1,like original gut wrenching laughter like movi...,like original laughter like love hell liked


In [28]:
# Spot-check the first review: full cleaned text versus the sentiment-only text.
print("Original:", cleaned_imdb_datasets['cleaned_review'].iat[0])
print("Filtered:", cleaned_imdb_datasets['sentiment_text'].iat[0])

Original: one reviewer mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate turned p

In [29]:
# Save the resulting frame (encoded labels plus the sentiment_text column)
# to a new CSV file, leaving out the pandas index.
cleaned_imdb_datasets.to_csv(
    path_or_buf='../data/processed/Cleaned IMDB Dataset with Sentiment.csv',
    index=False,
)