## Part 1: Using the TextBlob Sentiment Analyzer

In [1]:
import pandas as pd


# Load file
file_path = '/Users/cheribeda/Downloads/word2vec-nlp-tutorial/LTD.tsv'

# Load the TSV file into a DataFrame
df = pd.read_csv(file_path, sep='\t')

# Display the first few rows of the DataFrame to verify it's loaded correctly
df.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [2]:
# Count the occurrences of each sentiment value
sentiment_counts = df['sentiment'].value_counts()

# Print the counts
print("Number of Positive Reviews:", sentiment_counts[1])
print("Number of Negative Reviews:", sentiment_counts[0])


Number of Positive Reviews: 12500
Number of Negative Reviews: 12500


In [6]:
pip install textblob


Collecting textblob
  Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/02/07/5fd2945356dd839974d3a25de8a142dc37293c21315729a41e775b5f3569/textblob-0.18.0.post0-py3-none-any.whl.metadata
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.18.0.post0
Note: you may need to restart the kernel to use updated packages.


In [9]:
from textblob import TextBlob

# Define a function to classify sentiment
def classify_sentiment(review):
    # Create a TextBlob object for the review text
    blob = TextBlob(review)
    
    # Get the polarity score of the review
    polarity_score = blob.sentiment.polarity
    
    # Classify sentiment based on polarity score
    if polarity_score >= 0:
        return 'positive'
    else:
        return 'negative'

# Apply the function to classify sentiment for each review
df['sentiment_classified'] = df['review'].apply(classify_sentiment)

# Display the first few rows of the DataFrame with the classified sentiment
df.head()


Unnamed: 0,id,sentiment,review,sentiment_classified
0,5814_8,1,With all this stuff going down at the moment w...,positive
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",positive
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,negative
3,3630_4,0,It must be assumed that those who praised this...,positive
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,negative


In [7]:
# Calculate accuracy
correct_predictions = (df['sentiment_classified'] == df['sentiment']).sum()
total_predictions = len(df)
accuracy = correct_predictions / total_predictions
print("Accuracy:", accuracy)



Accuracy: 0.0


In [8]:
# Compare with random guessing
random_guessing_accuracy = 0.5  # equal chance for positive and negative sentiment
print("Random Guessing Accuracy:", random_guessing_accuracy)


Random Guessing Accuracy: 0.5


In [10]:
pip install vaderSentiment


Collecting vaderSentiment
  Obtaining dependency information for vaderSentiment from https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define a function to classify sentiment using VADER
def classify_sentiment_vader(review):
    # Analyze sentiment using VADER
    sentiment_scores = analyzer.polarity_scores(review)
    
    # Classify sentiment based on compound score
    compound_score = sentiment_scores['compound']
    if compound_score >= 0:
        return 0  # 0 for positive sentiment
    else:
        return 1  # 1 for negative sentiment

# Apply the VADER-based sentiment classification function to classify sentiment for each review
df['sentiment_classified_vader'] = df['review'].apply(classify_sentiment_vader)

# Calculate accuracy of VADER-based sentiment analysis
correct_predictions_vader = (df['sentiment_classified_vader'] == df['sentiment']).sum()
accuracy_vader = correct_predictions_vader / total_predictions
print("VADER Accuracy:", accuracy_vader)


VADER Accuracy: 0.30596


## Part 2: Prepping Text for a Custom Model

In [12]:
# Convert all text to lowercase letters
df['review_lower'] = df['review'].str.lower()

# Display the first few rows of the DataFrame with lowercase text
print(df['review_lower'].head())


0    with all this stuff going down at the moment w...
1    \the classic war of the worlds\" by timothy hi...
2    the film starts with a manager (nicholas bell)...
3    it must be assumed that those who praised this...
4    superbly trashy and wondrously unpretentious 8...
Name: review_lower, dtype: object


In [14]:
import re

# Define a function to remove punctuation and special characters
def remove_special_characters(text):
    # Define the pattern to match punctuation and special characters
    pattern = r'[^a-zA-Z0-9\s]'
    # Replace the punctuation and special characters with an empty string
    text_cleaned = re.sub(pattern, '', text)
    return text_cleaned

# Apply the function to remove punctuation and special characters from the text
df['review_cleaned'] = df['review_lower'].apply(remove_special_characters)

# Display the first few rows of the DataFrame with cleaned text
print(df['review_cleaned'].head())


0    with all this stuff going down at the moment w...
1    the classic war of the worlds by timothy hines...
2    the film starts with a manager nicholas bell g...
3    it must be assumed that those who praised this...
4    superbly trashy and wondrously unpretentious 8...
Name: review_cleaned, dtype: object


In [15]:
import nltk
from nltk.corpus import stopwords

# Download the list of stop words
nltk.download('stopwords')

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Define a function to remove stop words
def remove_stop_words(text):
    # Split the text into words
    words = text.split()
    # Remove stop words
    words_filtered = [word for word in words if word.lower() not in stop_words]
    # Join the words back into a single string
    text_filtered = ' '.join(words_filtered)
    return text_filtered

# Apply the function to remove stop words from the text
df['review_filtered'] = df['review_cleaned'].apply(remove_stop_words)

# Display the first few rows of the DataFrame with stop words removed
print(df['review_filtered'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cheribeda/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0    stuff going moment mj ive started listening mu...
1    classic war worlds timothy hines entertaining ...
2    film starts manager nicholas bell giving welco...
3    must assumed praised film greatest filmed oper...
4    superbly trashy wondrously unpretentious 80s e...
Name: review_filtered, dtype: object


In [16]:
import nltk
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer
stemmer = PorterStemmer()

# Define a function to apply PorterStemmer to text
def apply_stemming(text):
    # Tokenize the text into words
    words = text.split()
    # Apply stemming to each word
    stemmed_words = [stemmer.stem(word) for word in words]
    # Join the stemmed words back into a single string
    text_stemmed = ' '.join(stemmed_words)
    return text_stemmed

# Apply the function to apply PorterStemmer to the text
df['review_stemmed'] = df['review_filtered'].apply(apply_stemming)

# Display the first few rows of the DataFrame with PorterStemmer applied
print(df['review_stemmed'].head())


0    stuff go moment mj ive start listen music watc...
1    classic war world timothi hine entertain film ...
2    film start manag nichola bell give welcom inve...
3    must assum prais film greatest film opera ever...
4    superbl trashi wondrous unpretenti 80 exploit ...
Name: review_stemmed, dtype: object


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the stemmed text data
bag_of_words_matrix = vectorizer.fit_transform(df['review_stemmed'])

# Display the dimensions of the bag-of-words matrix
print("Dimensions of the bag-of-words matrix:", bag_of_words_matrix.shape)


Dimensions of the bag-of-words matrix: (25000, 92226)


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the stemmed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review_stemmed'])

# Display the dimensions of the TF-IDF matrix
print("Dimensions of the TF-IDF matrix:", tfidf_matrix.shape)


Dimensions of the TF-IDF matrix: (25000, 92226)
