# DSC550-T301 Data Mining

## Week 3: 3.2 Exercise: Sentiment Analysis and Preprocessing Text / Daniel Solis Toro

### PART 1 — Using the TextBlob Sentiment Analyzer

1. Import the movie review data

In [4]:
# Import libraries
import pandas as pd

# Load the dataset
df = pd.read_csv("labeledTrainData.tsv", sep="\t")

# Display dimensions
df.shape

(25000, 3)

2. Count positive and negative reviews

In [6]:
df['sentiment'].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

3. Classify sentiment using TextBlob

In [8]:
# Install and import TextBlob
from textblob import TextBlob

# Define a function to classify sentiment
def textblob_sentiment(review):
    blob = TextBlob(review)
    polarity = blob.sentiment.polarity
    
    if polarity >= 0:
        return 1   # positive
    else:
        return 0   # negative

# Apply the function to all reviews
df['tb_prediction'] = df['review'].apply(textblob_sentiment)

4. Check accuracy of TextBlob

In [10]:
# Import accuracy metric
from sklearn.metrics import accuracy_score

# Compute accuracy
accuracy_tb = accuracy_score(df['sentiment'], df['tb_prediction'])
accuracy_tb

0.68524

### Is this better than random guessing?

- Random guessing = 50% accuracy
- TextBlob accuracy = 68.5%

Yes, it’s better than random, but not great

### Extra Credit: VADER Sentiment Analyzer

In [13]:
# Import VADER
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Initialize analyzer
sia = SentimentIntensityAnalyzer()

# Create prediction function
def vader_sentiment(review):
    score = sia.polarity_scores(review)['compound']
    
    if score >= 0:
        return 1
    else:
        return 0

# Apply and evaluate
df['vader_prediction'] = df['review'].apply(vader_sentiment)

accuracy_vader = accuracy_score(df['sentiment'], df['vader_prediction'])
accuracy_vader

0.69356

### Is this better than random guessing?

- Random guessing = 50% accuracy
- VADER accuracy = 69.3%

Yes, it’s better than random, but not great

### PART 2 — Prepping Text for a Custom Model

1. Convert text to lowercase

In [17]:
df['clean_review'] = df['review'].str.lower()

2. Remove punctuation and special characters

In [19]:
import re

df['clean_review'] = df['clean_review'].apply(
    lambda x: re.sub(r'[^a-z\s]', '', x)
)

3. Remove stop words

In [21]:
# Import stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Remove them
df['clean_review'] = df['clean_review'].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)



4. Apply Porter Stemmer

In [23]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

df['stemmed_review'] = df['clean_review'].apply(
    lambda x: " ".join([ps.stem(word) for word in x.split()])
)


5. Create a Bag-of-Words matrix

In [25]:
# Import vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create BoW matrix
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['stemmed_review'])

# Display dimensions
bow_matrix.shape


(25000, 89468)

6. Create a TF-IDF matrix

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['stemmed_review'])

# Display dimensions
tfidf_matrix.shape


(25000, 89468)