# Bag of Words Meets Bags of Popcorn

Filename: movie-review-model.ipynb \
Author: Timothy Holland \
Last updated: 14/06/2024 \
Kaggle competition: https://www.kaggle.com/c/word2vec-nlp-tutorial/data

## Data Summary

In [1]:
import pandas as pd

data = pd.read_csv('word2vec-nlp-tutorial/labeledTrainData.tsv', sep='\t')

In [3]:
from collections import Counter

# Print the first few rows of the data
print("First few rows of the data:")
print(data.head())

# Get the total number of samples
total_samples = len(data)
print(f"\nTotal number of samples: {total_samples}")

# Get the unique sentiments and their counts
sentiment_counts = Counter(data['sentiment'])
print("\nSentiment distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count} ({count/total_samples*100:.2f}%)")

# Get the average length of the reviews
avg_review_length = data['review'].apply(len).mean()
print(f"\nAverage review length: {avg_review_length:.2f} characters")

# Get the number of unique words in the reviews
unique_words = set()
data['review'].str.lower().str.split().apply(unique_words.update)
print(f"\nNumber of unique words: {len(unique_words)}")

# Print a random sample from the data
print("\nRandom sample:")
sample = data.sample().iloc[0]
print(f"ID: {sample['id']}")
print(f"Review: {sample['review'][:200]}...") # Print first 200 characters
print(f"Sentiment: {sample['sentiment']}")

# Calculate correlation between review length and sentiment
data['review_length'] = data['review'].apply(len)
correlation = data['review_length'].corr(data['sentiment'])
print(f"\nCorrelation between review length and sentiment: {correlation:.4f}")

# Print the most common words
word_counts = Counter()
data['review'].str.lower().str.split().apply(word_counts.update)
print("\nTop 10 most common words:")
for word, count in word_counts.most_common(10):
    print(f"{word}: {count}")

First few rows of the data:
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...

Total number of samples: 25000

Sentiment distribution:
1: 12500 (50.00%)
0: 12500 (50.00%)

Average review length: 1327.71 characters

Number of unique words: 257663

Random sample:
ID: 6827_10
Review: I spent 5 hours drenched in this film. Nothing I have ever seen comes close to the delicious funk this film left me in. Never mind females advanced aging dilemma's, human fear vaults off the screen fo...
Sentiment: 1

Correlation between review length and sentiment: 0.0219

Top 10 most common words:
the: 322198
a: 159949
and: 158572
o

## Data Processing

#### 

In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data (uncomment these lines if you haven't downloaded them before)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Preprocessing functions
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = text.split()
    
    # Remove stopwords and lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply the preprocessing function to the 'review' column
data['preprocessed_review'] = data['review'].apply(preprocess_text)

# Print the first few rows of the preprocessed data
print("Preprocessed data:")
print(data[['id', 'sentiment', 'preprocessed_review']].head())

Preprocessed data:
       id  sentiment                                preprocessed_review
0  5814_8          1  stuff going moment mj ive started listening mu...
1  2381_9          1  classic war world timothy hines entertaining f...
2  7759_3          0  film start manager nicholas bell giving welcom...
3  3630_4          0  must assumed praised film greatest filmed oper...
4  9495_8          1  superbly trashy wondrously unpretentious explo...


#### Embedding

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Split the preprocessed reviews into words
tokenized_reviews = [review.split() for review in data['preprocessed_review']]

# Train the Word2Vec model
embedding_size = 100
window_size = 5
min_word_count = 5

model = Word2Vec(tokenized_reviews, vector_size=embedding_size, window=window_size, min_count=min_word_count)

# Get the vocabulary
vocabulary = list(model.wv.index_to_key)

# Get the word vectors
word_vectors = model.wv[vocabulary]

# Perform t-SNE dimensionality reduction for visualization
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(word_vectors)

# Plot the word embeddings
plt.figure(figsize=(10, 8))
for i, word in enumerate(vocabulary):
    x, y = embeddings_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')

plt.title('Word Embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

# Find similar words
query_word = 'great'
similar_words = model.wv.most_similar(query_word, topn=10)
print(f"Similar words to '{query_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")