In [1]:
import nltk
from nltk.corpus import movie_reviews
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models import Word2Vec

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Download the dataset
nltk.download('movie_reviews')

# Load the reviews data into a DataFrame
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Convert the documents into a pandas DataFrame
data = pd.DataFrame(documents, columns=['Review', 'Sentiment'])

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [3]:
# Initialize stop words and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Function to clean the text
def preprocess_text(text):
    text = ' '.join(text)  # Join the tokenized words into a string
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]  # Remove stop words and stem words
    return ' '.join(tokens)

# Apply the preprocessing to each review
data['Cleaned_Review'] = data['Review'].apply(preprocess_text)

In [4]:
' '.join(data['Review'][2])#[0]

"it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . based on the late 1960 ' s television show by the same name , the mod squad tells the tale of three reformed criminals under the employ of the police to go undercover . however , things go wrong as evidence gets stolen and they are immediately under suspicion . of course , the ads make it seem like so much more . quick cuts , cool music , claire dane ' s nice hair and cute outfits , car chases , stuff blowing up , and the like . sounds like a cool movie , does it not ? after the first fifteen minutes , it quickly becomes apparent that it is not . the mod squad is certainly a slick looking production , complete with nice hair and costumes , but that simply isn ' t enough . the film is best described as a cross between an hour - long cop show and a music video , both stretched out into the span of an hour and a half . and with it comes every single clich ? . it doesn ' t really

In [5]:
data['Cleaned_Review'][2]

'movi like make jade movi viewer thank invent timex indiglo watch base late 1960 televis show name mod squad tell tale three reform crimin employ polic go undercov howev thing go wrong evid get stolen immedi suspicion cours ad make seem like much quick cut cool music clair dane nice hair cute outfit car chase stuff blow like sound like cool movi first fifteen minut quickli becom appar mod squad certainli slick look product complet nice hair costum simpli enough film best describ cross hour long cop show music video stretch span hour half come everi singl clich realli matter film base televis show plot element recycl everyth alreadi seen charact act noth spectacular sometim even border wooden clair dane omar epp deliv line bore realli transfer onto audienc one escap rel unscath giovanni ribisi play resid crazi man ultim thing worth watch unfortun even enough save convolut mess charact much apart occupi screen time young cast cool cloth nice hair hip soundtrack appear film gear toward te

In [6]:
# Convert the cleaned text into a document-term matrix
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['Cleaned_Review'])

# Initialize LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Get the top words for each topic
for index, topic in enumerate(lda.components_):
    print(f"Top words for topic #{index}")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print("\n")


Top words for topic #0
['scene', 'love', 'time', 'like', 'make', 'life', 'stori', 'charact', 'one', 'film']


Top words for topic #1
['realli', 'even', 'go', 'make', 'charact', 'like', 'get', 'one', 'movi', 'film']


Top words for topic #2
['charact', 'even', 'make', 'time', 'alien', 'effect', 'like', 'one', 'movi', 'film']


Top words for topic #3
['time', 'make', 'charact', 'play', 'get', 'good', 'like', 'one', 'film', 'movi']


Top words for topic #4
['also', 'get', 'scene', 'time', 'even', 'charact', 'like', 'movi', 'one', 'film']




In [7]:
# Train a Word2Vec model
sentences = [review.split() for review in data['Cleaned_Review']]
word2vec_model = Word2Vec(sentences, vector_size=256, window=10, min_count=10, workers=4)

In [8]:
# Find similar words for a given word
word = 'direct'
similar_words = word2vec_model.wv.most_similar(word, topn=3)
print(f"Words similar to '{word}': {similar_words}")

# Replace words in the dataset if needed
def replace_similar_words(text, word2vec_model, word):
    for similar_word, _ in word2vec_model.wv.most_similar(word):
        text = text.replace(similar_word, word)
    return text

# Apply the replacement to the cleaned reviews
data['Cleaned_Review'] = data['Cleaned_Review'].apply(lambda x: replace_similar_words(x, word2vec_model, 'good'))


Words similar to 'direct': [('director', 0.951945424079895), ('written', 0.9232062101364136), ('score', 0.9129873514175415)]


In [9]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(data['Cleaned_Review'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['Sentiment'], test_size=0.2, random_state=42)

# Initialize and train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test)
print(f"Accuracy: {100 * accuracy_score(y_test, y_pred)}%")


Accuracy: 79.5%


DONE!!!