# Sentiment Analysis Using Naive Bayes & Cosine Similarity

In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
# URL of the IMDb user-reviews page to scrape (title id tt0111161).
# Change the title id in the path to analyse a different title.
url = "https://www.imdb.com/title/tt0111161/reviews"

In [None]:
# Function to scrape IMDb movie reviews
def scrape_imdb_reviews(url, timeout=10):
    """Download an IMDb reviews page and return the text of each review.

    Args:
        url: Address of the reviews page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of review strings in page order. The list is empty when no
        element on the page matches the review selector.

    Raises:
        requests.exceptions.RequestException: If the connection fails, the
            request times out, or the server answers with an HTTP error
            status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty list.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): this selector depends on IMDb's current page markup;
    # confirm it still matches if the function starts returning [].
    return [
        review.get_text()
        for review in soup.find_all('div', class_='text show-more__control')
    ]

In [None]:
# Fetch the page at `url` and collect the text of every matched review
# element; `reviews` is a list of strings (empty if nothing matched).
reviews = scrape_imdb_reviews(url)

In [None]:
# Small hand-picked sample vocabularies for the cosine-similarity sentiment
# analysis; both lists are passed to sentiment_analysis_cosine below.
positive_words = ["good", "great", "excellent", "wonderful"]
negative_words = ["bad", "poor", "awful", "terrible"]

In [None]:
# Function to perform sentiment analysis using cosine similarity
def sentiment_analysis_cosine(reviews, positive_words, negative_words):
    """Label each review "Positive" or "Negative" by TF-IDF cosine similarity.

    Every review is tokenized, lower-cased, stripped of non-alphabetic tokens
    and English stop words, and stemmed. The positive and negative word lists
    go through the same preprocessing and are treated as two reference
    documents. All documents share one TF-IDF vocabulary, and each review
    receives the label of the reference document it is more similar to.

    Args:
        reviews: Iterable of raw review strings.
        positive_words: Words that signal positive sentiment.
        negative_words: Words that signal negative sentiment.

    Returns:
        A list with one label per review, in input order. Ties (including
        reviews that share no term with either word list) are labelled
        "Negative".
    """
    reviews = list(reviews)
    if not reviews:
        return []

    # Tokenize and preprocess reviews
    # NOTE(review): recent NLTK releases may also need the 'punkt_tab'
    # resource for word_tokenize -- confirm against the installed version.
    nltk.download('punkt')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def preprocess_text(text):
        """Return `text` as a space-joined string of stemmed content words."""
        words = word_tokenize(text)
        words = [
            word.lower()
            for word in words
            if word.isalpha() and word.lower() not in stop_words
        ]
        return ' '.join(stemmer.stem(word) for word in words)

    preprocessed_reviews = [preprocess_text(review) for review in reviews]
    # The word lists must be stemmed exactly like the reviews, otherwise
    # e.g. "excellent" never matches the stemmed review token.
    positive_doc = preprocess_text(' '.join(positive_words))
    negative_doc = preprocess_text(' '.join(negative_words))

    # Create TF-IDF vectors over reviews and both reference documents so
    # that they all live in the same vector space.
    vectorizer = TfidfVectorizer()
    documents = preprocessed_reviews + [positive_doc, negative_doc]
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # Empty vocabulary: nothing survived preprocessing, so every review
        # ties at zero similarity and falls to the tie label.
        return ["Negative"] * len(reviews)

    # Calculate cosine similarity. TfidfVectorizer L2-normalises each row by
    # default, so the dot product of two rows is their cosine similarity.
    review_count = len(reviews)
    review_vectors = tfidf_matrix[:review_count]
    reference_vectors = tfidf_matrix[review_count:]
    # Shape (review_count, 2): column 0 = positive, column 1 = negative.
    similarity = (review_vectors @ reference_vectors.T).toarray()

    # Classify reviews as positive or negative
    return [
        "Positive" if pos_sim > neg_sim else "Negative"
        for pos_sim, neg_sim in similarity
    ]

In [None]:
# Label every scraped review with the cosine-similarity approach;
# `sentiments` holds "Positive"/"Negative" strings, one per entry in `reviews`.
sentiments = sentiment_analysis_cosine(reviews, positive_words, negative_words)

In [None]:
# Print the sentiment label assigned to each review, numbered from 1
for review_number, sentiment in enumerate(sentiments, start=1):
    print(f"Review {review_number}: {sentiment}")

In [None]:
# Sample ground-truth labels for the reviews (label your own reviews accordingly).
# NOTE(review): only five placeholder labels are given here, while the number
# of scraped reviews is not fixed; train_test_split needs exactly one label
# per review -- confirm the lengths match before running the split.
labels = ["Positive", "Negative", "Positive", "Negative", "Positive"]

In [None]:
# Split the labelled reviews into training and testing sets: 20% is held out
# for testing, and random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

In [None]:
# Function to perform sentiment analysis using Naive Bayes
def sentiment_analysis_naive_bayes(X_train, y_train, X_test):
    """Train a multinomial Naive Bayes classifier and predict test labels.

    The TF-IDF vocabulary and weights are learned from the training texts
    only; the test texts are projected into that same feature space before
    prediction.

    Args:
        X_train: Training review texts.
        y_train: Sentiment label for each training text.
        X_test: Review texts to classify.

    Returns:
        The predicted label for each entry of ``X_test``.
    """
    tfidf = TfidfVectorizer()
    train_features = tfidf.fit_transform(X_train)

    # fit() returns the fitted estimator, so training can be chained.
    model = MultinomialNB().fit(train_features, y_train)

    test_features = tfidf.transform(X_test)
    return model.predict(test_features)

In [None]:
# Train Naive Bayes on the training split and predict a label for each
# held-out review in X_test.
y_pred = sentiment_analysis_naive_bayes(X_train, y_train, X_test)

In [None]:
# Calculate accuracy: the fraction of held-out reviews whose predicted label
# equals the true label, printed with two decimal places.
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {accuracy:.2f}")