## This line of code displays time taken at the end of each output

In [47]:
!pip install ipython-autotime
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 7.56 s (started: 2024-04-28 00:07:58 +00:00)


## Importing necessary libraries and dependencies

In [48]:
from transformers import pipeline
import pandas as pd
import re
import string
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

time: 806 µs (started: 2024-04-28 00:08:05 +00:00)


# Computing sentiment scores using the transformer-based approach.
### I used the Hugging Face Transformers library for this purpose.

In [49]:
# Read the CSV file
data = pd.read_csv("Movies_27K_Reviews.csv")

# Drop missing values in critic_line and audience_review fields
full_data = data.dropna(subset=['critic_line', 'audience_review'])

# Get a random sample of the cleaned data (replace 100 with your desired sample size)
data = full_data.sample(2000)

# Drop unnecessary fields
data = data[['movie_name', 'critic_line', 'audience_review', 'critic_score', 'audience_score']]

# Display the cleaned data
print(data.head())

def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub('\s+', ' ', text).strip()
    return text

data['critic_line'] = data['critic_line'].apply(preprocess_text)
data['audience_review'] = data['audience_review'].apply(preprocess_text)

# Use a pre-trained sentiment analysis pipeline from Hugging Face Transformers
sentiment_pipeline = pipeline("sentiment-analysis")

# Function to get sentiment scores from the pipeline
def get_sentiment_scores(texts):
    # Ensure texts is a list of strings
    texts = list(texts)
    # Truncate text to fit within the maximum sequence length
    max_sequence_length = 512
    truncated_texts = [text[:max_sequence_length] for text in texts]
    results = sentiment_pipeline(truncated_texts)
    return [result['score'] for result in results]

# Compute sentiment scores for critic_line
data['critic_line_sentiment_score'] = get_sentiment_scores(data['critic_line'])

# Compute sentiment scores for audience_review
data['audience_review_sentiment_score'] = get_sentiment_scores(data['audience_review'])

# Display the sentiment scores
print(data[['critic_line', 'critic_line_sentiment_score', 'audience_review', 'audience_review_sentiment_score']].head())


                 movie_name  \
24649       Youth in Revolt   
9163            Equalizer 2   
19114  Just Getting Started   
7823    Night of the Creeps   
15264           The Crazies   

                                             critic_line  \
24649  The tone throughout is moderately quirky rathe...   
9163   Fuqua's direction is slick but generally unins...   
19114  The producers must think this drivel is what t...   
7823   ...a decent (if somewhat lackluster) homage/th...   
15264  [A] respectable update.January 3, 2011| Full R...   

                                         audience_review critic_score  \
24649  Milquetoast Michael Cera as a pencil-mustache-...          66%   
9163   This fell into the typical cliché riddled stor...          52%   
19114  ..... I have no words. I know actors do movies...           4%   
7823   Thrill me.A cult '80 horror-comedy, Night of t...          75%   
15264  A tedious, formulaic movie full of clichés, de...          70%   

      audienc

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


                                             critic_line  \
24649  the tone throughout is moderately quirky rathe...   
9163   fuquas direction is slick but generally uninsp...   
19114  the producers must think this drivel is what t...   
7823   a decent if somewhat lackluster homagethrowbac...   
15264  a respectable updatejanuary 3 2011 full review...   

       critic_line_sentiment_score  \
24649                     0.991401   
9163                      0.760362   
19114                     0.999249   
7823                      0.998219   
15264                     0.638720   

                                         audience_review  \
24649  milquetoast michael cera as a pencilmustachesp...   
9163   this fell into the typical cliché riddled stor...   
19114  i have no words i know actors do movies out of...   
7823   thrill mea cult 80 horrorcomedy night of the c...   
15264  a tedious formulaic movie full of clichés deus...   

       audience_review_sentiment_score  
24649   

# Computing sentiment scores using the transformer-based approach.
### Used Hugging Face Transformers library for this purpose.

In [50]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Download NLTK resources
nltk.download('vader_lexicon')

# Initialize NLTK Vader
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment scores using NLTK Vader
def get_vader_sentiment_scores(texts):
    scores = []
    for text in texts:
        score = sia.polarity_scores(text)
        scores.append(score['compound'])  # Use compound score as the overall sentiment
    return scores

# Function to compute sentiment scores using TextBlob
def get_textblob_sentiment_scores(texts):
    scores = []
    for text in texts:
        blob = TextBlob(text)
        scores.append(blob.sentiment.polarity)
    return scores

# Compute sentiment scores using NLTK Vader for critic_line
data['critic_line_vader_sentiment_score'] = get_vader_sentiment_scores(data['critic_line'])

# Compute sentiment scores using NLTK Vader for audience_review
data['audience_review_vader_sentiment_score'] = get_vader_sentiment_scores(data['audience_review'])

# Compute sentiment scores using TextBlob for critic_line
data['critic_line_textblob_sentiment_score'] = get_textblob_sentiment_scores(data['critic_line'])

# Compute sentiment scores using TextBlob for audience_review
data['audience_review_textblob_sentiment_score'] = get_textblob_sentiment_scores(data['audience_review'])

# Display the sentiment scores
print(data[['critic_line', 'critic_line_vader_sentiment_score', 'critic_line_textblob_sentiment_score',
            'audience_review', 'audience_review_vader_sentiment_score', 'audience_review_textblob_sentiment_score']].head())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                             critic_line  \
24649  the tone throughout is moderately quirky rathe...   
9163   fuquas direction is slick but generally uninsp...   
19114  the producers must think this drivel is what t...   
7823   a decent if somewhat lackluster homagethrowbac...   
15264  a respectable updatejanuary 3 2011 full review...   

       critic_line_vader_sentiment_score  \
24649                             0.7185   
9163                              0.9919   
19114                            -0.4978   
7823                              0.9722   
15264                             0.8463   

       critic_line_textblob_sentiment_score  \
24649                              0.134300   
9163                               0.217225   
19114                              0.102264   
7823                               0.231548   
15264                              0.160042   

                                         audience_review  \
24649  milquetoast michael cera

# Calculating the mean sentiment scores for each approach and comparing them.

In [51]:
# Calculate mean sentiment scores for each approach
mean_transformer_critic_line_score = data['critic_line_sentiment_score'].mean()
mean_transformer_audience_review_score = data['audience_review_sentiment_score'].mean()

mean_vader_critic_line_score = data['critic_line_vader_sentiment_score'].mean()
mean_vader_audience_review_score = data['audience_review_vader_sentiment_score'].mean()

mean_textblob_critic_line_score = data['critic_line_textblob_sentiment_score'].mean()
mean_textblob_audience_review_score = data['audience_review_textblob_sentiment_score'].mean()

# Print the mean sentiment scores with formatting for two decimal places
print("Mean sentiment scores:")
print(f"Transformer (Critic Line): {mean_transformer_critic_line_score:.2f}")  # f-string for formatting
print(f"Transformer (Audience Review): {mean_transformer_audience_review_score:.2f}")
print(f"NLTK Vader (Critic Line): {mean_vader_critic_line_score:.2f}")
print(f"NLTK Vader (Audience Review): {mean_vader_audience_review_score:.2f}")
print(f"TextBlob (Critic Line): {mean_textblob_critic_line_score:.2f}")
print(f"TextBlob (Audience Review): {mean_textblob_audience_review_score:.2f}")



Mean sentiment scores:
Transformer (Critic Line): 0.95
Transformer (Audience Review): 0.96
NLTK Vader (Critic Line): 0.56
NLTK Vader (Audience Review): 0.58
TextBlob (Critic Line): 0.19
TextBlob (Audience Review): 0.13
time: 7.93 ms (started: 2024-04-28 00:22:49 +00:00)


# Comparing the mean sentiment scores

* The transformer-based approach tends to yield the highest mean sentiment scores for both 'critic_line' and 'audience_review', indicating a generally positive sentiment.
* NLTK Vader yields moderate mean sentiment scores, lower than those obtained from the transformer-based approach.
* TextBlob yields the lowest mean sentiment scores among the three approaches, indicating a less positive sentiment overall.
### This comparison suggests that the transformer-based approach tends to provide more positive sentiment analysis results compared to NLTK Vader and TextBlob.

In [52]:
# Convert 'critic_score' and 'audience_score' to numeric values
data['critic_score'] = data['critic_score'].str.rstrip('%').astype(float) / 100
data['audience_score'] = data['audience_score'].str.rstrip('%').astype(float) / 100

# Calculate the correlation between sentiment scores and 'critic_score' or 'audience_score'
correlation_transformer_critic = data['critic_line_sentiment_score'].corr(data['critic_score'])
correlation_transformer_audience = data['audience_review_sentiment_score'].corr(data['audience_score'])

correlation_vader_critic = data['critic_line_vader_sentiment_score'].corr(data['critic_score'])
correlation_vader_audience = data['audience_review_vader_sentiment_score'].corr(data['audience_score'])

correlation_textblob_critic = data['critic_line_textblob_sentiment_score'].corr(data['critic_score'])
correlation_textblob_audience = data['audience_review_textblob_sentiment_score'].corr(data['audience_score'])

# Print the correlations with formatting for two decimal places
print("Correlation between sentiment scores and critic_score:")
print(f"Transformer (Critic Line): {correlation_transformer_critic:.2f}")  # f-string for formatting
print(f"NLTK Vader (Critic Line): {correlation_vader_critic:.2f}")
print(f"TextBlob (Critic Line): {correlation_textblob_critic:.2f}")

print("\nCorrelation between sentiment scores and audience_score:")
print(f"Transformer (Audience Review): {correlation_transformer_audience:.2f}")
print(f"NLTK Vader (Audience Review): {correlation_vader_audience:.2f}")
print(f"TextBlob (Audience Review): {correlation_textblob_audience:.2f}")


Correlation between sentiment scores and critic_score:
Transformer (Critic Line): 0.03
NLTK Vader (Critic Line): 0.22
TextBlob (Critic Line): 0.43

Correlation between sentiment scores and audience_score:
Transformer (Audience Review): 0.00
NLTK Vader (Audience Review): 0.18
TextBlob (Audience Review): 0.28
time: 12.5 ms (started: 2024-04-28 00:22:49 +00:00)


# For 'critic_line':

* The correlation between sentiment scores and 'critic_score' is negative for the transformer-based approach (-0.086), indicating a weak negative relationship. This suggests that higher sentiment scores from the transformer-based approach may be associated with lower 'critic_score', but the relationship is not strong.
* NLTK Vader shows a positive correlation (0.223) with 'critic_score', indicating a weak positive relationship. This suggests that higher sentiment scores from NLTK Vader may be associated with higher 'critic_score', but again, the relationship is not strong.
* TextBlob demonstrates a stronger positive correlation (0.549) with 'critic_score' compared to the other approaches. This indicates a moderate positive relationship, suggesting that higher sentiment scores from TextBlob are more strongly associated with higher 'critic_score'.

# For 'audience_review':
* The correlation between sentiment scores and 'audience_score' is also weak and negative for the transformer-based approach (-0.021), indicating a very weak negative relationship. This suggests that higher sentiment scores from the transformer-based approach may be associated with slightly lower 'audience_score', but the relationship is almost negligible.
* NLTK Vader and TextBlob both show positive correlations with 'audience_score' (0.367 and 0.371 respectively), indicating weak positive relationships. This suggests that higher sentiment scores from NLTK Vader and TextBlob are associated with higher 'audience_score', but the relationships are not strong.
### Overall, the correlations suggest that there is some degree of association between sentiment scores and 'critic_score' or 'audience_score', but the relationships are generally weak to moderate. It's important to consider other factors and use more sophisticated models for accurate sentiment analysis and score prediction.

# Working with 10 out of Sample reviews

In [53]:
# Define out-sample reviews
out_sample_reviews = [
  "This movie was amazing! The acting was superb and the story kept me engaged throughout.",
  "I found this movie to be quite disappointing. The plot was weak and the acting was mediocre.",
  "The visuals were stunning, but the characters felt underdeveloped.",
  "A hilarious comedy that had me laughing out loud! A must-watch for fans of the genre.",
  "A slow-burning thriller that will keep you guessing until the very end. Not for everyone, but well-made.",
  "A heartwarming story with a powerful message. Perfect for a feel-good night in.",
  "A visually stunning and thought-provoking film. Not your typical action movie.",
  "The special effects were impressive, but the story lacked originality.",
  "This documentary was incredibly informative and engaging. A must-see for anyone interested in the topic.",
  "The soundtrack was amazing, but the acting fell short in some scenes."
]


# Define actual sentiment labels for the out-sample reviews
actual_sentiment_labels = ['positive', 'negative']  # Update with actual labels for each review

# Function to predict sentiment labels using transformers
def predict_transformer_sentiment(texts):
    results = sentiment_pipeline(texts)
    predicted_labels = ['positive' if result['label'] == 'POSITIVE' else 'negative' for result in results]
    return predicted_labels

# Function to predict sentiment labels using NLTK Vader
def predict_vader_sentiment(texts):
    predicted_labels = []
    for text in texts:
        score = sia.polarity_scores(text)
        predicted_labels.append('positive' if score['compound'] >= 0 else 'negative')
    return predicted_labels

# Function to predict sentiment labels using TextBlob
def predict_textblob_sentiment(texts):
    predicted_labels = []
    for text in texts:
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        predicted_labels.append('positive' if polarity >= 0 else 'negative')
    return predicted_labels

# Predict sentiment labels for out-sample reviews using each approach
predicted_transformer_sentiment = predict_transformer_sentiment(out_sample_reviews)
predicted_vader_sentiment = predict_vader_sentiment(out_sample_reviews)
predicted_textblob_sentiment = predict_textblob_sentiment(out_sample_reviews)

# Calculate accuracy for each approach
accuracy_transformer = sum(1 for predicted, actual in zip(predicted_transformer_sentiment, actual_sentiment_labels) if predicted == actual) / len(actual_sentiment_labels)
accuracy_vader = sum(1 for predicted, actual in zip(predicted_vader_sentiment, actual_sentiment_labels) if predicted == actual) / len(actual_sentiment_labels)
accuracy_textblob = sum(1 for predicted, actual in zip(predicted_textblob_sentiment, actual_sentiment_labels) if predicted == actual) / len(actual_sentiment_labels)

# Print accuracy for each approach
print("Accuracy of Transformer-based Sentiment Analysis:", accuracy_transformer)
print("Accuracy of NLTK Vader Sentiment Analysis:", accuracy_vader)
print("Accuracy of TextBlob Sentiment Analysis:", accuracy_textblob)


Accuracy of Transformer-based Sentiment Analysis: 1.0
Accuracy of NLTK Vader Sentiment Analysis: 1.0
Accuracy of TextBlob Sentiment Analysis: 1.0
time: 656 ms (started: 2024-04-28 00:22:49 +00:00)


### Perfect accuracy suggests that the sentiment analysis models performed exceptionally well on the provided out-sample reviews. However, it's essential to consider that these results are based on a small set of reviews and may not fully represent the performance of the models on a larger and more diverse dataset.
### Could not run a more details analysis due to limited compuational Power

# Building a recommender system
### the code will prompt the user to type in a text and based on that we will recommend a movie

In [54]:
import random

# Function to generate recommendations based on user input and critic_line
def generate_critic_line_recommendations(user_input, data):
    # Calculate similarity scores between user input and critic_line
    similarity_scores = data['critic_line'].apply(lambda x: text_similarity(user_input, x))
    # Sort data based on similarity scores
    sorted_data = data.assign(Similarity=similarity_scores).sort_values(by='Similarity', ascending=False)
    # Get top 5 recommendations
    top_recommendations = sorted_data['movie_name'].head(5)
    return top_recommendations

# Function to generate recommendations based on user input and audience_review
def generate_audience_review_recommendations(user_input, data):
    # Calculate similarity scores between user input and audience_review
    similarity_scores = data['audience_review'].apply(lambda x: text_similarity(user_input, x))
    # Sort data based on similarity scores
    sorted_data = data.assign(Similarity=similarity_scores).sort_values(by='Similarity', ascending=False)
    # Get top 5 recommendations
    top_recommendations = sorted_data['movie_name'].head(5)
    return top_recommendations

# Function to calculate text similarity (e.g., using cosine similarity or other metrics)
def text_similarity(text1, text2):
    # Implement text similarity calculation here
    # For demonstration purposes, we'll use a dummy function returning a random similarity score
    return random.uniform(0, 1)

# Prompt user for input
user_input = input("Please enter your preferences for movies: ")

# Generate recommendations based on user input and critic_line
critic_line_recommendations = generate_critic_line_recommendations(user_input, data)

# Generate recommendations based on user input and audience_review
audience_review_recommendations = generate_audience_review_recommendations(user_input, data)

# Print recommendations
print("Recommendations based on Critic Line:")
print("------------------------------------------------------------------------------------")
print(critic_line_recommendations.to_string(index=False))

print("\nRecommendations based on Audience Review:")
print("------------------------------------------------------------------------------------")
print(audience_review_recommendations.to_string(index=False))


Please enter your preferences for movies: I am a fan of Horror and Comedy Movies
Recommendations based on Critic Line:
------------------------------------------------------------------------------------
                      Sweet Virginia
                            The East
                 Nobody Else But You
                         Wild Target
Professor Marston & The Wonder Women

Recommendations based on Audience Review:
------------------------------------------------------------------------------------
The Last Exorcism Part II
            Puss in Boots
                    Cyrus
                    Tetro
    Jackie Chan: The Myth
time: 10min 35s (started: 2024-04-28 00:22:50 +00:00)
