In [72]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import langdetect

In [73]:
# Input location: both CSV files live in the local "archive" directory.
# rt_cr.csv holds the critic reviews, rt_m.csv holds the movies.
PATH = Path("archive")
CRITIC_REVIEWS = PATH.joinpath("rt_cr.csv")
MOVIES = PATH.joinpath("rt_m.csv")

# Load each file into its own pandas DataFrame.
critic_reviews, movies = (
    pd.read_csv(csv_file) for csv_file in (CRITIC_REVIEWS, MOVIES)
)

# Preview the first rows of both tables.
print("Critic reviews:")
display(critic_reviews.head())
print("Movies:")
display(movies.head())

Critic reviews:


Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


Movies:


Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [74]:
# From the critic reviews we only need three columns: the review type
# (fresh or rotten), the review score and the review text.
REVIEW_COLUMNS = ["review_type", "review_score", "review_content"]
reviews_df = critic_reviews.loc[:, REVIEW_COLUMNS]
display(reviews_df.head())

Unnamed: 0,review_type,review_score,review_content
0,Fresh,,A fantasy adventure that fuses Greek mythology...
1,Fresh,,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,Fresh,,With a top-notch cast and dazzling special eff...
3,Fresh,3.5/5,Whether audiences will get behind The Lightnin...
4,Rotten,,What's really lacking in The Lightning Thief i...


In [75]:
# Total number of reviews in the selection.
n_reviews = len(reviews_df.index)
print("Number of reviews:", n_reviews)

Number of reviews: 1130017


In [76]:
# Start with the review type: how are the reviews split between fresh and rotten?
review_type_column = reviews_df["review_type"]

# Absolute number of reviews per review type.
review_type_counts = review_type_column.value_counts()
print("Review type counts:")
display(review_type_counts)

# Same breakdown expressed as fractions of the total (normalize=True).
review_type_percents = review_type_column.value_counts(normalize=True)
print("Review type percents:")
display(review_type_percents)

Review type counts:


Fresh     720210
Rotten    409807
Name: review_type, dtype: int64

Review type percents:


Fresh     0.637344
Rotten    0.362656
Name: review_type, dtype: float64

In [77]:
# What is left if we only keep the reviews that have a non-null review score?
has_score = reviews_df["review_score"].notna()
reviews_df = reviews_df.loc[has_score]
print("Number of reviews with a non-null review score :", len(reviews_df))

Number of reviews with a non-null review score : 824081


In [78]:
# That is still plenty of data, so we continue with this subset.
# Ideally we want to work on a more balanced dataset, so first look at how
# the review scores are distributed among the rotten reviews.
is_rotten = reviews_df["review_type"].eq("Rotten")
rotten_reviews = reviews_df.loc[is_rotten]
rotten_reviews_score_counts = rotten_reviews["review_score"].value_counts()
display(rotten_reviews_score_counts)

2/5          59541
2/4          46379
1/5          20675
1.5/4        20424
2.5/4        20180
             ...  
24               1
0/1000           1
5.48/10          1
1.9528/10        1
5.046/10         1
Name: review_score, Length: 425, dtype: int64

In [79]:
# The scores are not all on the same scale: some use 5 as the best mark,
# others use 4. Go back to the full scored set and check which of the two
# scales is the more common one.
score_strings = reviews_df["review_score"]
reviews_over_five_df = reviews_df.loc[score_strings.str.endswith("/5")]
reviews_over_four_df = reviews_df.loc[score_strings.str.endswith("/4")]
print("Number of reviews over 5:", len(reviews_over_five_df))
print("Number of reviews over 4:", len(reviews_over_four_df))

Number of reviews over 5: 357067
Number of reviews over 4: 258454


In [80]:
# The "/5" scale is the most common and still leaves plenty of reviews,
# so from here on we work only with those.
reviews_df = reviews_over_five_df

# Look again at the score distribution of the rotten reviews.
rotten_mask = reviews_df["review_type"].eq("Rotten")
rotten_reviews = reviews_df.loc[rotten_mask]
rotten_reviews_score_counts = rotten_reviews["review_score"].value_counts()
display(rotten_reviews_score_counts)

2/5        59541
1/5        20675
2.5/5      18482
1.5/5       8211
3/5         7411
0/5         2448
0.5/5       1456
3.5/5         98
2.25/5        80
2.75/5        54
2.3/5         48
2.4/5         45
2.8/5         40
1.75/5        36
2.2/5         30
1.25/5        26
4/5           26
2.9/5         21
1.8/5         18
1.9/5         14
5/5           13
1.7/5         11
1.3/5         10
1.4/5          9
1.2/5          7
2.6/5          7
4.5/5          7
2.1/5          5
1.6/5          5
2.7/5          4
0.75/5         4
3.2/5          3
0.4/5          3
0.8/5          2
0.3/5          2
1.1/5          2
2.85/5         2
0.13/5         2
0.2/5          2
2.255/5        1
5.5/5          1
1.62/5         1
2.11/5         1
9/5            1
8/5            1
1.24/5         1
7/5            1
0.02/5         1
2.12/5         1
2.83/5         1
0.1/5          1
1.35/5         1
4.7/5          1
Name: review_score, dtype: int64

In [81]:
# Twenty-six rotten reviews carry a 4/5 score... weren't they supposed to be rotten?
# And that is not even the strangest case.
# Out of curiosity, look at the text of the rotten review scored 4.7/5.

scored_4_7 = rotten_reviews["review_score"].eq("4.7/5")
strange_review = rotten_reviews.loc[scored_4_7]
print("Strange review:")
display(strange_review)
# Print the full text of the first (and, per the counts above, only) match.
print(strange_review["review_content"].iloc[0])

Strange review:


Unnamed: 0,review_type,review_score,review_content
1040586,Rotten,4.7/5,Thriller at best qualifies as an interesting a...


Thriller at best qualifies as an interesting attempt at bringing additional perspectives to horror. Given the potential of this particular niche of the horror genre, that also makes it quite the wasted opportunity.


In [82]:
# It is still unclear, at least to me, why that review would get such a high score...

# In any case, for the rotten reviews it is probably better to keep only those
# that also carry a "rotten" score: the half-steps from 0 up to 2.5 out of 5.
rotten_scores = [
    "0/5",
    "0.5/5",
    "1/5",
    "1.5/5",
    "2/5",
    "2.5/5",
]
has_rotten_score = rotten_reviews["review_score"].isin(rotten_scores)
rotten_reviews = rotten_reviews.loc[has_rotten_score]
rotten_reviews_score_counts = rotten_reviews["review_score"].value_counts()
print("Number of rotten reviews:", len(rotten_reviews))
display(rotten_reviews_score_counts)

Number of rotten reviews: 110813


2/5      59541
1/5      20675
2.5/5    18482
1.5/5     8211
0/5       2448
0.5/5     1456
Name: review_score, dtype: int64

In [83]:
# Now do the same for the fresh reviews.
fresh_mask = reviews_df["review_type"].eq("Fresh")
fresh_reviews = reviews_df.loc[fresh_mask]
fresh_reviews_score_counts = fresh_reviews["review_score"].value_counts()
display(fresh_reviews_score_counts)

4/5       83633
3/5       82862
3.5/5     30787
5/5       24126
4.5/5     12381
2.5/5      2778
2/5         633
3.75/5      137
3.25/5       95
2.75/5       93
4.25/5       72
3.2/5        72
3.8/5        53
4.75/5       41
3.4/5        37
4.2/5        37
1/5          34
2.7/5        31
3.6/5        29
1.5/5        24
4.8/5        23
3.7/5        23
3.3/5        22
4.3/5        20
2.8/5        19
3.1/5        16
3.9/5        14
4.1/5        13
5.5/5        12
2.6/5        12
4.7/5         9
4.4/5         9
2.9/5         7
4.6/5         6
2.3/5         4
3.65/5        3
2.4/5         3
3.35/5        3
2.95/5        2
3.15/5        2
2.50/5        1
2.55/5        1
4.95/5        1
2.89/5        1
4.65/5        1
3.45/5        1
8.5/5         1
9/5           1
9.5/5         1
3.76/5        1
6/5           1
0.5/5         1
3.54/5        1
45/5          1
2.25/5        1
4.35/5        1
Name: review_score, dtype: int64

In [84]:
# Again out of curiosity: what does a fresh review with a 1/5 score look like?
scored_1 = fresh_reviews["review_score"].eq("1/5")
strange_review = fresh_reviews.loc[scored_1]
print("Strange review:")
# Show the text of the first matching review only.
print(strange_review["review_content"].iloc[0])

Strange review:
Chipwrecked is the sort of Sunday afternoon trifle that will mollify children and mortify their parents.


In [85]:
# You call this a fresh review??
# Keep only the fresh reviews whose score is one of the half-steps from 3/5 to 5/5.
fresh_scores = [
    "3/5",
    "3.5/5",
    "4/5",
    "4.5/5",
    "5/5",
]
has_fresh_score = fresh_reviews["review_score"].isin(fresh_scores)
fresh_reviews = fresh_reviews.loc[has_fresh_score]
print("Number of fresh reviews:", len(fresh_reviews))
fresh_reviews_score_counts = fresh_reviews["review_score"].value_counts()
display(fresh_reviews_score_counts)

Number of fresh reviews: 233789


4/5      83633
3/5      82862
3.5/5    30787
5/5      24126
4.5/5    12381
Name: review_score, dtype: int64

In [86]:
# Stack the filtered rotten and fresh reviews into a single frame
# (rotten rows first, then fresh rows; original index labels are kept).
reviews_df = pd.concat(
    [
        rotten_reviews,
        fresh_reviews,
    ]
)
print("Number of reviews:", len(reviews_df))

# Breakdown of the combined frame by review score and by review type.
reviews_score_counts = reviews_df["review_score"].value_counts()
review_type_counts = reviews_df["review_type"].value_counts()
display(reviews_score_counts)
display(review_type_counts)

Number of reviews: 344602


4/5      83633
3/5      82862
2/5      59541
3.5/5    30787
5/5      24126
1/5      20675
2.5/5    18482
4.5/5    12381
1.5/5     8211
0/5       2448
0.5/5     1456
Name: review_score, dtype: int64

Fresh     233789
Rotten    110813
Name: review_type, dtype: int64

In [87]:
# Given how the data was subset, scores from 0/5 to 2.5/5 should belong only to
# rotten reviews and scores from 3/5 to 5/5 only to fresh reviews.
# Check that this holds, reusing the `rotten_scores` / `fresh_scores` lists
# defined earlier instead of repeating the literals (so the check cannot drift
# from the filters it is supposed to validate).
has_rotten_score = reviews_df["review_score"].isin(rotten_scores)
has_fresh_score = reviews_df["review_score"].isin(fresh_scores)
is_rotten_type = reviews_df["review_type"] == "Rotten"
is_fresh_type = reviews_df["review_type"] == "Fresh"

# First for the rotten reviews.
check_rotten_reviews = reviews_df[has_rotten_score]
true_rotten_reviews = reviews_df[is_rotten_type]
print("Number of rotten scores:", len(check_rotten_reviews))
print("Number of rotten reviews:", len(true_rotten_reviews))
# And then for the fresh reviews.
check_fresh_reviews = reviews_df[has_fresh_score]
true_fresh_reviews = reviews_df[is_fresh_type]
print("Number of fresh scores:", len(check_fresh_reviews))
print("Number of fresh reviews:", len(true_fresh_reviews))

# Equal counts alone do not prove that the same rows are selected, so also
# compare the score-based and type-based masks row by row.
assert (has_rotten_score == is_rotten_type).all(), "rotten score/type mismatch"
assert (has_fresh_score == is_fresh_type).all(), "fresh score/type mismatch"

Number of rotten scores: 110813
Number of rotten reviews: 110813
Number of fresh scores: 233789
Number of fresh reviews: 233789


In [88]:
# Every score kept so far: the rotten scores followed by the fresh scores.
[*rotten_scores, *fresh_scores]

['0/5',
 '0.5/5',
 '1/5',
 '1.5/5',
 '2/5',
 '2.5/5',
 '3/5',
 '3.5/5',
 '4/5',
 '4.5/5',
 '5/5']

In [89]:
# Everything adds up :)
# To get a balanced dataset we draw a random sample of reviews from every score.
# Careful: there are 6 rotten scores but only 5 fresh scores, so we take
# N_SAMPLES reviews per score and 2 * N_SAMPLES for "3/5", which makes the
# fresh total equal to the rotten total.

N_SAMPLES = 1000
SEED = 42  # fixed seed so that re-running the notebook draws the same reviews

# Number of reviews to draw for each score, in rotten-then-fresh order.
samples_per_score = {score: N_SAMPLES for score in rotten_scores + fresh_scores}
samples_per_score["3/5"] = 2 * N_SAMPLES

# Build every per-score sample first and concatenate once, instead of growing
# a DataFrame with pd.concat inside the loop starting from an empty frame.
clean_reviews_df = pd.concat(
    [
        reviews_df[reviews_df["review_score"] == score].sample(
            n_samples, random_state=SEED
        )
        for score, n_samples in samples_per_score.items()
    ]
)

In [90]:
# Verify the sampling: number of reviews per score, then per review type.
clean_reviews_score_counts = clean_reviews_df["review_score"].value_counts()
clean_review_type_counts = clean_reviews_df["review_type"].value_counts()
display(clean_reviews_score_counts)
display(clean_review_type_counts)

3/5      2000
0/5      1000
0.5/5    1000
1/5      1000
1.5/5    1000
2/5      1000
2.5/5    1000
3.5/5    1000
4/5      1000
4.5/5    1000
5/5      1000
Name: review_score, dtype: int64

Rotten    6000
Fresh     6000
Name: review_type, dtype: int64

In [108]:
# Remove rows with missing values and exact duplicate rows, then drop the
# reviews whose text contains "click for full review" (no actual content).
deduplicated_reviews = clean_reviews_df.dropna().drop_duplicates()
is_placeholder = deduplicated_reviews["review_content"].str.contains(
    "click for full review"
)
clean_reviews = deduplicated_reviews[~is_placeholder]
display(clean_reviews.sample(10))

Unnamed: 0,review_type,review_score,review_content
257694,Rotten,0.5/5,Everything else here -- from the gross caricat...
855600,Fresh,3.5/5,Spiderwick hits the high notes when it needs t...
313890,Fresh,4/5,"You can't teach an old dog new tricks, but whe..."
637770,Rotten,2.5/5,Although Rossi hits on a lot of interesting su...
883054,Fresh,5/5,Stoker proves that not only can Park Chan-Wook...
699877,Rotten,2.5/5,It's a fitting summation of Spielberg's career...
464879,Fresh,3/5,"In an era of peak zombie, it's rare to find a ..."
113064,Rotten,0.5/5,A lost opportunity. [Full review in Spanish]
489986,Fresh,5/5,QT's back and damn if the wait wasn't well wor...
278570,Rotten,2.5/5,how does one ever live up to the promise of Na...


In [92]:
# By lucky chance we noticed that some reviews are in other languages;
# better to get rid of them.

# langdetect is non-deterministic by default. Its README recommends fixing the
# seed before the first detection so that results are reproducible across runs.
langdetect.DetectorFactory.seed = 0


def detect_english_language(text):
    """Return True if langdetect classifies ``text`` as English ("en").

    Any exception raised by ``langdetect.detect`` is treated as
    "not English" and results in False.
    """
    try:
        return langdetect.detect(text) == "en"
    except Exception:
        # Best-effort: a detection error means "not English". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate, so
        # a long-running ``apply`` over many reviews can still be interrupted.
        return False

In [94]:
# Keep only the reviews whose text is detected as English.
is_english = clean_reviews["review_content"].apply(detect_english_language)
en_clean_reviews = clean_reviews.loc[is_english]
display(en_clean_reviews)

Unnamed: 0,review_type,review_score,review_content
380178,Rotten,0/5,"A colossal, mega-budget turkey...Gods of Egypt..."
66550,Rotten,0/5,This is a terrible movie with barely concealed...
400455,Rotten,0/5,Nothing but an impersonal product manipulating...
1046130,Rotten,0/5,"At times, it doesn't seem like movies can achi..."
531967,Rotten,0/5,"Although Tim Burton had nothing to do with it,..."
...,...,...,...
19645,Fresh,5/5,It's not hard to see the appeal of Robert Aldr...
554404,Fresh,5/5,Who would've guessed that one of the hottest s...
155736,Fresh,5/5,"The kind of skittish, reckless film-making tha..."
497017,Fresh,5/5,Knives Out is a film for lovers of murder myst...


In [98]:
# Inspect a sample of the reviews that were NOT detected as English:
# everything in clean_reviews whose index label is absent from en_clean_reviews.
english_labels = en_clean_reviews.index
non_en_reviews = clean_reviews.drop(index=english_labels)
display(non_en_reviews.sample(30))

Unnamed: 0,review_type,review_score,review_content
287234,Rotten,0/5,Depressing
1069139,Rotten,2.5/5,Heigl and Dawson-and DiNovi-deserve better. So...
646544,Fresh,3.5/5,Surprisingly enjoyable.
602490,Rotten,0.5/5,Almost entirely garbage.
163174,Rotten,1/5,O culto a este filme prova que absolutamente q...
224500,Rotten,0/5,A spirit-sapping exercise in female degradatio...
694697,Fresh,5/5,A magisterial achievement.
711224,Fresh,3/5,Really funny and well-made.
134453,Rotten,0/5,"No, he does not."
1062921,Fresh,3.5/5,Quite enjoyable


In [None]:
# Not what we expected: many short English reviews are wrongly flagged as non-English (too many false positives)... but anyway...

In [101]:
# Let's make one last adjustment before we start with the actual sentiment analysis.
# We'll map the string scores to actual numerical values.
# We'll map the review type to numerical values as well: 0 for rotten and 1 for fresh.

def map_score_to_number(score):
    """Translate an "x/5" score string into its numeric value.

    Only the eleven half-step scores from "0/5" to "5/5" are recognised.
    Whole scores map to ints, half scores to floats; any other value
    yields None.
    """
    known_scores = {
        "0/5": 0,
        "0.5/5": 0.5,
        "1/5": 1,
        "1.5/5": 1.5,
        "2/5": 2,
        "2.5/5": 2.5,
        "3/5": 3,
        "3.5/5": 3.5,
        "4/5": 4,
        "4.5/5": 4.5,
        "5/5": 5,
    }
    # .get() returns None for anything that is not in the table.
    return known_scores.get(score)
    
def map_review_type_to_number(review_type):
    """Encode a review type as a number: "Rotten" -> 0, "Fresh" -> 1.

    Any other value yields None.
    """
    if review_type == "Fresh":
        return 1
    if review_type == "Rotten":
        return 0
    return None

In [102]:
# Replace the string score and the review type with their numeric encodings.
# Column order is unchanged because both columns already exist.
fin_en_clean_reviews = en_clean_reviews.assign(
    review_score=lambda frame: frame["review_score"].apply(map_score_to_number),
    review_type=lambda frame: frame["review_type"].apply(map_review_type_to_number),
)
display(fin_en_clean_reviews.sample(20))

Unnamed: 0,review_type,review_score,review_content
793034,1,4.5,An eye-opening and gripping thriller that pose...
196095,1,3.0,Okay remake. Might have been better is screenw...
305091,0,1.0,A laughable disaster.
1087204,1,3.0,"Ari Folman, who wrote and directed Waltz with ..."
965571,0,2.5,Rupert Everett's mostly handsome movie never q...
374819,1,4.0,"""There never was a woman like Gilda!"" drooled ..."
237790,1,3.0,Child 44 grows in stature and suspense as it g...
28411,0,0.0,Ugh! Ick! Gag me with a spoon! This film is an...
636528,0,1.0,A ludricrous retread of the same old Fatal Att...
1007693,1,3.5,The Raid 2 doesn't so much raise the bar for a...


In [103]:
# Persist the results obtained so far as a CSV file; the index is not written.
fin_en_clean_reviews.to_csv(path_or_buf="clean_reviews.csv", index=False)