# Introduction

## Goals
* Make goals list

# Load In Cleaned DataFrame

In [351]:
# Import dependencies
import pandas as pd
import time
import datetime as dt
import string # for parsing non-english titles
import numpy as np

In [352]:
start_load_time = time.time()

#clean_reviews_df = pd.read_csv("imdbReviewsClean.csv", sep="\t")
clean_reviews_df = pd.read_csv("imdbReviewsSmall.csv", sep="\t")
print(f"Load complete, total time {time.time() - start_load_time:.2f} seconds\n")

reviews_count_all = len(clean_reviews_df)
print(f"\nTotal reviews in working data: {reviews_count_all}")

clean_reviews_df.head()

Load complete, total time 0.75 seconds


Total reviews in working data: 68881


Unnamed: 0,movie,rating,review_summary,review_date,spoiler_tag,review_detail,year,title
0,The Droving (2020),2,An honest review,2020-05-03,False,Here's the truth. There's not much to this mov...,2020,The Droving
1,All About Eve (1950),10,Amazing,2020-05-03,False,Having seen this film for the first time today...,1950,All About Eve
2,Runaway Train (1985),7,Impressive action scenes!,2020-05-03,False,The movie had some very impressive scenes. Esp...,1985,Runaway Train
3,The Half of It (I) (2020),4,Needed the other half of the movie to cover up...,2020-05-03,False,I see that Netflix has a teenage/kids audience...,2020,The Half of It (I)
4,Closure (I) (2018),9,Fun and intriguing,2020-05-03,False,This is a fun and intriguing mystery. The acti...,2018,Closure (I)


In [353]:
# Expect we only need to fix review_date
print(clean_reviews_df.dtypes, "\n")
clean_reviews_df["review_date"] = pd.to_datetime(clean_reviews_df["review_date"])
print(clean_reviews_df.dtypes)

movie             object
rating             int64
review_summary    object
review_date       object
spoiler_tag         bool
review_detail     object
year               int64
title             object
dtype: object 

movie                     object
rating                     int64
review_summary            object
review_date       datetime64[ns]
spoiler_tag                 bool
review_detail             object
year                       int64
title                     object
dtype: object


In [354]:
# Which movies have the most reviews?
movie_review_value_counts = clean_reviews_df["title"].value_counts()
unique_movie_count = len(movie_review_value_counts)
print(f"{unique_movie_count:,} unique movies\n")
most_reviewed_movie = movie_review_value_counts.head(1).index[0]
most_reviewed_movie_count = movie_review_value_counts.head(1)[0]
print(f"Most reviewed movie in working data: {most_reviewed_movie} with {most_reviewed_movie_count:,} reviews.\n")
print(movie_review_value_counts.head(15))

24,921 unique movies

Most reviewed movie in working data: Mrs. Serial Killer with 957 reviews.

Mrs. Serial Killer               957
Mulan                            656
Thappad (I)                      487
Deadpool                         475
Tenet                            431
小丑                               355
The Day the Earth Stood Still    317
The Invisible Man (I)            212
星際救援                             198
I Am Not Okay with This          193
The Shawshank Redemption         176
I'm Thinking of Ending Things    165
Parasite                         163
The Dark Knight                  160
Twilight (I)                     156
Name: title, dtype: int64


It is interesting that some of the most-reviewed titles are for non-english films. Eventually we'll want to dive into the text of the actual reviews themselves, so to prepare for that step we can filter out reviews for movies with foreign (non-ascii) letters in their title, under the assumption that most reviews for these records will be written in the same language as the title.

# Classifying Reviews by Foreign Characters in Movie Title

In [355]:
# How many titles have non-english characters?
# Can use .isascii() to detect these, like below:
badtext = "星際救援"
goodtext = "The Invisible Man (I) "
print(f"{badtext} is only english characters: {badtext.isascii()}")
print(f"{goodtext} is only english characters: {goodtext.isascii()}")

星際救援 is only english characters: False
The Invisible Man (I)  is only english characters: True


In [356]:
# Split english and foreign movie titles using isascii() 
unique_movies_list = list(movie_review_value_counts.index.values)
english_titles = []
foreign_titles = []

# Create a list of integers up to the length of unique_movies_list, and iterate through that to hit every item in list
for title in np.arange(0, len(unique_movies_list)):
    if unique_movies_list[title].isascii():
        english_titles.append(unique_movies_list[title])
    else:
        foreign_titles.append(unique_movies_list[title])

In [357]:
# Print summary stats for the new lists
english_movie_count = len(english_titles)
foreign_movie_count = len(foreign_titles)
english_movie_pct = english_movie_count / unique_movie_count * 100
foreign_movie_pct = foreign_movie_count / unique_movie_count * 100
print(f"English Movie Title Count: {english_movie_count:,} ({english_movie_pct:.2f}%)")
print(f"Foreign Movie Title Count: {foreign_movie_count:,} ({foreign_movie_pct:.2f}%)")

english_movie_review_count = len(clean_reviews_df[clean_reviews_df["title"].isin(english_titles)])
foreign_movie_review_count = len(clean_reviews_df[clean_reviews_df["title"].isin(foreign_titles)])
english_movie_review_pct = english_movie_review_count / reviews_count_all * 100
foreign_movie_review_pct = foreign_movie_review_count / reviews_count_all * 100
print(f"\nReviews on English Movie Titles: {english_movie_review_count:,} ({english_movie_review_pct:.2f})%")
print(f"Reviews on Foreign Movie Titles: {foreign_movie_review_count:,} ({foreign_movie_review_pct:.2f})%")

English Movie Title Count: 23,294 (93.47%)
Foreign Movie Title Count: 1,627 (6.53%)

Reviews on English Movie Titles: 64,719 (93.96)%
Reviews on Foreign Movie Titles: 4,162 (6.04)%


In [358]:
# Check foreign titles to see if that worked as intended
list(foreign_titles)[0:15]

['小丑',
 '星際救援',
 '愛爾蘭人',
 '婚姻故事',
 '吸血鬼戰爭',
 '藍波：最後一滴血',
 '仲夏魘',
 '鋒迴路轉',
 '安眠醫生',
 'WALL·E',
 '野蠻遊戲：全面晉級',
 '從前，有個好萊塢',
 '弒婚遊戲 (I)',
 '守護者',
 '冰雪奇緣2']

In [359]:
# Some of the rows we filtered have some english (ascii) characters
# Pull foreign titles that have at least 2 ascii characters
def hasSomeAscii(inText):
    english_chars = 0
    for character in np.arange(0, len(inText)):
        if inText[character].isspace():
            continue
        if inText[character].isascii():
            english_chars+=1
    return english_chars > 1

In [360]:
# Extract records with some ascii characters in title from unique foreign film title list
foreign_movies_with_ascii = []
for title in np.arange(0, len(foreign_titles)):
    if hasSomeAscii(foreign_titles[title]):
        foreign_movies_with_ascii.append(foreign_titles[title])

foreign_movies_with_ascii_count = len(foreign_movies_with_ascii)
foreign_movies_with_ascii_pct = foreign_movies_with_ascii_count / foreign_movie_count * 100
print(f"Found {foreign_movies_with_ascii_count:,} records with partial ASCII among {foreign_movie_count:,} foreign titles ({foreign_movie_review_pct:.2f}%)")

Found 628 records with partial ASCII among 1,627 foreign titles (6.04%)


In [361]:
# Check names of "foreign movies" with 2 or more ascii (english) characters in title
foreign_movies_with_ascii[0:20]

['WALL·E',
 '弒婚遊戲 (I)',
 'Pokémon: Mewtwo Strikes Back - Evolution',
 'Léon: The Professional',
 'Les Misérables',
 '茱蒂 (II)',
 '我們 (II)',
 '刺激1995',
 'Amélie',
 'Irréversible',
 '安娜 (II)',
 'Capharnaüm',
 '鱷魔 (I)',
 'A+瞎妹',
 'Cinayet Süsü',
 '搞鬼 (II)',
 'Erufen rîto',
 '靠譜歌王 (III)',
 'Aliens: A Redenção',
 '國王 (I)']

This indicates an issue with how we distinguished between English and Foreign titles in our earlier step. Even just in the first handful, we have three English movies:
``` 'WALL·E' 'Pokémon: Mewtwo Strikes Back - Evolution' 'Léon: The Professional' ``` that belong in our English Titles group. Previously, we sorted a title into the "Foreign" bucket if it had ANY non-ascii characters. If we increase that limit to 2 non-ascii chars, we would have the opposite issue with ``` '茱蒂 (II)' ``` being included in the English titles. At 1 ascii character, we would correctly sort all 4 of the titles mentioned.

Below I'll make an inverted version of the hasSomeAscii() that will return True for text that contains 0 or 1 non-ascii character. Then, we can re-filter the unique titles using that function and get more accurate results. 


In [362]:
def isMostlyAscii(inText):
    non_english_chars = 0
    for character in np.arange(0, len(inText)):
        if inText[character].isspace():
            continue
        if not inText[character].isascii():
            non_english_chars+=1
    return non_english_chars < 2

In [363]:
# Redo sorting movie titles into english and foreign lists using isMostlyAscii()
english_titles_revised = []
foreign_titles_revised = []

# Create a list of integers up to the length of unique_movies_list, and iterate through that to hit every item in list
for title in np.arange(0, len(unique_movies_list)):
    if isMostlyAscii(unique_movies_list[title]):
        english_titles_revised.append(unique_movies_list[title])
    else:
        foreign_titles_revised.append(unique_movies_list[title])

In [364]:
# Run the same summary stats as before, expecting a lower foreign movie % 
english_movie_count = len(english_titles_revised)
foreign_movie_count = len(foreign_titles_revised)
english_movie_pct = english_movie_count / unique_movie_count * 100
foreign_movie_pct = foreign_movie_count / unique_movie_count * 100
print(f"English Movie Title Count: {english_movie_count:,} ({english_movie_pct:.2f}%)")
print(f"Foreign Movie Title Count: {foreign_movie_count:,} ({foreign_movie_pct:.2f}%)")

english_movie_review_count = len(clean_reviews_df[clean_reviews_df["title"].isin(english_titles_revised)])
foreign_movie_review_count = len(clean_reviews_df[clean_reviews_df["title"].isin(foreign_titles_revised)])
english_movie_review_pct = english_movie_review_count / reviews_count_all * 100
foreign_movie_review_pct = foreign_movie_review_count / reviews_count_all * 100
print(f"\nReviews on English Movie Titles: {english_movie_review_count:,} ({english_movie_review_pct:.2f})%")
print(f"Reviews on Foreign Movie Titles: {foreign_movie_review_count:,} ({foreign_movie_review_pct:.2f})%")

English Movie Title Count: 23,676 (95.00%)
Foreign Movie Title Count: 1,245 (5.00%)

Reviews on English Movie Titles: 65,333 (94.85)%
Reviews on Foreign Movie Titles: 3,548 (5.15)%


In [365]:
# That looks about right, let's peek at the foreign titles to confirm
list(foreign_titles_revised)[0:15]

['小丑',
 '星際救援',
 '愛爾蘭人',
 '婚姻故事',
 '吸血鬼戰爭',
 '藍波：最後一滴血',
 '仲夏魘',
 '鋒迴路轉',
 '安眠醫生',
 '野蠻遊戲：全面晉級',
 '從前，有個好萊塢',
 '弒婚遊戲 (I)',
 '守護者',
 '冰雪奇緣2',
 '舞孃騙很大']

Definitely an improvement, but the presence of ```'STAR WARS：天行者的崛起'``` suggests a larger problem with the data. There may be multiple unique "title"s that represent the same movie in different languages. That means we can't treat Title as the unique ID of each movie - instead, we have to use it as a key representing both Movie Title and Language. More practically, we'll have to filter out the foreign titles so that we can accurately summarize reviews on the English langauge version of all movies in the data.

In [366]:
# Filter out foreign movies by only taking titles in our english_titles_revised list
english_movies_df = clean_reviews_df[clean_reviews_df["title"].isin(english_titles_revised)]
english_movies_reviews_count = len(english_movies_df)
english_title_counts = english_movies_df["title"].value_counts()
english_unique_title_count = len(english_title_counts)
english_title_pct = english_unique_title_count / unique_movie_count * 100

print(f"Most reviewed English-title movie: {english_review_title_counts.index[0]}\n")
print(english_title_counts.head(10),"\n")
print(f"Out of {unique_movie_count:,} unique titles {english_unique_title_count:,} contain less than 2 non-ascii characters ({english_title_pct:.2f}%)")

Most reviewed English-title movie: Avengers: Endgame

Mrs. Serial Killer               957
Mulan                            656
Thappad (I)                      487
Deadpool                         475
Tenet                            431
The Day the Earth Stood Still    317
The Invisible Man (I)            212
I Am Not Okay with This          193
The Shawshank Redemption         176
I'm Thinking of Ending Things    165
Name: title, dtype: int64 

Out of 24,921 unique titles 23,676 contain less than 2 non-ascii characters (95.00%)


# Review Count per Movie

In [367]:
# How concentrated is review activity?
print("Summary Stats: Count of Reviews per Unique Movie")
print(english_title_counts.describe()[1:]) # "count" below refers to total unique movies 

Summary Stats: Count of Reviews per Unique Movie
mean      2.759461
std      11.543721
min       1.000000
25%       1.000000
50%       1.000000
75%       2.000000
max     957.000000
Name: title, dtype: float64


In [368]:
# What % of movies recieves more than the avg. amount of reviews?
third_qt_reviews = english_title_counts.describe()["75%"]
top_qt_reviews_count = len(english_title_counts[english_title_counts > third_qt_reviews])
print(f"Third quartile total reviews on unique movie: {third_qt_reviews:.0f}")
print(f"Movies with above third quartile review count: {top_qt_reviews_count:,} / {english_unique_title_count:,} = {top_qt_reviews_count / english_unique_title_count * 100:.2f}%")

Third quartile total reviews on unique movie: 2
Movies with above third quartile review count: 4,819 / 23,676 = 20.35%


In [374]:
# Split data into popular and unpopular subsets based on total reviews for reviewed movie
popular_movie_titles = list(english_review_title_counts[english_review_title_counts > third_qt_reviews].index)
unpopular_movie_titles = list(english_review_title_counts[english_review_title_counts <= third_qt_reviews].index)
popular_movie_reviews = english_movies_df[english_movies_df["title"].isin(popular_movie_titles)]
unpopular_movie_reviews = english_movies_df[english_movies_df["title"].isin(unpopular_movie_titles)]
popular_movie_review_count = len(popular_movie_reviews)
unpopular_movie_review_count = len(unpopular_movie_reviews)

print(f"Total reviews for english-titled movies: {english_movies_reviews_count:,}")
print(f"Reviews on popular movies with more than the third-quartile amount of reviews (> {third_qt_reviews:.0f} reviews): {popular_movie_review_count:,}")
print(f"Reviews on unpopular movies with less than or exactly the third-quartile amount of reviews (<= {third_qt_reviews:.0f} reviews): {unpopular_movie_review_count:,}")
print(f"Reviews on popular movies make up {popular_movie_review_count / english_movies_reviews_count * 100:.2f}% of all reviews")

Total reviews for english-titled movies: 65,333
Reviews on popular movies with more than the third-quartile amount of reviews (> 2 reviews): 62,942
Reviews on unpopular movies with less than or exactly the third-quartile amount of reviews (<= 2 reviews): 2,391
Reviews on popular movies make up 96.34% of all reviews


# Correlating Review Date And Avg. Rating

In [370]:
# Review date EDA
# Extract date pieces from review_date
rating_date_df = english_movies_df
rating_date_df["month"] = pd.DatetimeIndex(rating_date_df["review_date"]).month
rating_date_df["day"] = pd.DatetimeIndex(rating_date_df["review_date"]).day
rating_date_df["weekday"] = rating_date_df["review_date"].dt.dayofweek # By Default 0 is Monday, 6 is Sunday
rating_date_df.head()

Unnamed: 0,movie,rating,review_summary,review_date,spoiler_tag,review_detail,year,title,month,day,weekday
0,The Droving (2020),2,An honest review,2020-05-03,False,Here's the truth. There's not much to this mov...,2020,The Droving,5,3,6
1,All About Eve (1950),10,Amazing,2020-05-03,False,Having seen this film for the first time today...,1950,All About Eve,5,3,6
2,Runaway Train (1985),7,Impressive action scenes!,2020-05-03,False,The movie had some very impressive scenes. Esp...,1985,Runaway Train,5,3,6
3,The Half of It (I) (2020),4,Needed the other half of the movie to cover up...,2020-05-03,False,I see that Netflix has a teenage/kids audience...,2020,The Half of It (I),5,3,6
4,Closure (I) (2018),9,Fun and intriguing,2020-05-03,False,This is a fun and intriguing mystery. The acti...,2018,Closure (I),5,3,6


In [371]:
# Does the average review change by month or day of week?
print("Average review rating by month:")
print(rating_date_df.groupby("month").mean()["rating"])

print("\nAverage review rating by day of week:")
print(rating_date_df.groupby("weekday").mean()["rating"])

Average review rating by month:
month
1     6.948276
2     6.558557
3     6.716285
5     6.846822
7     6.461878
8     6.461572
9     6.503283
10    6.566870
12    6.511498
Name: rating, dtype: float64

Average review rating by day of week:
weekday
0    6.693598
1    6.620650
2    6.549804
3    6.573642
4    6.575663
5    6.419656
6    6.477278
Name: rating, dtype: float64
