# Packages

In [1]:
import pandas as pd
import numpy as np
import requests
from requests import TooManyRedirects
import re
import omdb
import time
from collections import Counter, defaultdict
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
# English stopword list from NLTK; clean_text_data filters tokens against it
sw = stopwords.words("english")

---- 
# Loading the Raw Data from API

In [2]:
# Load the raw movie data from CSV into a DataFrame
movies_info = pd.read_csv(filepath_or_buffer="Raw Movie Data.csv")
# Preview 15 randomly drawn rows
movies_info.sample(n=15)

Unnamed: 0,title,year,rated,released,runtime,genre,director,writer,actors,plot,...,metascore,imdb_rating,imdb_votes,imdb_id,type,dvd,box_office,production,website,response
1055,Impractical Jokers: One Night at the Grand,2016,,01 Sep 2016,,Comedy,Shannon Hartman,,"Aiden, Erica, Joe Gatto, James Murray",,...,,8.3,67.0,tt6107270,movie,,,,,True
2674,On the One Road,2016,,,,Comedy,Jonathan Kesselman,"Mark Doherty, Jonathan Kesselman",,An American efficiency expert and an Irishman ...,...,,,,tt5044850,movie,,,,,True
4932,Please Take One,1910,,15 Jul 1910,,"Short, Comedy",,,,Bonehead is not a success in distributing hand...,...,,,,tt4813618,movie,,,,,True
2997,First One Out,2019,,,24 min,"Short, Drama",David Ngandu,David Ngandu,David Ngandu,A young man unwillingly finds himself in the d...,...,,,,tt10417936,movie,,,,,True
3416,One Way Street,1980,,,12 min,"Short, Animation",Bernard Longpré,,,,...,,,,tt0226248,movie,,,,,True
3287,One for You and One for Me,2010,,21 Mar 2010,6 min,"Short, Drama, Family",Trent,Trent,"Dallas D Munger, Lori Desiderio, Ashley Switzer",A man reconsiders how he treats his family aft...,...,,,,tt1631392,movie,,,,,True
2796,The Naked One,2011,,04 Jul 2011,,"Short, Sci-Fi",Pavel Safonov,"Keith Odett, Isaac Pentland, Pavel Safonov","Jan Lashly, Keith Odett, Diedre Willig",,...,,,,tt2106709,movie,,,,,True
4691,Walter Sickert & The Army of Broken Toys: Dino...,2018,,28 Oct 2018,5 min,"Short, Fantasy, Music","Edrie Edrie, Walter Sickert",,"Edrie Edrie, Blake Girndt, Belle Gunz",,...,,,,tt12150946,movie,,,,,True
4266,Spent a Year There One Knite,2014,,,44 min,"Documentary, Music",Stu Gilbert,Stu Gilbert (story),,Spent A Year There One Knite resurrects the On...,...,,,,tt3691836,movie,,,,,True
1246,"One Cooks, the Other Doesn't",1983,,27 Sep 1983,120 min,"Comedy, Drama, Romance",Richard Michaels,Larry Grusin,"Rosanna Arquette, Allyce Beasley, Joseph Bolog...",,...,,6.0,42.0,tt0086049,movie,,,,,True


------
# Exploratory Data Analysis

1. Examine a five-number summary of the numerical and categorical columns
2. Checking for Missing Data

## 1. Calculating Summary Statistics for Columns

In [3]:
# Add the character count of each plot description as a new column
movies_info = movies_info.assign(plot_length=movies_info['plot'].str.len())

# Summary statistics of the numeric columns, one row per column
movies_info.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,5000.0,2002.897,23.082681,1887.0,2002.0,2011.0,2016.0,2023.0
metascore,115.0,60.026087,18.559747,16.0,47.0,62.0,74.0,93.0
imdb_rating,2442.0,6.591155,1.386458,1.0,5.8,6.7,7.5,10.0
plot_length,3560.0,159.808989,70.963086,16.0,109.0,170.0,208.0,1324.0


**Interpretation**
1. **Year:** The movies pulled from the API span the years 1887 to 2023. The lower bound may seem plausible, but it requires drilling further into the data to confirm that a movie was actually made in 1887 rather than this being a data-entry error. Movies dated 2023 are likely upcoming releases.

2. **Metascore**: The metascore is a weighted average of many reviews coming from reputed critics. The Metacritic team reads the reviews and assigns each a 0–100 score, which is then given a weight, mainly based on the review's quality and source. That means the higher the metascore, the more positive reviews a movie has. In our summary, the metascores in this sample range from a low of 16 to a high of 93. The average metascore is 60, whereas the median is 62. This can be interesting later as we dive into the average metascore over time.

3. **imdb_rating**: IMDb ratings let users rate films on a scale of 1–10. As expected, the values of this variable range from 1 as the lowest to 10 as the highest. The average IMDb rating is 6.6 and the median is 6.7.

4. **plot_length**: This column holds the length of each movie's plot description, measured in characters (it is computed with `str.len()`). Plot lengths range from 16 characters as the lowest to 1,324 as the highest, with 75% of plots at 208 characters or fewer. On average, a movie plot is 160 characters long, whereas the median is 170. Longer plot descriptions could provide more information for understanding a movie's genre.

In [4]:
# Count, number of unique values, most frequent value and its frequency
# for a handful of text columns
text_columns = ['awards', 'runtime', 'language', 'country']
movies_info[text_columns].describe(include = 'O').transpose()

Unnamed: 0,count,unique,top,freq
awards,840,177,1 nomination,139
runtime,3826,173,4 min,163
language,4586,233,English,3414
country,4797,321,USA,1444


**Interpretation**

1. **awards** The most common value, "1 nomination", appears for 139 of the 840 movies that have award data (out of 5000 movies in total). However, with 177 unique values this column has high cardinality, which suggests that awards are not recorded consistently for each movie. Therefore, this may not be a reliable insight for the awards variable.

2. **runtime** This variable also has high cardinality, but at a quick glance we can see that there are 163 movies that have a runtime of only 4 minutes, the most common value.

3. **language** The number of unique language values here is 233. While that may seem plausible, it is also expected that English is the most prevalent.

4. **country** Understandably, the country with the most movies is the United States of America (USA). Exactly 1444 movies out of 5000 in this API sample are American.

## 2. Checking for Missing Data

In [5]:
# Number of missing values in each column
movies_info.isna().sum(axis=0)

title             0
year              0
rated          4245
released       1349
runtime        1174
genre           159
director        415
writer         1435
actors          754
plot           1440
language        414
country         203
awards         4160
poster         2180
ratings           0
metascore      4885
imdb_rating    2558
imdb_votes     2446
imdb_id           0
type              0
dvd            4497
box_office     4884
production     4965
website        4996
response          0
plot_length    1440
dtype: int64

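**Interpretation**

Several columns are almost entirely missing across the 5000 rows: website (4996), production (4965), metascore (4885), box_office (4884), dvd (4497), rated (4245) and awards (4160). The two text columns used in the next section, plot and genre, are missing 1440 and 159 values respectively.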


# Preparing Textual Data for Statistics and Modeling

In [6]:
# Turn the imported punctuation string into a set of characters
punctuation = set(punctuation)
# Fill missing plot and genre values with a single-space placeholder
text_cols = ['plot', 'genre']
movies_info[text_cols] = movies_info[text_cols].fillna(' ')
# Text cleaning function
def clean_text_data(column, stop_words=None, punct=None):
    """Tokenize and normalize a collection of text entries.

    Each entry has its punctuation characters deleted, is split on
    whitespace, lower-cased, and then filtered against a stopword collection.

    Parameters
    ----------
    column : iterable
        Text entries to clean (for example a pandas Series of strings).
        Entries that are not strings (such as NaN for a missing value)
        produce an empty token list instead of raising.
    stop_words : iterable of str, optional
        Words to drop after lower-casing. Defaults to the notebook-level
        ``sw`` list.
    punct : iterable of str, optional
        Characters to delete from every entry. Defaults to the
        notebook-level ``punctuation`` collection.

    Returns
    -------
    list of list of str
        One token list per input entry, in input order.
    """
    # Fall back to the notebook globals only when no override is supplied.
    if stop_words is None:
        stop_words = sw
    if punct is None:
        punct = punctuation

    # Set membership is O(1); a list would be rescanned for every token.
    stop_set = set(stop_words)
    # A single translation table deletes all punctuation in one pass instead
    # of calling str.replace once per punctuation character encountered.
    strip_table = str.maketrans('', '', ''.join(punct))

    new_description = []
    for description in column:
        # Missing / non-text values cannot be tokenized; emit an empty list.
        if not isinstance(description, str):
            new_description.append([])
            continue
        # split() with no argument already collapses runs of whitespace.
        tokens = [tok.lower() for tok in description.translate(strip_table).split()]
        # NOTE: stopwords are matched after punctuation is removed, so a
        # contraction such as "I'm" is compared as "im".
        new_description.append([tok for tok in tokens if tok not in stop_set])
    return new_description

In [7]:
# Drop rows whose genre is the single-space placeholder (originally missing).
# .copy() makes the result an independent frame, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
movies_info = movies_info[movies_info['genre'] != ' '].copy()

# Clean the plot description and genre text
movies_info['cleaned_plot'] = clean_text_data(movies_info['plot'])
movies_info['cleaned_genre'] = clean_text_data(movies_info['genre'])

# Keep only the first token of each cleaned genre list; a genre that cleans
# down to no tokens yields None instead of raising IndexError
movies_info['first_genre'] = [
    tokens[0] if tokens else None for tokens in movies_info['cleaned_genre']
]

# Create a new, independent df with only the columns needed downstream
cleaned_df = movies_info[['title', 'first_genre', 'cleaned_plot', 'imdb_rating']].copy()
cleaned_df.sample(15)

Unnamed: 0,title,first_genre,cleaned_plot,imdb_rating
3735,One Man's Challenge,short,[],
1741,I'm Not One of You,short,"[im, one, trying, tell, overwhelming, pressure...",8.8
245,Blue Collar Comedy Tour: One for the Road,documentary,"[bill, engvall, ron, tater, salad, white, jeff...",7.3
2892,With No One in the World,short,"[running, away, cops, bleeding, death, michael...",
2330,"One Hot Rotting, Zombie Love Song",short,"[jonas, kindhearted, zombie, sick, tired, kill...",5.4
1727,The Big One,fantasy,"[anna, buried, beloved, aunt, california, wait...",6.4
4316,One Night Rodeo Official Music Video,short,[],
4127,One Night,short,"[unexpected, encounter, offers, walk, ease, mi...",
2362,One More Day,romance,[],4.3
1228,One Way to Love,comedy,"[chicago, team, radio, scriptwriters, must, sp...",7.2


In [8]:
# Summary statistics for the numeric columns of the cleaned frame
cleaned_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,imdb_rating
count,2394.0
mean,6.594319
std,1.379787
min,1.0
25%,5.8
50%,6.7
75%,7.5
max,10.0


In [9]:
# Tally how many movies fall under each first-listed genre
cleaned_df.loc[:, 'first_genre'].value_counts()

short          2063
documentary     957
drama           514
comedy          456
animation       185
action          159
crime            75
music            71
sport            60
horror           51
adventure        40
thriller         35
family           30
biography        27
romance          25
scifi            20
western          18
musical          13
mystery          12
fantasy          11
history           6
talkshow          5
realitytv         4
news              3
gameshow          1
Name: first_genre, dtype: int64

In [10]:
# Restrict to the five most frequent genres other than 'short'
top_genres = ['documentary', 'drama', 'comedy', 'animation', 'action']
in_top_genres = cleaned_df['first_genre'].isin(top_genres)
cleaned_df = cleaned_df[in_top_genres]
cleaned_df.loc[:, 'first_genre'].value_counts()

documentary    957
drama          514
comedy         456
animation      185
action         159
Name: first_genre, dtype: int64

In [11]:
# Persist the filtered frame to CSV without the index column
# NOTE(review): cleaned_plot holds Python lists of tokens; confirm how the
# consumer of this file parses that column back
cleaned_df.to_csv(path_or_buf='Cleaned Plot Data.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=53ef1faa-b7a9-4637-b54c-3b067e58670d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>