# Packages

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; the recorded output below shows it is a
# no-op when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import os
import re
import time
from collections import Counter, defaultdict
from getpass import getpass
from string import punctuation

import numpy as np
import omdb
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from requests import TooManyRedirects
# English stop-word list; clean_text_data filters tokens against it.
sw = stopwords.words("english")

# API Set-up

In [None]:
# Never commit the OMDb key to the notebook: read it from the OMDB_API_KEY
# environment variable and fall back to an interactive prompt when it is unset.
# The key originally used here was limited to 1000 calls a day.
api_key = os.environ.get("OMDB_API_KEY") or getpass("OMDb API key: ")
omdb.set_default('apikey', api_key)

# include tomatoes data by default
#omdb.set_default('tomatoes', True)

# Pulling Data from API

In [None]:
# Pull up to 500 pages of search results (up to 5000 movies) with the word
# 'one' in the title. Each request uses a timeout of 5 seconds.
imdb_ids = []
for i in range(1, 501):
    page_results = omdb.search_movie('one', page=i, timeout=5)
    if not page_results:
        # An empty page means the search has run out of results. Looking up
        # the 'imdb_id' column on an empty frame used to raise KeyError here;
        # stop instead so no further API calls are spent.
        break
    # Store the ids of this page directly in the flat list (no per-page
    # DataFrame and no quadratic sum(list_of_lists, []) afterwards).
    imdb_ids.extend(movie['imdb_id'] for movie in page_results)

In [None]:
# Look up the full OMDb record for every collected id (one lookup per id)
# and gather the records into a single DataFrame.
movie_records = []
for movie_id in imdb_ids:
    movie_records.append(omdb.imdbid(movie_id))
movies_info = pd.DataFrame(movie_records)

# Display the resulting frame
movies_info

Unnamed: 0,title,year,rated,released,runtime,genre,director,writer,actors,plot,...,metascore,imdb_rating,imdb_votes,imdb_id,type,dvd,box_office,production,website,response
0,One Flew Over the Cuckoo's Nest,1975,R,19 Nov 1975,133 min,Drama,Milos Forman,"Lawrence Hauben, Bo Goldman, Ken Kesey","Jack Nicholson, Louise Fletcher, Michael Berryman",A criminal pleads insanity and is admitted to ...,...,84,8.7,983642,tt0073486,movie,16 Dec 1997,"$108,981,275",,,True
1,Rogue One: A Star Wars Story,2016,PG-13,16 Dec 2016,133 min,"Action, Adventure, Sci-Fi",Gareth Edwards,"Chris Weitz, Tony Gilroy, John Knoll","Felicity Jones, Diego Luna, Alan Tudyk","In a time of conflict, a group of unlikely her...",...,65,7.8,607401,tt3748528,movie,04 Apr 2017,"$532,177,324",,,True
2,Ready Player One,2018,PG-13,29 Mar 2018,140 min,"Action, Adventure, Sci-Fi",Steven Spielberg,"Zak Penn, Ernest Cline","Tye Sheridan, Olivia Cooke, Ben Mendelsohn",When the creator of a virtual reality called t...,...,64,7.4,422237,tt1677720,movie,05 Apr 2018,"$137,715,350",,,True
3,Let the Right One In,2008,R,12 Dec 2008,114 min,"Drama, Fantasy, Horror",Tomas Alfredson,John Ajvide Lindqvist,"Kåre Hedebrant, Lina Leandersson, Per Ragnar","Oskar, an overlooked and bullied boy, finds lo...",...,82,7.9,214860,tt1139797,movie,10 Mar 2009,"$2,122,065",,,True
4,Air Force One,1997,R,25 Jul 1997,124 min,"Action, Drama, Thriller",Wolfgang Petersen,Andrew W. Marlowe,"Harrison Ford, Gary Oldman, Glenn Close",Communist radicals hijack Air Force One with t...,...,61,6.5,193375,tt0118571,movie,26 Dec 2000,"$172,956,409",,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,One Billion Dates,2015,,,9 min,"Short, Comedy, Drama",Alec Robbins,Alec Robbins,"Alec Robbins, Eric Lewis-Baker, Jack Amedio",What if you ended up with the wrong person bec...,...,,,,tt4882004,movie,,,,,True
4996,One Shot,2015,,31 Jul 2015,,"Short, Drama",George McCluskey,George McCluskey,"Greg Hobbs, Jayne Machin",,...,,,,tt4956210,movie,,,,,True
4997,One More Win,2015,,01 Sep 2015,91 min,Documentary,Scott Kelso,,,World Rugby has written the Cook Islanders off...,...,,,,tt4956268,movie,,,,,True
4998,Changing History: One Heart at a Time,2015,,09 Jul 2015,19 min,"Short, History",Adam Kavanagh,James Standish,"Dora Amuimuia, Craig Walker","Changing History - one heart at a time, is a s...",...,,,,tt4957136,movie,,,,,True


In [None]:
punctuation = set(punctuation)


# Text cleaning function
def clean_text_data(column, stop_words=None):
    """Tokenise and normalise every text entry of ``column``.

    Each entry is split on whitespace, folded to lowercase, stripped of all
    characters in ``punctuation`` and filtered against the stop-word list.

    Parameters
    ----------
    column : iterable
        Text entries (e.g. a pandas Series of plot descriptions). Entries
        that are not strings (``None``, ``NaN``) yield an empty token list
        instead of raising.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the module-level ``sw`` list.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per entry, in input order.
    """
    if stop_words is None:
        stop_words = sw
    # Set membership is O(1); the original scanned a list for every token.
    stop_set = set(stop_words)
    punct_chars = "".join(punctuation)
    punct_table = str.maketrans("", "", punct_chars)

    new_description = []
    for description in column:
        if not isinstance(description, str):
            # Missing value: nothing to tokenise.
            new_description.append([])
            continue

        tokens = []
        for raw_token in description.split():
            raw_token = raw_token.lower()
            # Check the stop-word list BEFORE deleting inner punctuation so
            # contractions such as "couldn't" still match their entry; after
            # stripping the apostrophe ("couldnt") they never would.
            if raw_token.strip(punct_chars) in stop_set:
                continue
            # Remove the punctuation from the token
            token = raw_token.translate(punct_table)
            # Drop tokens that were pure punctuation, and apply the
            # post-stripping stop-word check the original performed.
            if token and token not in stop_set:
                tokens.append(token)
        new_description.append(tokens)
    return new_description

In [None]:
# Clean the plot description and genre text
movies_info['cleaned_plot'] = clean_text_data(movies_info['plot'])
movies_info['cleaned_genre'] = clean_text_data(movies_info['genre'])

# Keep only the first word in the cleaned genre lists. An empty token list
# would raise IndexError on [0]; label it 'na', the same value a missing
# ("N/A") genre is cleaned to.
movies_info['first_genre'] = [
    tokens[0] if tokens else 'na' for tokens in movies_info['cleaned_genre']
]

# Create new df with only the first_genre, cleaned_plot and imdb_rating
# columns. Take an explicit copy so later changes to cleaned_df never touch
# (or warn about) movies_info.
cleaned_df = movies_info[['first_genre', 'cleaned_plot', 'imdb_rating']].copy()
cleaned_df

Unnamed: 0,first_genre,cleaned_plot,imdb_rating
0,drama,"[criminal, pleads, insanity, admitted, mental,...",8.7
1,action,"[time, conflict, group, unlikely, heroes, band...",7.8
2,action,"[creator, virtual, reality, called, oasis, die...",7.4
3,drama,"[oskar, overlooked, bullied, boy, finds, love,...",7.9
4,action,"[communist, radicals, hijack, air, force, one,...",6.5
...,...,...,...
4995,short,"[ended, wrong, person, truly, thought, couldnt...",
4996,short,[na],
4997,documentary,"[world, rugby, written, cook, islanders, game,...",
4998,short,"[changing, history, one, heart, time, short, d...",


In [None]:
# Summary of each column.
# NOTE(review): the recorded output reports count/unique/top/freq for
# imdb_rating as well, so it is presumably not stored as a numeric dtype;
# confirm before doing arithmetic on it.
cleaned_df.describe()

Unnamed: 0,first_genre,cleaned_plot,imdb_rating
count,5000,5000,5000.0
unique,26,3119,82.0
top,short,[na],
freq,2059,1443,2559.0


In [None]:
# Count instances of each genre (most frequent first, as the recorded output shows)
cleaned_df['first_genre'].value_counts()

short          2059
documentary     954
drama           515
comedy          459
animation       186
na              159
action          159
crime            76
music            71
sport            60
horror           50
adventure        40
thriller         35
family           30
biography        27
romance          26
scifi            21
western          18
musical          13
mystery          12
fantasy          11
history           6
talkshow          5
realitytv         4
news              3
gameshow          1
Name: first_genre, dtype: int64

In [None]:
# Restrict the data to the five most common genres from the counts above,
# leaving out 'short' and 'na'.
top_genres = ['documentary', 'drama', 'comedy', 'animation', 'action']
genre_mask = cleaned_df['first_genre'].isin(top_genres)
cleaned_df = cleaned_df.loc[genre_mask]
cleaned_df['first_genre'].value_counts()

2273

In [None]:
# Write the filtered frame to a csv in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as
# the first column.
# NOTE(review): cleaned_plot holds Python lists; presumably they are stored
# as their string representation and need parsing when read back — confirm.
cleaned_df.to_csv('Cleaned Plot Data.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=53ef1faa-b7a9-4637-b54c-3b067e58670d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>