In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

In [8]:
# Load the movie list scraped earlier; the previews further down show `href`, `title` and `genre` columns.
movies = pd.read_csv('../data/imdb_top_6652.csv')
movies.shape

(6652, 3)

## Grabbing User Plot Summaries 

In [10]:
# Scrape the user-submitted plot summaries for every movie.
# `plot_summaries` receives exactly one entry per item of `refs`, in order:
# a list with the text of each <p> inside the summaries list, or an empty
# list when the page has no such list. Keeping one entry per movie is what
# lets the result be assigned straight onto `movies` as a column.
refs = movies['href'].tolist()

plot_summaries = []
for ref in refs:
    url = f'https://www.imdb.com{ref}plotsummary'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    # Look the container up by id instead of probing every <ul> inside a
    # bare try/except, which hid real errors and silently skipped movies.
    summary_list = soup.find('ul', id='plot-summaries-content')
    if summary_list is None:
        plots = []
    else:
        plots = [para.text for para in summary_list.find_all('p')]
    plot_summaries.append(plots)

In [17]:
# Attach the scraped summaries as a new column; this relies on `plot_summaries` holding one entry per row, in row order.
movies['user_plots'] = plot_summaries

In [18]:
# Preview the first rows to check the new `user_plots` column.
movies.head()

Unnamed: 0,href,title,genre,user_plots
0,/title/tt0111161/,The Shawshank Redemption,Drama,[Two imprisoned men bond over a number of year...
1,/title/tt0068646/,The Godfather,Crime Drama,[An organized crime dynasty's aging patriarch ...
2,/title/tt0468569/,The Dark Knight,Action Crime Drama,[When the menace known as the Joker wreaks hav...
3,/title/tt0071562/,The Godfather: Part II,Crime Drama,[The early life and career of Vito Corleone in...
4,/title/tt0050083/,12 Angry Men,Crime Drama,[A jury holdout attempts to prevent a miscarri...


## Grabbing IMDb Full Synopsis

In [118]:
# Scrape the full synopsis for every movie.
# `synopsi` receives exactly one string per item of `refs`, in order: the text
# of the first <li> inside the synopsis list, or '' when the page has no such
# list (previously this case raised IndexError and aborted the whole scrape).
synopsi = []
for ref in refs:
    url = f'https://www.imdb.com{ref}plotsummary'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    # Direct lookup by id replaces the scan over every <ul> wrapped in a bare
    # try/except, which swallowed unrelated errors.
    synopsis_list = soup.find('ul', id='plot-synopsis-content')
    first_item = synopsis_list.find('li') if synopsis_list is not None else None
    synopsi.append(first_item.get_text() if first_item is not None else '')

In [120]:
# Attach the scraped synopses as a new column; this relies on `synopsi` holding one string per row, in row order.
movies['synopsis'] = synopsi

In [140]:
# Preview the first rows to check the new `synopsis` column.
movies.head()

Unnamed: 0,href,title,genre,user_plots,synopsis
0,/title/tt0111161/,The Shawshank Redemption,Drama,[Two imprisoned men bond over a number of year...,"In 1947, Andy Dufresne (Tim Robbins), a banker..."
1,/title/tt0068646/,The Godfather,Crime Drama,[An organized crime dynasty's aging patriarch ...,"In late summer 1945, guests are gathered for t..."
2,/title/tt0468569/,The Dark Knight,Action Crime Drama,[When the menace known as the Joker wreaks hav...,The movie begins with a gang of men with clown...
3,/title/tt0071562/,The Godfather: Part II,Crime Drama,[The early life and career of Vito Corleone in...,The Godfather Part II presents two parallel st...
4,/title/tt0050083/,12 Angry Men,Crime Drama,[A jury holdout attempts to prevent a miscarri...,"In a New York City courthouse, an eighteen-yea..."


In [124]:
# Checkpoint the scraped data. The index is written as well (pandas default), which is why an
# 'Unnamed: 0' column shows up when this file is read back in and has to be dropped later.
movies.to_csv('../data/imdb_6k.csv')

## Grabbing Directors, Writers, Stars

In [3]:
# Reload the checkpoint and collect each movie's IMDb path for the next scrape.
movies = pd.read_csv('../data/imdb_6k.csv')
refs = movies['href'].tolist()

In [50]:
#all_info is a list of lists: one inner list per movie in refs, holding the text of every
#credit_summary_item block on the title page (director, writers and stars when all are present)

all_info = []
for ref in refs:
    url = f'https://www.imdb.com{ref}'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    credit_blocks = soup.find_all('div', {'class' : 'credit_summary_item'})
    #no try/except around get_text: the old bare except could only hide real errors
    #and silently leave a movie with fewer entries than its page provides
    all_info.append([block.get_text(strip=True) for block in credit_blocks])

In [68]:
#example of the first list within all_info: one 'Label:names' string per credit block
all_info[0]

['Director:Frank Darabont',
 'Writers:Stephen King(short story "Rita Hayworth and Shawshank Redemption"),Frank Darabont(screenplay)',
 'Stars:Tim Robbins,Morgan Freeman,Bob Gunton|See full cast & crew»']

In [71]:
#print the position of every entry that does not hold exactly 3 items (director, writers, stars)
for position, entry in enumerate(all_info):
    if len(entry) != 3:
        print(position)

70


In [76]:
#this movie only lists a director; the empty slots get filled with 'NONE' for now for simplicity
#it may be dropped later on, since it's a short animated film and does not necessarily fit in with the others
movies.loc[70, 'title'], all_info[70]

("It's Such a Beautiful Day", ['Director:Don Hertzfeldt'])

In [95]:
#pad every short entry with 'NONE' so each list has 3 slots (director, writers, stars)
#the padding is computed from the current length, so re-running this cell adds nothing extra
#and the cell no longer needs to be commented out (which broke a fresh Restart & Run All)
#NOTE: placeholders go at the end, which matches the only short entry seen so far (director only)
for credit_list in all_info:
    credit_list.extend(['NONE'] * (3 - len(credit_list)))

In [103]:
#check that entry 70 now holds three items
all_info[70]

['Director:Don Hertzfeldt', 'NONE', 'NONE']

In [98]:
#split the per-movie credit lists into one list per role, picked by position
directors = [credit_list[0] for credit_list in all_info]
writers = [credit_list[1] for credit_list in all_info]
stars = [credit_list[2] for credit_list in all_info]

In [101]:
#the three lists should have the same length, i.e. one value per movie
tuple(len(role) for role in (directors, writers, stars))

(6652, 6652, 6652)

In [102]:
#each list in all_info should now hold 3 values (director, writers, stars), so nothing is printed
incomplete = [position for position, entry in enumerate(all_info) if len(entry) != 3]
for position in incomplete:
    print(position)

In [105]:
#add one column per role, in the order director, writers, stars
role_columns = {'director': directors, 'writers': writers, 'stars': stars}
for column_name, column_values in role_columns.items():
    movies[column_name] = column_values

In [107]:
#drop the index column that came back from the CSV round trip
#errors='ignore' plus reassignment keeps this cell safe to re-run, and safe when the CSV
#was last written without an index (in which case the column is not there to drop)
movies = movies.drop(columns='Unnamed: 0', errors='ignore')

In [109]:
# Preview the first rows to check the new `director`, `writers` and `stars` columns.
movies.head()

Unnamed: 0,href,title,genre,user_plots,synopsis,director,writers,stars
0,/title/tt0111161/,The Shawshank Redemption,Drama,['Two imprisoned men bond over a number of yea...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",Director:Frank Darabont,"Writers:Stephen King(short story ""Rita Haywort...","Stars:Tim Robbins,Morgan Freeman,Bob Gunton|Se..."
1,/title/tt0068646/,The Godfather,Crime Drama,"[""An organized crime dynasty's aging patriarch...","In late summer 1945, guests are gathered for t...",Director:Francis Ford Coppola,"Writers:Mario Puzo(screenplay by),Francis Ford...","Stars:Marlon Brando,Al Pacino,James Caan|See f..."
2,/title/tt0468569/,The Dark Knight,Action Crime Drama,['When the menace known as the Joker wreaks ha...,The movie begins with a gang of men with clown...,Director:Christopher Nolan,"Writers:Jonathan Nolan(screenplay),Christopher...","Stars:Christian Bale,Heath Ledger,Aaron Eckhar..."
3,/title/tt0071562/,The Godfather: Part II,Crime Drama,['The early life and career of Vito Corleone i...,The Godfather Part II presents two parallel st...,Director:Francis Ford Coppola,"Writers:Francis Ford Coppola(screenplay by),Ma...","Stars:Al Pacino,Robert De Niro,Robert Duvall|S..."
4,/title/tt0050083/,12 Angry Men,Crime Drama,['A jury holdout attempts to prevent a miscarr...,"In a New York City courthouse, an eighteen-yea...",Director:Sidney Lumet,"Writers:Reginald Rose(story),Reginald Rose(scr...","Stars:Henry Fonda,Lee J. Cobb,Martin Balsam|Se..."


In [110]:
#exporting the updated csv file; this overwrites the checkpoint file read earlier, this time without the index column
movies.to_csv('../data/imdb_6k.csv', index=False)