In [1]:
import os
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the scraped IMDb listing and keep, for every row, the third
# '/'-separated piece of its `href` (the 'tt…' IMDb id, e.g. 'tt0111161').
df = pd.read_csv('../data/imdb_top_6652.csv')
refs = []
for href in df['href']:
    refs.append(href.split('/')[2])

In [3]:
refs[0]

'tt0111161'

## Pulling Title, Overview, Genres, IDs, Popularity Score, Vote Average, Vote Count 

In [61]:
# The TMDB API key is read from the environment so the secret is never stored
# in the notebook; export TMDB_API_KEY before running this cell.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be rotated.
key = os.environ['TMDB_API_KEY']

# Parallel lists: position i in every list belongs to refs[i].
titles = []
overviews = []
genres = []
imdb_ids = []
popularity_scores = []
vote_averages = []
vote_counts = []

# One "movie details" request per IMDb id in `refs`.
for ref in refs:
    url = f'https://api.themoviedb.org/3/movie/{ref}'
    res = requests.get(
        url,
        params={'api_key': key, 'language': 'en-US'},
        timeout=10,  # never hang the whole run on one stalled connection
    )
    # Stop with a clear HTTP error (bad key, unknown id, rate limit) instead
    # of a KeyError on the error payload a few lines further down. Failing
    # fast also keeps the lists aligned with `refs`.
    res.raise_for_status()
    info = res.json()

    titles.append(info['original_title'])
    overviews.append(info['overview'])
    genres.append(info['genres'])
    imdb_ids.append(info['imdb_id'])
    popularity_scores.append(info['popularity'])
    vote_averages.append(info['vote_average'])
    vote_counts.append(info['vote_count'])

In [62]:
len(titles)

6652

In [65]:
# Start the result frame with a single 'title' column, one row per fetched title
movies = pd.DataFrame({'title': titles})

In [66]:
# Add the remaining API fields as columns; every list was filled in the same
# loop as `titles`, so row i of each column describes the same movie.
movies = movies.assign(
    overviews=overviews,
    genres=genres,
    imdb_id=imdb_ids,
    popularity=popularity_scores,
    vote_average=vote_averages,
    vote_count=vote_counts,
)

In [69]:
movies.head()

Unnamed: 0,title,overviews,genres,imdb_id,popularity,vote_average,vote_count
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0111161,111.25,8.7,18845
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0068646,45.416,8.7,14225
2,The Dark Knight,Batman raises the stakes in his war on crime. ...,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",tt0468569,58.502,8.5,24993
3,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0071562,33.582,8.6,8488
4,12 Angry Men,The defense and the prosecution have rested an...,"[{'id': 18, 'name': 'Drama'}]",tt0050083,24.822,8.5,5539


In [70]:
movies.tail()

Unnamed: 0,title,overviews,genres,imdb_id,popularity,vote_average,vote_count
6647,Manos: The Hands of Fate,A family gets lost on the road and stumbles up...,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",tt0060666,10.272,1.9,142
6648,Birdemic: Shock and Terror,A platoon of eagles and vultures attacks the r...,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",tt1316037,9.624,2.2,220
6649,Pledge This!,"At popular South Beach University, filthy rich...","[{'id': 35, 'name': 'Comedy'}]",tt0417056,9.372,2.8,71
6650,Going Overboard,A struggling young comedian takes a menial job...,"[{'id': 35, 'name': 'Comedy'}]",tt0096870,9.489,2.5,92
6651,Saving Christmas,Kirk is enjoying the annual Christmas party ex...,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",tt4009460,6.087,1.9,57


In [72]:
# Checkpoint the metadata pulled so far; index=False leaves the row index out of the file
movies.to_csv('../data/api_data.csv', index=False)

## Pulling Director & Top 3 Cast Members 

In [102]:
# Reload the metadata CSV written by the previous section.
# Note: the credits loop below still needs `refs` from the top of the notebook.
movies = pd.read_csv('../data/api_data.csv')

In [91]:
# The TMDB API key is read from the environment so the secret is never stored
# in the notebook; export TMDB_API_KEY before running this cell.
key = os.environ['TMDB_API_KEY']

# Parallel lists: position i in each belongs to refs[i].
acts = []   # up to three cast names per movie, in the order the API returns them
dirs = []   # every crew member whose job is 'Director' (can be more than one)

for ref in refs:
    url = f'https://api.themoviedb.org/3/movie/{ref}'
    res = requests.get(
        url,
        params={'api_key': key, 'append_to_response': 'credits'},
        timeout=10,  # never hang the whole run on one stalled connection
    )
    # Surface HTTP errors directly rather than as a KeyError on 'credits'.
    res.raise_for_status()
    info = res.json()

    crew = info['credits']['crew']
    cast = info['credits']['cast']

    # Slicing handles movies with fewer than three cast entries on its own.
    stars = [member['name'] for member in cast[:3]]
    director = [member['name'] for member in crew if member['job'] == 'Director']

    acts.append(stars)
    dirs.append(director)
    sleep(0.3)  # pause between requests to throttle the request rate
    

In [98]:
acts[:5]

[['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'],
 ['Al Pacino', 'Marlon Brando', 'James Caan'],
 ['Christian Bale', 'Heath Ledger', 'Michael Caine'],
 ['Al Pacino', 'Robert De Niro', 'Robert Duvall'],
 ['Martin Balsam', 'John Fiedler', 'Lee J. Cobb']]

In [97]:
dirs[:5]

[['Frank Darabont'],
 ['Francis Ford Coppola'],
 ['Christopher Nolan'],
 ['Francis Ford Coppola'],
 ['Sidney Lumet']]

In [103]:
# Attach the credits; `acts` and `dirs` were built by iterating `refs`, the
# same order the rows of `movies` were created in.
movies = movies.assign(cast=acts, director=dirs)

In [104]:
movies.head()

Unnamed: 0,title,overviews,genres,imdb_id,popularity,vote_average,vote_count,cast,director
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0111161,111.25,8.7,18845,"[Tim Robbins, Morgan Freeman, Bob Gunton]",[Frank Darabont]
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0068646,45.416,8.7,14225,"[Al Pacino, Marlon Brando, James Caan]",[Francis Ford Coppola]
2,The Dark Knight,Batman raises the stakes in his war on crime. ...,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",tt0468569,58.502,8.5,24993,"[Christian Bale, Heath Ledger, Michael Caine]",[Christopher Nolan]
3,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0071562,33.582,8.6,8488,"[Al Pacino, Robert De Niro, Robert Duvall]",[Francis Ford Coppola]
4,12 Angry Men,The defense and the prosecution have rested an...,"[{'id': 18, 'name': 'Drama'}]",tt0050083,24.822,8.5,5539,"[Martin Balsam, John Fiedler, Lee J. Cobb]",[Sidney Lumet]


In [110]:
# Overwrites the same api_data.csv, now including the cast and director columns
movies.to_csv('../data/api_data.csv', index=False)

# Cleaning  

### Cleaning Director Column 

In [120]:
# How many movies credit more than one director (365 in the recorded run)
counter = sum(len(names) > 1 for names in movies['director'])
counter

365

In [182]:
def clean_list_name(df, col):
    """Collapse each row's names into one space-delimited string of tokens.

    Spaces inside every name are removed ('Frank Darabont' -> 'FrankDarabont')
    so that each person stays a single, unique token when the column is
    vectorized.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean.
    col : str
        Name of the column. A cell may be a list of names, the string form of
        such a list (what a list column becomes after being written to and
        read back from CSV), a single plain name, or missing (None / NaN).

    Returns
    -------
    list of str
        One string per row, in row order. Each name is followed by a single
        space, so non-empty results end with a space (unchanged from the
        previous behaviour); rows without names give ''.
    """
    import ast  # local import: only needed to parse string-encoded lists

    new = []
    for group in df[col]:
        if isinstance(group, str):
            # Without this a string cell would be iterated character by
            # character and silently produce garbage.
            try:
                parsed = ast.literal_eval(group)
            except (ValueError, SyntaxError):
                parsed = None
            # Anything that is not a list literal is treated as one name.
            group = parsed if isinstance(parsed, (list, tuple)) else [group]
        elif group is None or (isinstance(group, float) and group != group):
            # Missing cell (None or NaN): no names for this row.
            group = []

        new.append(''.join(person.replace(' ', '') + ' ' for person in group))
    return new

In [184]:
# One string per movie with each director's name collapsed into a single token
movies['join_director'] = clean_list_name(movies, 'director')

In [186]:
movies['join_director'][:5]

0         FrankDarabont 
1    FrancisFordCoppola 
2      ChristopherNolan 
3    FrancisFordCoppola 
4           SidneyLumet 
Name: join_director, dtype: object

### Cleaning Cast Column 

In [188]:
# Same treatment for the cast lists: one space-free token per cast member
movies['join_cast'] = clean_list_name(movies, 'cast')

In [189]:
movies['join_cast'][:5]

0        TimRobbins MorganFreeman BobGunton 
1           AlPacino MarlonBrando JamesCaan 
2    ChristianBale HeathLedger MichaelCaine 
3        AlPacino RobertDeNiro RobertDuvall 
4        MartinBalsam JohnFiedler LeeJ.Cobb 
Name: join_cast, dtype: object

In [194]:
# Export the movies frame: overwrites api_data.csv again, now with the
# join_director and join_cast columns included
movies.to_csv('../data/api_data.csv', index=False)

In [None]:
# Unfinished: extracting the movie's writers was causing problems and is
# still to be revisited.
# NOTE: `crew` is whatever the credits loop above left behind, i.e. the crew
# of the last movie fetched only; this is not yet run per movie.
write = [member['name'] for member in crew if member['job'] == 'Screenplay']

# No screenplay credit found: fall back to crew credited with the job 'Novel'
if not write:
    write = [member['name'] for member in crew if member['job'] == 'Novel']

In [4]:
# Reload the exported API data. This rebinds `df`, which earlier held the
# IMDb listing. After the CSV round trip the list-valued columns (e.g. `cast`)
# come back as their string representation, as the head() output below shows.
df = pd.read_csv('../data/api_data.csv')
# Second input read from a separate CSV; it is not used in the cells shown here
db = pd.read_csv('../data/imdb_6k.csv')

In [6]:
df.head()

Unnamed: 0,title,overviews,genres,imdb_id,popularity,vote_average,vote_count,cast,director,join_director,join_cast
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0111161,111.25,8.7,18845,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']",['Frank Darabont'],FrankDarabont,TimRobbins MorganFreeman BobGunton
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0068646,45.416,8.7,14225,"['Al Pacino', 'Marlon Brando', 'James Caan']",['Francis Ford Coppola'],FrancisFordCoppola,AlPacino MarlonBrando JamesCaan
2,The Dark Knight,Batman raises the stakes in his war on crime. ...,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",tt0468569,58.502,8.5,24993,"['Christian Bale', 'Heath Ledger', 'Michael Ca...",['Christopher Nolan'],ChristopherNolan,ChristianBale HeathLedger MichaelCaine
3,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",tt0071562,33.582,8.6,8488,"['Al Pacino', 'Robert De Niro', 'Robert Duvall']",['Francis Ford Coppola'],FrancisFordCoppola,AlPacino RobertDeNiro RobertDuvall
4,12 Angry Men,The defense and the prosecution have rested an...,"[{'id': 18, 'name': 'Drama'}]",tt0050083,24.822,8.5,5539,"['Martin Balsam', 'John Fiedler', 'Lee J. Cobb']",['Sidney Lumet'],SidneyLumet,MartinBalsam JohnFiedler LeeJ.Cobb
