# Web Scraping

### Import Dependencies 

In [1]:
!apt update
!pip install selenium
!pip install webdriver-manager

'apt' is not recognized as an internal or external command,
operable program or batch file.




In [2]:
import random
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium import webdriver

# I use the Edge browser, so the Edge driver is used here; if Chrome is installed on your machine, switch to the Chrome driver.
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager

In [3]:
# Build the TMDB lookup table in one chain: de-duplicate the raw rows, keep the
# id/title/release-date columns, reduce the release date to a four-digit year
# string, rename to the shared column names used for the later merge, and add
# an empty object-typed 'storyline' column for the scraper to fill in.
info_df = (
    pd.read_csv("../tmdb_5000_data/tmdb_5000_movies.csv")
    .drop_duplicates()
    .loc[:, ['id', 'title', 'release_date']]
    .assign(
        release_date=lambda frame: pd.to_datetime(
            frame['release_date'], format="%Y-%m-%d"
        ).dt.strftime('%Y')
    )
    .rename(columns={'id': 'movie_id', 'release_date': 'startYear'})
    .assign(storyline=np.nan)
    .astype({'storyline': 'object'})
)
info_df.head(10)

Unnamed: 0,movie_id,title,startYear,storyline
0,19995,Avatar,2009,
1,285,Pirates of the Caribbean: At World's End,2007,
2,206647,Spectre,2015,
3,49026,The Dark Knight Rises,2012,
4,49529,John Carter,2012,
5,559,Spider-Man 3,2007,
6,38757,Tangled,2010,
7,99861,Avengers: Age of Ultron,2015,
8,767,Harry Potter and the Half-Blood Prince,2009,
9,209112,Batman v Superman: Dawn of Justice,2016,


In [4]:
# Load the IMDb title table. Only the four columns used below are read, and
# all of them as strings, so pandas does not have to guess per-chunk dtypes
# (which triggered a mixed-type warning) and less memory is used.
imdb_df = pd.read_csv(
    '../tmdb_5000_data/data.tsv',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    dtype=str,
)

# Keep feature films only, normalise the year to a plain string so it can be
# joined against info_df['startYear'], and rename to the shared column names.
# NOTE(review): Series.replace('\n', '') only replaces values that are exactly
# a newline character, so it is probably a no-op here — confirm the intent.
imdb_df = (
    imdb_df
    .loc[imdb_df['titleType'] == 'movie', ['tconst', 'primaryTitle', 'startYear']]
    .assign(startYear=lambda frame: frame['startYear'].replace('\n', '').astype(str))
    .rename(columns={'tconst': 'imdb_id', 'primaryTitle': 'title'})
)
imdb_df.head(10)

  imdb_df = pd.read_csv('../tmdb_5000_data/data.tsv', sep = '\t')


Unnamed: 0,imdb_id,title,startYear
8,tt0000009,Miss Jerry,1894
144,tt0000147,The Corbett-Fitzsimmons Fight,1897
498,tt0000502,Bohemios,1905
570,tt0000574,The Story of the Kelly Gang,1906
587,tt0000591,The Prodigal Son,1907
610,tt0000615,Robbery Under Arms,1907
625,tt0000630,Hamlet,1908
668,tt0000675,Don Quijote,1908
672,tt0000679,The Fairylogue and Radio-Plays,1908
828,tt0000838,A Cultura do Cacau,1909


In [5]:
# Attach an IMDb id to every TMDB movie whose (title, startYear) pair also
# appears in the IMDb table; rows without a match are dropped (inner join).
# reset_index() is called without drop=True, so the previous index is kept as
# an extra 'index' column.
# NOTE(review): (title, startYear) is not guaranteed to be unique on the IMDb
# side, so one TMDB movie may appear more than once after the merge — confirm
# whether duplicates should be removed.
full_df = pd.merge(
    info_df,
    imdb_df,
    on=['title', 'startYear'],
    how='inner',
).reset_index()
full_df.head()

Unnamed: 0,index,movie_id,title,startYear,storyline,imdb_id
0,0,19995,Avatar,2009,,tt0499549
1,1,285,Pirates of the Caribbean: At World's End,2007,,tt0449088
2,2,206647,Spectre,2015,,tt2379713
3,3,49026,The Dark Knight Rises,2012,,tt1345836
4,4,49529,John Carter,2012,,tt0401729


### Define url function

In [6]:
def get_url(movie_name):
    """Return the IMDb title-page URL for the given identifier.

    Args:
        movie_name: Value inserted into the URL path. The scraping loop in
            this notebook passes an IMDb id (the ``imdb_id`` column) here.

    Returns:
        The string ``https://www.imdb.com/title/<movie_name>/``.
    """
    return f"https://www.imdb.com/title/{movie_name}/"

### Scrape movie storylines from IMDb

In [None]:
import re

# Row labels of full_df in reverse order; reversing makes it easier to watch
# the results while debugging. `indexes` stays a notebook-level name because a
# later cell iterates over it as well.
indexes = full_df.index.tolist()
indexes.reverse()

# Create the browser driver instance.
driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))
try:
    movie_num = 0
    for i in indexes:
        movie_num += 1
        # Label-based reads, consistent with the label-based write below.
        name_og = full_df.at[i, 'title']
        imdb_id = full_df.at[i, 'imdb_id']

        # Load the movie's IMDb title page in the browser.
        driver.get(get_url(imdb_id))

        # Collect every element carrying this class name and keep the inner
        # HTML of each one, re-serialised through BeautifulSoup and stripped
        # of surrounding whitespace.
        storylines = driver.find_elements(By.CLASS_NAME, 'ipc-html-content-inner-div')
        story_collections = [
            # get_attribute may return None; treat that as empty markup.
            str(BeautifulSoup(storyline.get_attribute('innerHTML') or '', 'html.parser')).strip()
            for storyline in storylines
        ]

        # Store the whole list in this row's 'storyline' cell.
        full_df.at[i, 'storyline'] = story_collections
        print("{}: story added for movie: {}".format(movie_num, name_og))
finally:
    # Always close the browser, even when a page load or lookup raises, so no
    # driver process is left running.
    driver.quit()

1: story added for movie: Shanghai Calling
2: story added for movie: Newlyweds
3: story added for movie: El Mariachi
4: story added for movie: Cavite
5: story added for movie: Primer
6: story added for movie: Bang
7: story added for movie: Cure
8: story added for movie: Tin Can Man
9: story added for movie: The Circle
10: story added for movie: Clean
11: story added for movie: Clean
12: story added for movie: Pink Flamingos
13: story added for movie: All Superheroes Must Die
14: story added for movie: Breaking Upwards


In [None]:
# Inspect the merged frame after the scraping loop has written the 'storyline' column.
full_df

### Save results

In [None]:
# Write the scraped frame to a CSV file whose name starts with today's date
# (YYYY-MM-DD) followed directly by the fixed suffix; the row index is not
# written.
# NOTE(review): the 'storyline' cells hold Python lists, which CSV stores as
# their text representation — they come back as strings when reloaded.
date = f"{datetime.today():%Y-%m-%d}"
full_df.to_csv("".join([date, "movie_with_imdb_storyline.csv"]), index=False)

### Some unfinished data cleaning process

In [None]:
# Reload the CSV written by the save cell above. The file name is built from
# `date`, so that cell must have run in this kernel session first.
full_df = pd.read_csv(date +"movie_with_imdb_storyline.csv")

In [None]:
# Length of the stored storyline text for the row labelled 0. The frame has no
# 'reviews' column (the scraped text lives in 'storyline'), so the previous
# lookup raised KeyError; a single .loc[row, column] lookup also avoids
# chained indexing.
len(full_df.loc[0, 'storyline'])

In [None]:
# Keep only rows whose stored storyline text has at least 4 characters.
# The scraped text is in the 'storyline' column (there is no 'reviews'
# column), and a boolean filter replaces the row-by-row in-place drop so the
# cell no longer depends on `indexes` from the scraping cell and can be
# re-run safely. Non-string cells (e.g. missing values) are dropped as well.
# NOTE(review): the threshold presumably removes rows whose scraped list was
# empty (stored in the CSV as the two-character text "[]") — confirm.
full_df = full_df[
    full_df['storyline']
    .map(lambda text: isinstance(text, str) and len(text) >= 4)
    .astype(bool)
]
full_df