In [1]:
#Import Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
#Load the final FilmStruck library captured by the 11/09/18 scrape.
#The first column of the CSV is used as the DataFrame index.
filmstruck_csv = '/Users/kerrydriscoll/Desktop/filmstruck_movies_2018-11-09.csv'
fs = pd.read_csv(filmstruck_csv, index_col=0)

fs.head()

Unnamed: 0,Title,Year
0,(nostalgia),1971
1,(The [End) of History Illusion]: Miu Miu Women...,2017
2,...And the Pursuit of Happiness,1986
3,...But Film is My Mistress,2010
4,13 Days in France,1968


In [3]:
#Pull in Criterion Website
url = "https://www.criterion.com/shop/browse/list?sort=spine_number"

#Use a timeout so a stalled connection cannot hang the notebook, and stop here with a
#clear HTTP error on a non-success response instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

#Parse the returned HTML so the film table can be located in the next cell
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
#Grab the first table found on the page
table = soup.find_all('table')[0]

#Rebuild the column names from the first header section of the table:
#split its text on newlines and keep only the non-empty pieces
header = table.find_all('thead')[0]
columns = [label for label in header.get_text().split('\n') if label]

#Start from an empty dataframe that carries just those column names
criterion = pd.DataFrame(columns=columns)
criterion


Unnamed: 0,Spine #,Title,Director,Year,Country


In [5]:
#Pulling the movie info
rows = table.find_all('tr', {'class' : 'gridFilm'})

#Positions (after removing tabs and splitting the row text on newlines) where each field
#is read from, listed in the same order as the header names in `columns`:
#spine, title, director, year, country
positions = [2, 9, 14, 17, 20]

#Collect one record per movie and build the dataframe once at the end.
#Concatenating inside the loop is quadratic, and appending to the frame created in an
#earlier cell meant that re-running this cell duplicated every movie.
records = []
for row in rows:
    movie_row = row.get_text().replace('\t','').split('\n')

    #when information is missing from a movie fill in with a blank
    for p in positions:
        #pad rows that are shorter than expected so the lookup cannot raise IndexError
        if p >= len(movie_row):
            movie_row.extend([''] * (p - len(movie_row) + 1))
        if movie_row[p] == '':
            movie_row.insert(p, '')

    #extract the information for the movie and pair it with the header names
    records.append(dict(zip(columns, [movie_row[p] for p in positions])))

#Build the overall criterion collection in one step (this cell is now safe to re-run)
criterion = pd.DataFrame(records, columns=columns)

#Make Number Columns Numeric
#Values that cannot be parsed become NaN (errors='coerce'); a column containing NaN
#stays floating point, so downcast='integer' only takes effect when every value parses.
criterion['Spine #'] = pd.to_numeric(criterion['Spine #'], errors = 'coerce', downcast = 'integer')
criterion['Year'] = pd.to_numeric(criterion['Year'], errors = 'coerce', downcast = 'integer')

#Clean Titles/Remove Extra Characters (leading/trailing whitespace)
criterion['Title'] = criterion['Title'].str.strip()

#sometimes the movie year can vary a little bit depending on the source, so I am going to create a one year buffer
criterion['Year_low'] = criterion['Year'] - 1
criterion['Year_high'] = criterion['Year'] + 1

criterion.head()

Unnamed: 0,Spine #,Title,Director,Year,Country,Year_low,Year_high
0,1.0,Grand Illusion,Jean Renoir,1937.0,France,1936.0,1938.0
1,2.0,Seven Samurai,Akira Kurosawa,1954.0,Japan,1953.0,1955.0
2,3.0,The Lady Vanishes,Alfred Hitchcock,1938.0,,1937.0,1939.0
3,4.0,Amarcord,Federico Fellini,1973.0,Italy,1972.0,1974.0
4,5.0,The 400 Blows,François Truffaut,1959.0,,1958.0,1960.0


In [6]:
#Sanity check: how many films does each curator offer? (FilmStruck, Criterion)
fs.shape[0], criterion.shape[0]

(2219, 1295)

In [7]:
#Create a variable that determines if a FilmStruck film is also a part of the Criterion Collection

#Tag every FilmStruck film with its row position so the merged rows can be collapsed
#back to exactly one row per FilmStruck film afterwards
fs_films = fs[['Title', 'Year']].reset_index(drop=True)
fs_keyed = fs_films.rename_axis('fs_row').reset_index()

#Merge FilmStruck Library with Criterion library by Title. Different movies may share a
#title, so one FilmStruck film can match several Criterion rows. Only the year window is
#taken from Criterion, which also avoids the Year_x/Year_y suffixes.
matches = pd.merge(fs_keyed, criterion[['Title', 'Year_low', 'Year_high']], on='Title', how='left')

#Determine if a Film with a matched title is actually in Criterion by looking at the Year.
#If the Year in FilmStruck is +/- 1 away from the year in Criterion, then we'll say it's the same film.
#Comparisons against NaN (no title match, or no Criterion year) evaluate to False.
matches['in_criterion'] = matches['Year'].between(matches['Year_low'], matches['Year_high'])

#Collapse the merge back to one flag per FilmStruck film: it is in Criterion if ANY of its
#title matches falls inside the year window. The previous approach dropped non-matching
#duplicate rows by title, which removed a film from the library entirely when it matched
#several Criterion titles but none on year, and left duplicate rows behind when more than
#one Criterion entry fell inside the window.
in_criterion = matches.groupby('fs_row')['in_criterion'].any()

#Attach the flag by row position; every FilmStruck film appears exactly once
test = fs_films.assign(in_criterion=in_criterion)
test.head(10)


Unnamed: 0,Title,Year,in_criterion
0,(nostalgia),1971,False
1,(The [End) of History Illusion]: Miu Miu Women...,2017,False
2,...And the Pursuit of Happiness,1986,False
3,...But Film is My Mistress,2010,False
4,13 Days in France,1968,False
5,16 Days of Glory,1986,False
6,2 Days in Paris,2007,False
7,2 or 3 Things I Know About Her,1967,True
8,2010,1984,False
9,21 Days,1940,False


In [8]:
#Compare the release-year distribution of Criterion vs. non-Criterion films
test['Year'].groupby(test['in_criterion']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
in_criterion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,1567.0,1966.066369,25.261153,1907.0,1947.0,1964.0,1985.0,2018.0
True,652.0,1965.150307,20.116606,1921.0,1952.0,1965.0,1978.0,2017.0


In [9]:
#Write the FilmStruck library, now carrying the in_criterion flag, out to CSV
#using the 'utf-8-sig' encoding
output_csv = '/Users/kerrydriscoll/Desktop/filmstruck_with_criterion_status.csv'
test.to_csv(output_csv, encoding='utf-8-sig')