In [171]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm

# Define Functions

In [186]:
def scrape_lists(base, path, proxies=None, min_year=2015, timeout=None):
    """Scrape every page of a category listing into a single DataFrame.

    Starting at ``urljoin(base, path)``, each listing page is downloaded and
    the article titles, links and publication years are read from it. The
    "next page" link (``.js-wpv-pagination-next-link``) is followed until a
    page has none.

    Parameters
    ----------
    base : str
        Site root; `path` and every next-page href are resolved against it.
    path : str
        Path (or absolute URL) of the first listing page.
    proxies : dict, optional
        Proxy mapping handed to ``requests.get``. When omitted, a
        notebook-level ``proxies`` variable is used if one exists (previously
        the only way to configure it); otherwise no mapping is passed.
    min_year : int, optional
        Only articles published strictly after this year are returned.
    timeout : float or tuple, optional
        Handed to ``requests.get``; the default ``None`` sets no timeout.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``link`` and ``year``; rows are in page order and
        the index runs 0..n-1.
    """
    if proxies is None:
        # Backward compatibility: earlier versions read the global directly and
        # raised NameError when the cell defining it had not been run yet.
        proxies = globals().get('proxies')

    pages = []
    visited = set()
    url = urljoin(base, path)
    # Walk the pagination iteratively: one frame per page, one concat at the end.
    while url is not None and url not in visited:
        visited.add(url)  # guards against a "next" link that loops back
        response = requests.get(url, proxies=proxies, timeout=timeout)
        soup = BeautifulSoup(response.content)

        titles = [x.text for x in soup.select('.row .articleLink .title')]
        links = [x['href'] for x in soup.select('.row .articleLink')]
        # Strip before slicing: with trailing whitespace the last four
        # characters are no longer the year, and int() would quietly accept
        # the truncated digits instead of failing.
        years = [int(x.text.strip()[-4:])
                 for x in soup.select('.row .publication-date')]
        pages.append(pd.DataFrame({'title': titles, 'link': links, 'year': years}))

        next_link = soup.select_one('.js-wpv-pagination-next-link')
        url = urljoin(base, next_link['href']) if next_link is not None else None

    listing = pd.concat(pages, axis=0, ignore_index=True)
    return listing[listing['year'] > min_year].reset_index(drop=True)

In [133]:
def get_movies(url, proxies=None, timeout=None):
    """Return the movies linked from one guide/countdown article.

    Parameters
    ----------
    url : str
        Address of the article page.
    proxies : dict, optional
        Proxy mapping handed to ``requests.get``. Defaults to the mapping this
        function has always used, so existing calls behave as before.
    timeout : float or tuple, optional
        Handed to ``requests.get``; the default ``None`` sets no timeout.

    Returns
    -------
    dict
        ``{link text: href}`` for every ``.countdown-item h2 a`` element on
        the page. Because the link text is the key, entries that share the
        same text collapse into one (the last occurrence wins).
    """
    if proxies is None:
        # Historical default, kept so callers that pass only `url` are unaffected.
        proxies = {'http': 'http://206.189.157.23'}
    response = requests.get(url, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(response.content)
    return {anchor.text: anchor['href']
            for anchor in soup.select('.countdown-item h2 a')}

# Scrape Articles from News Categories

## Scrape All-Time-Lists

In [187]:
# Notebook-level settings; `proxies` is the global that scrape_lists picks up.
# NOTE(review): requests selects a proxy by URL scheme, and this mapping only
# has an 'http' entry while `base` is https -- confirm the proxy is applied.
proxies = {'http': 'http://206.189.157.23'}
base = 'https://editorial.rottentomatoes.com/'
path = 'all-time-lists'

# Crawl the paginated "all-time-lists" category into a title/link/year frame.
all_time_lists = scrape_lists(base, path)
all_time_lists

Unnamed: 0,title,link,year
0,The 30 Worst-Reviewed Remakes of All Time,https://editorial.rottentomatoes.com/guide/wor...,2022
1,57 Worst Sequels of All Time,https://editorial.rottentomatoes.com/guide/wor...,2022
2,"All 95 Best Picture Winners, Ranked by Tomato...",https://editorial.rottentomatoes.com/guide/osc...,2022
3,"95 Best Computer-Animated Movies, Ranked by T...",https://editorial.rottentomatoes.com/guide/bes...,2022
4,The 81 Best Asian-American Movies of All Time,https://editorial.rottentomatoes.com/guide/bes...,2022
5,88 Best Heist Movies of All Time,https://editorial.rottentomatoes.com/guide/bes...,2021
6,"84 Best Fantasy Movies of All Time, Ranked",https://editorial.rottentomatoes.com/guide/bes...,2021
7,200 Best Horror Movies of All Time,https://editorial.rottentomatoes.com/guide/bes...,2021
8,150 Best Sports Movies of All Time,https://editorial.rottentomatoes.com/guide/bes...,2021
9,150 Essential Comedy Movies To Watch Now,https://editorial.rottentomatoes.com/guide/ess...,2021


## Scrape Countdowns

In [188]:
# Same settings as the all-time-lists cell, repeated so this cell can be run
# on its own; only `path` differs.
# NOTE(review): the proxy mapping has no 'https' entry although `base` is
# https -- confirm the proxy is applied.
proxies = {'http': 'http://206.189.157.23'}
base = 'https://editorial.rottentomatoes.com/'
path = 'countdown'

# Crawl the paginated "countdown" category into a title/link/year frame.
countdown_lists = scrape_lists(base, path)
countdown_lists

Unnamed: 0,title,link,year
0,24 Batman Animated Movies Ranked,https://editorial.rottentomatoes.com/guide/bat...,2022
1,All 30 Marvel Movies Ranked: See MCU Movies B...,https://editorial.rottentomatoes.com/guide/all...,2022
2,Every Steven Spielberg Movie Ranked by Tomato...,https://editorial.rottentomatoes.com/guide/eve...,2022
3,Time-Travel TV Shows Ranked by Tomatometer,https://editorial.rottentomatoes.com/article/t...,2022
4,100 Best Christmas Movies of All Time – Class...,https://editorial.rottentomatoes.com/guide/bes...,2022
...,...,...,...
418,Venice Film Festival 2017 Scorecard,https://editorial.rottentomatoes.com/guide/ven...,2017
419,Sundance 2017 Tomatometer Scores,https://editorial.rottentomatoes.com/guide/sun...,2017
420,Holiday Movie Scorecard 2017,https://editorial.rottentomatoes.com/guide/hol...,2017
421,Sundance 2016 Tomatometer Scores,https://editorial.rottentomatoes.com/guide/sun...,2016


# Keep Only Movie Articles

In [198]:
# Keep the countdown articles whose title contains "Movie" (case-sensitive),
# renumber the surviving rows from 0 and work on an independent copy.
mv_countdown_lists = (
    countdown_lists
    .loc[countdown_lists['title'].str.contains('Movie')]
    .reset_index(drop=True)
    .copy()
)
mv_countdown_lists

Unnamed: 0,title,link,year
0,24 Batman Animated Movies Ranked,https://editorial.rottentomatoes.com/guide/bat...,2022
1,All 30 Marvel Movies Ranked: See MCU Movies B...,https://editorial.rottentomatoes.com/guide/all...,2022
2,Every Steven Spielberg Movie Ranked by Tomato...,https://editorial.rottentomatoes.com/guide/eve...,2022
3,100 Best Christmas Movies of All Time – Class...,https://editorial.rottentomatoes.com/guide/bes...,2022
4,Daniel Day-Lewis Movies Ranked by Tomatometer,https://editorial.rottentomatoes.com/guide/dan...,2022
...,...,...,...
379,100 Essential Action Movies,https://editorial.rottentomatoes.com/guide/140...,2017
380,36 Essential Fresh Holiday Family Movies,https://editorial.rottentomatoes.com/guide/36-...,2017
381,Scary Movies of 2017 by Tomatometer,https://editorial.rottentomatoes.com/guide/sca...,2017
382,Best Horror Movies by Year Since 1920,https://editorial.rottentomatoes.com/guide/bes...,2017


# Scrape Movie Links from Articles

## Sample Dictionary Output

In [144]:
# Spot-check get_movies on one guide page; the result maps link text to href.
get_movies('https://editorial.rottentomatoes.com/guide/worst-remakes-of-all-time/')

{'One Missed Call': 'https://www.rottentomatoes.com/m/one_missed_call',
 'Cabin Fever': 'https://www.rottentomatoes.com/m/cabin_fever_2016',
 'Kite': 'https://www.rottentomatoes.com/m/kite_2014',
 'Rollerball': 'https://www.rottentomatoes.com/m/rollerball_2002',
 'Flatliners': 'https://www.rottentomatoes.com/m/flatliners_2017',
 'The Fog': 'https://www.rottentomatoes.com/m/fog',
 "Jacob's Ladder": 'https://www.rottentomatoes.com/m/jacobs_ladder_2019',
 'Swept Away': 'https://www.rottentomatoes.com/m/1116774-swept_away',
 'Yours, Mine & Ours': 'https://www.rottentomatoes.com/m/yours_mine_and_ours',
 'Taxi': 'https://www.rottentomatoes.com/m/taxi',
 'Bangkok Dangerous': 'https://www.rottentomatoes.com/m/bangkok_dangerous',
 'When a Stranger Calls': 'https://www.rottentomatoes.com/m/when_a_stranger_calls',
 'Shutter': 'https://www.rottentomatoes.com/m/10009254-shutter',
 'Martyrs': 'https://www.rottentomatoes.com/m/martyrs_2016',
 'Point Break': 'https://www.rottentomatoes.com/m/point_bre

## Store Link Dictionaries in the DataFrames

In [191]:
# One HTTP request per article: build each guide's {link text: href} mapping,
# in row order, with a tqdm progress bar.
all_time_dicts = [get_movies(x) for x in tqdm(all_time_lists['link'])]
# A plain list is assigned by position, so entry i is stored on row i.
all_time_lists['movies'] = all_time_dicts
all_time_lists

100%|██████████| 21/21 [02:04<00:00,  5.91s/it]


In [199]:
# One HTTP request per movie countdown article; the recorded run below took
# roughly 56 minutes for 384 articles, so avoid re-running this needlessly.
mv_countdown_dicts = [get_movies(x) for x in tqdm(mv_countdown_lists['link'])]
# A plain list is assigned by position, so entry i is stored on row i.
mv_countdown_lists['movies'] = mv_countdown_dicts

mv_countdown_lists

100%|██████████| 384/384 [55:42<00:00,  8.70s/it]  


Unnamed: 0,title,link,year,movies
0,24 Batman Animated Movies Ranked,https://editorial.rottentomatoes.com/guide/bat...,2022,{'The LEGO Batman Movie': 'https://www.rottent...
1,All 30 Marvel Movies Ranked: See MCU Movies B...,https://editorial.rottentomatoes.com/guide/all...,2022,{'Black Panther': 'https://www.rottentomatoes....
2,Every Steven Spielberg Movie Ranked by Tomato...,https://editorial.rottentomatoes.com/guide/eve...,2022,{'E.T. the Extra-Terrestrial': 'https://www.ro...
3,100 Best Christmas Movies of All Time – Class...,https://editorial.rottentomatoes.com/guide/bes...,2022,{'The Shop Around the Corner': 'https://www.ro...
4,Daniel Day-Lewis Movies Ranked by Tomatometer,https://editorial.rottentomatoes.com/guide/dan...,2022,{'My Left Foot': 'https://www.rottentomatoes.c...
...,...,...,...,...
379,100 Essential Action Movies,https://editorial.rottentomatoes.com/guide/140...,2017,{'Running Scared': 'https://www.rottentomatoes...
380,36 Essential Fresh Holiday Family Movies,https://editorial.rottentomatoes.com/guide/36-...,2017,{'Mickey's Christmas Carol': 'https://www.rott...
381,Scary Movies of 2017 by Tomatometer,https://editorial.rottentomatoes.com/guide/sca...,2017,{'Creep 2': 'https://www.rottentomatoes.com/m/...
382,Best Horror Movies by Year Since 1920,https://editorial.rottentomatoes.com/guide/bes...,2017,{'The Cabinet of Dr. Caligari': 'https://www.r...


## Fill In the Missing Value Caused by a Malformed Link on the Website

In [None]:
# Some article links come back from the site with an extra 'http://' in front
# of the real 'https://...' address, which leaves their 'movies' entry missing
# after the bulk scrape. Locate those rows by that pattern instead of by a
# hard-coded row label (which changes whenever the scraped listing changes),
# then re-scrape each one with the extra prefix removed.
# The stored 'link' value itself is intentionally left as scraped.
EXTRA_PREFIX = 'http://'
has_double_scheme = mv_countdown_lists['link'].str.startswith(EXTRA_PREFIX + 'https://')
for row_label in mv_countdown_lists.index[has_double_scheme]:
    repaired_link = mv_countdown_lists.loc[row_label, 'link'][len(EXTRA_PREFIX):]
    missing_dict = get_movies(repaired_link)
    mv_countdown_lists.at[row_label, 'movies'] = missing_dict

# Save DataFrames to Pickle Files

In [202]:
# Persist the all-time-lists frame, including its 'movies' column, as a pickle.
# NOTE(review): absolute, environment-specific path -- adjust when run elsewhere.
all_time_lists.to_pickle("/mnt/processed/private/msds2023/lt2/Lab2/All_Time_Lists.pkl")

In [247]:
# Persist the movie countdown frame, including its 'movies' column, as a pickle.
# NOTE(review): absolute, environment-specific path -- adjust when run elsewhere.
mv_countdown_lists.to_pickle("/mnt/processed/private/msds2023/lt2/Lab2/Countdown_Lists.pkl")

In [250]:
# Reload the saved file and show row 285 to confirm its 'movies' entry was stored.
pd.read_pickle("/mnt/processed/private/msds2023/lt2/Lab2/Countdown_Lists.pkl").loc[285,:]

title         All Wes Anderson Movies Ranked by Tomatometer
link      http://https://editorialadmin.rottentomatoes.c...
year                                                   2020
movies    {'Moonrise Kingdom': 'https://www.rottentomato...
Name: 285, dtype: object

In [231]:
# Total number of movie entries collected across all countdown articles.
mv_countdown_lists['movies'].map(len).sum()

15749

In [249]:
# Print the kernel's working directory (shell escape).
!pwd

/mnt/processed/private/msds2023/lt2/Lab2
