# Web Scraping Using Regex

In [1]:
# Evelyn Lee
# April 17, 2022

# This program retrieves the top 100 movies of 2012 and their information from
# Metacritic using regular expressions and prints the result as a DataFrame.

In [2]:
# Download the Metacritic page that lists 2012 movies sorted by Metascore.

import urllib3
import certifi

# Address of the listing page to scrape
url = 'https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2012&sort=desc&view=detailed'

# Pool manager that issues the request; certifi's CA bundle is passed as ca_certs
http = urllib3.PoolManager(ca_certs=certifi.where())

# Issue the GET request, sending a browser-style User-Agent header to the server
r = http.request(
    'GET',
    url,
    headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko)'},
)

# Decode the raw response bytes into UTF-8 text
datastring = r.data.decode('utf-8')

# Report how many bytes came back and the HTTP status of the response
print(f'Fetched {len(r.data)} bytes from {url}. Status: {r.status}')

Fetched 520965 bytes from https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=2012&sort=desc&view=detailed. Status: 200


In [3]:
# Extract each movie attribute from the page text with its own regular
# expression and collect the matches in a dictionary keyed by column name.

import re

# One pattern per output column, in the same order as `columns_name`.
# Raw strings are used so the regex escapes (\/, \s, \S) reach the regex
# engine as written; in ordinary string literals they are invalid string
# escapes, which Python reports with a DeprecationWarning/SyntaxWarning.
reg_code = [
    r'<a href="\/movie\/.*" class="title"><h3>(.*)<\/h3><\/a>',
    r'<span>(.*20.*)<',
    r'<div class="summary">\s*([\s\S]*?)\s*<\/div>',
    r'n>\s*.*\s<div class="metascore_w large movie positive">(.*)<',
    r'<a href="\/movie\/.*"><img src="(.*)" alt=".*"',
]
# Column names, positionally paired with the patterns above
columns_name = ['Title','Release Date','Description','Metascore','Thumbnail']

# One list of captured strings per pattern, in pattern order
lst = [re.findall(pattern, datastring) for pattern in reg_code]

# Each pattern is matched independently, so the columns only line up row by
# row when every pattern found the same number of items. Fail here with the
# per-column counts rather than later with a less specific error.
match_counts = {column: len(matches) for column, matches in zip(columns_name, lst)}
if len(set(match_counts.values())) > 1:
    raise ValueError(f'Regex patterns matched different numbers of items: {match_counts}')

# Dictionary of column name -> list of captured values.
# NOTE: this name shadows the built-in `dict`; it is kept because the
# DataFrame cell reads the data from this variable.
dict = {column: matches for column, matches in zip(columns_name, lst)}

In [4]:
# Show the scraped movie data as a DataFrame indexed by movie title.

import pandas as pd

# Build one column per dictionary key, then use the titles as the index.
# The default integer index is not renumbered first: set_index replaces it
# with the 'Title' column, so any renumbering would be discarded.
df = pd.DataFrame.from_dict(dict).set_index('Title')
df

Unnamed: 0_level_0,Release Date,Description,Metascore,Thumbnail
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Children of Paradise (1945),"March 9, 2012",Children of Paradise is the tale of a woman lo...,96,https://static.metacritic.com/images/products/...
Zero Dark Thirty,"December 19, 2012","For a decade, an elite team of intelligence an...",95,https://static.metacritic.com/images/products/...
Amour,"December 19, 2012",Georges and Anne are in their eighties. They a...,94,https://static.metacritic.com/images/products/...
It's Such a Beautiful Day,"October 5, 2012",Bill struggles to put together his shattered p...,90,https://static.metacritic.com/images/products/...
This Is Not a Film,"March 2, 2012","This clandestine documentary, shot partially o...",90,https://static.metacritic.com/images/products/...
...,...,...,...,...
Mea Maxima Culpa: Silence in the House of God,"November 16, 2012",Alex Gibney examines the charged issue of pedo...,73,https://static.metacritic.com/images/products/...
A Royal Affair,"November 9, 2012",A Royal Affair is the true story of an ordinar...,73,https://static.metacritic.com/images/products/...
Attenberg,"March 9, 2012","Part of the new wave of Greek cinema, Attenber...",73,https://static.metacritic.com/images/products/...
Knuckleball!,"September 21, 2012",The film follows the Major League’s only knuck...,73,https://static.metacritic.com/images/products/...
