# Set-up

In [31]:
# load packages
import requests
from bs4 import BeautifulSoup

In [32]:
# Give the URL of the site. For Pandas DF and csv output, some url's can give better results
base_site = "https://editorial.rottentomatoes.com/guide/best-horror-movies-of-all-time/"

In [33]:
# sending a request to the webpage
response = requests.get(base_site)
response.status_code

200

In [34]:
# get the HTML from the webpage
html = response.content

## Choosing a parser

### html.parser

In [35]:
# convert the HTML to a Beautiful Soup object
soup = BeautifulSoup(html, 'html.parser')

In [36]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_HTML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

### lxml

In [37]:
# convert the HTML to a BeatifulSoup object
soup = BeautifulSoup(html, 'lxml')

In [38]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

### A word of caution

## Finding an element containing all the data

In [39]:
# Finding all div tags on the webpage containing the information we want to scrape
divs = soup.find_all("div", {"class": "col-sm-18 col-full-xs countdown-item-content"})
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors">A Nightmare on Elm Street 3: Dream Warriors</a> <span class="subtle start-year">(1987)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">72%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#200</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>75042% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the n

# Extracting the title, year and score of each movie

In [40]:

divs[0].find("h2")

<h2><a href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors">A Nightmare on Elm Street 3: Dream Warriors</a> <span class="subtle start-year">(1987)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">72%</span></h2>

In [41]:
# Extracting all 'h2' tags
headings = [div.find("h2") for div in divs]
headings

[<h2><a href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors">A Nightmare on Elm Street 3: Dream Warriors</a> <span class="subtle start-year">(1987)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">72%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/phenomena">Creepers</a> <span class="subtle start-year">(1985)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">76%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/bram_stokers_dracula">Bram Stoker's Dracula</a> <span class="subtle start-year">(1992)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">75%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/hellraiser">Hellraiser</a> <span class="subtle start-year">(1987)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">72%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/1010793-it

In [42]:
# Inspecting the text inside the headings
[heading.text for heading in headings]

['A Nightmare on Elm Street 3: Dream Warriors (1987)  72%',
 'Creepers (1985)  76%',
 "Bram Stoker's Dracula (1992)  75%",
 'Hellraiser (1987)  72%',
 "It's Alive! (1974)  70%",
 "Jacob's Ladder (1990)  73%",
 'Open Water (2003)  71%',
 'The Mist (2007)  71%',
 'The Ring (2002)  71%',
 'Phantasm (1979)  74%',
 'Frailty (2002)  75%',
 'Dog Soldiers (2002)  79%',
 'Basket Case (1982)  76%',
 'Eden Lake (2008)  80%',
 'Candyman (1992)  77%',
 'Oculus (2013)  74%',
 'Land of the Dead (2005)  74%',
 'Night of the Comet (1984)  79%',
 "Wes Craven's New Nightmare (1994)  79%",
 "Trick 'r Treat (2007)  84%",
 'The Lost Boys (1987)  76%',
 'The Lodge (2019)  74%',
 'Scream (1996)  79%',
 'Southbound (2015)  81%',
 'Lights Out (2016)  76%',
 'The Platform (2019)  79%',
 'The Brood (1979)  83%',
 "The Cat o' Nine Tails (1971)  82%",
 'Mute Witness (1995)  83%',
 "You're Next (2011)  79%",
 'Videodrome (1983)  78%',
 'Thirst (2009)  80%',
 'Audition (1999)  82%',
 'Three...Extremes (Saam gaang yi)

In [43]:
# It does contain the info we want to extract
# However, we need to obtain the title, year and score separately
# Let's inspect one heading to see if there is a way to distinguish between them
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors">A Nightmare on Elm Street 3: Dream Warriors</a> <span class="subtle start-year">(1987)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">72%</span></h2>

In [44]:
# We notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

## Title

In [45]:
# Let's check all heading links
[heading.find('a') for heading in headings]

[<a href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors">A Nightmare on Elm Street 3: Dream Warriors</a>,
 <a href="https://www.rottentomatoes.com/m/phenomena">Creepers</a>,
 <a href="https://www.rottentomatoes.com/m/bram_stokers_dracula">Bram Stoker's Dracula</a>,
 <a href="https://www.rottentomatoes.com/m/hellraiser">Hellraiser</a>,
 <a href="https://www.rottentomatoes.com/m/1010793-its_alive">It's Alive!</a>,
 <a href="https://www.rottentomatoes.com/m/jacobs_ladder">Jacob's Ladder</a>,
 <a href="https://www.rottentomatoes.com/m/open_water">Open Water</a>,
 <a href="https://www.rottentomatoes.com/m/mist">The Mist</a>,
 <a href="https://www.rottentomatoes.com/m/ring">The Ring</a>,
 <a href="https://www.rottentomatoes.com/m/phantasm">Phantasm</a>,
 <a href="https://www.rottentomatoes.com/m/frailty">Frailty</a>,
 <a href="https://www.rottentomatoes.com/m/dog_soldiers">Dog Soldiers</a>,
 <a href="https://www.rottentomatoes.com/m/basket_case">Basket Case</a>,

In [46]:
# Obtaining the movie titles from the links
movie_names = [heading.find('a').string for heading in headings]
movie_names

['A Nightmare on Elm Street 3: Dream Warriors',
 'Creepers',
 "Bram Stoker's Dracula",
 'Hellraiser',
 "It's Alive!",
 "Jacob's Ladder",
 'Open Water',
 'The Mist',
 'The Ring',
 'Phantasm',
 'Frailty',
 'Dog Soldiers',
 'Basket Case',
 'Eden Lake',
 'Candyman',
 'Oculus',
 'Land of the Dead',
 'Night of the Comet',
 "Wes Craven's New Nightmare",
 "Trick 'r Treat",
 'The Lost Boys',
 'The Lodge',
 'Scream',
 'Southbound',
 'Lights Out',
 'The Platform',
 'The Brood',
 "The Cat o' Nine Tails",
 'Mute Witness',
 "You're Next",
 'Videodrome',
 'Thirst',
 'Audition',
 'Three...Extremes (Saam gaang yi)',
 'Dead Ringers',
 'Scream 2',
 '1408',
 'A Tale of Two Sisters',
 'Ouija: Origin of Evil',
 'Pontypool',
 'The Shallows',
 'Shadow of the Vampire',
 'The House of the Devil',
 'Tucker & Dale vs Evil',
 "Dracula: Pages From a Virgin's Diary",
 'The Others',
 'Creature From the Black Lagoon',
 'Tremors',
 'Creep',
 'Carnival of Souls',
 'Rec',
 'Paranormal Activity',
 'The Conjuring 2',
 'Gin

## Year

In [47]:
# Filtering only the spans containing the year
[heading.find("span", class_ = 'start-year') for heading in headings]

[<span class="subtle start-year">(1987)</span>,
 <span class="subtle start-year">(1985)</span>,
 <span class="subtle start-year">(1992)</span>,
 <span class="subtle start-year">(1987)</span>,
 <span class="subtle start-year">(1974)</span>,
 <span class="subtle start-year">(1990)</span>,
 <span class="subtle start-year">(2003)</span>,
 <span class="subtle start-year">(2007)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(1979)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(1982)</span>,
 <span class="subtle start-year">(2008)</span>,
 <span class="subtle start-year">(1992)</span>,
 <span class="subtle start-year">(2013)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(1984)</span>,
 <span class="subtle start-year">(1994)</span>,
 <span class="subtle start-year">(2007)</span>,
 <span class="subtle start-year">(1987)<

In [48]:
# Extracting the year string
years = [heading.find("span", class_ = 'start-year').string for heading in headings]
years

['(1987)',
 '(1985)',
 '(1992)',
 '(1987)',
 '(1974)',
 '(1990)',
 '(2003)',
 '(2007)',
 '(2002)',
 '(1979)',
 '(2002)',
 '(2002)',
 '(1982)',
 '(2008)',
 '(1992)',
 '(2013)',
 '(2005)',
 '(1984)',
 '(1994)',
 '(2007)',
 '(1987)',
 '(2019)',
 '(1996)',
 '(2015)',
 '(2016)',
 '(2019)',
 '(1979)',
 '(1971)',
 '(1995)',
 '(2011)',
 '(1983)',
 '(2009)',
 '(1999)',
 '(2005)',
 '(1988)',
 '(1997)',
 '(2007)',
 '(2003)',
 '(2016)',
 '(2008)',
 '(2016)',
 '(2000)',
 '(2009)',
 '(2010)',
 '(2002)',
 '(2001)',
 '(1954)',
 '(1990)',
 '(2014)',
 '(1962)',
 '(2007)',
 '(2007)',
 '(2016)',
 '(2000)',
 '(2014)',
 '(2010)',
 '(1973)',
 '(2017)',
 '(2006)',
 '(1959)',
 '(1982)',
 '(2016)',
 '(1982)',
 '(1992)',
 '(2018)',
 '(1976)',
 '(2005)',
 '(2016)',
 '(1981)',
 '(2019)',
 '(1976)',
 '(2016)',
 '(2019)',
 '(2019)',
 '(1973)',
 '(2015)',
 '(2016)',
 '(2017)',
 '(1971)',
 '(1978)',
 '(1977)',
 '(1986)',
 '(2010)',
 '(2021)',
 '(2007)',
 '(1983)',
 '(2015)',
 '(1971)',
 '(1960)',
 '(1999)',
 '(2015)',

In [49]:
years[0]

'(1987)'

### Removing the brackets

In [50]:
# One way to remove the brackets is to drop the first and last symbol of the string
years[0][1:-1]

'1987'

In [51]:
# However, this will break, if the format of the year is changed

In [52]:
# Alternativelly, we can do it with the help of the strip() method (this is robust)

# It removes leading and trailing symbols from a string
# By default, it removes whitespace, but we can specify other symbols to strip

In [53]:
# Removing '('
years[0].strip('(')

'1987)'

In [54]:
# Removing ')'
years[0].strip(')')

'(1987'

In [55]:
# Combining both
years[0].strip('()')

'1987'

In [56]:
# Updating years with stripped values
years = [year.strip('()') for year in years]
years

['1987',
 '1985',
 '1992',
 '1987',
 '1974',
 '1990',
 '2003',
 '2007',
 '2002',
 '1979',
 '2002',
 '2002',
 '1982',
 '2008',
 '1992',
 '2013',
 '2005',
 '1984',
 '1994',
 '2007',
 '1987',
 '2019',
 '1996',
 '2015',
 '2016',
 '2019',
 '1979',
 '1971',
 '1995',
 '2011',
 '1983',
 '2009',
 '1999',
 '2005',
 '1988',
 '1997',
 '2007',
 '2003',
 '2016',
 '2008',
 '2016',
 '2000',
 '2009',
 '2010',
 '2002',
 '2001',
 '1954',
 '1990',
 '2014',
 '1962',
 '2007',
 '2007',
 '2016',
 '2000',
 '2014',
 '2010',
 '1973',
 '2017',
 '2006',
 '1959',
 '1982',
 '2016',
 '1982',
 '1992',
 '2018',
 '1976',
 '2005',
 '2016',
 '1981',
 '2019',
 '1976',
 '2016',
 '2019',
 '2019',
 '1973',
 '2015',
 '2016',
 '2017',
 '1971',
 '1978',
 '1977',
 '1986',
 '2010',
 '2021',
 '2007',
 '1983',
 '2015',
 '1971',
 '1960',
 '1999',
 '2015',
 '2017',
 '2002',
 '2016',
 '2015',
 '1958',
 '1985',
 '1973',
 '2015',
 '1974',
 '2013',
 '1990',
 '1960',
 '2019',
 '2010',
 '2018',
 '1973',
 '2011',
 '1977',
 '2001',
 '2014',
 

In [57]:
# Converting all the strings to integers
years = [int(year) for year in years]
years

[1987,
 1985,
 1992,
 1987,
 1974,
 1990,
 2003,
 2007,
 2002,
 1979,
 2002,
 2002,
 1982,
 2008,
 1992,
 2013,
 2005,
 1984,
 1994,
 2007,
 1987,
 2019,
 1996,
 2015,
 2016,
 2019,
 1979,
 1971,
 1995,
 2011,
 1983,
 2009,
 1999,
 2005,
 1988,
 1997,
 2007,
 2003,
 2016,
 2008,
 2016,
 2000,
 2009,
 2010,
 2002,
 2001,
 1954,
 1990,
 2014,
 1962,
 2007,
 2007,
 2016,
 2000,
 2014,
 2010,
 1973,
 2017,
 2006,
 1959,
 1982,
 2016,
 1982,
 1992,
 2018,
 1976,
 2005,
 2016,
 1981,
 2019,
 1976,
 2016,
 2019,
 2019,
 1973,
 2015,
 2016,
 2017,
 1971,
 1978,
 1977,
 1986,
 2010,
 2021,
 2007,
 1983,
 2015,
 1971,
 1960,
 1999,
 2015,
 2017,
 2002,
 2016,
 2015,
 1958,
 1985,
 1973,
 2015,
 1974,
 2013,
 1990,
 1960,
 2019,
 2010,
 2018,
 1973,
 2011,
 1977,
 2001,
 2014,
 2015,
 1970,
 1925,
 1941,
 1978,
 1931,
 2004,
 2017,
 2016,
 1985,
 2009,
 2017,
 1980,
 1931,
 1975,
 1986,
 1984,
 1932,
 2006,
 1927,
 2016,
 1933,
 2017,
 1977,
 1998,
 1943,
 2016,
 2019,
 1978,
 1961,
 2009,
 1987,

## Score

In [58]:

# Filtering only the spans containing the score
[heading.find("span", class_ = 'tMeterScore') for heading in headings]

[<span class="tMeterScore">72%</span>,
 <span class="tMeterScore">76%</span>,
 <span class="tMeterScore">75%</span>,
 <span class="tMeterScore">72%</span>,
 <span class="tMeterScore">70%</span>,
 <span class="tMeterScore">73%</span>,
 <span class="tMeterScore">71%</span>,
 <span class="tMeterScore">71%</span>,
 <span class="tMeterScore">71%</span>,
 <span class="tMeterScore">74%</span>,
 <span class="tMeterScore">75%</span>,
 <span class="tMeterScore">79%</span>,
 <span class="tMeterScore">76%</span>,
 <span class="tMeterScore">80%</span>,
 <span class="tMeterScore">77%</span>,
 <span class="tMeterScore">74%</span>,
 <span class="tMeterScore">74%</span>,
 <span class="tMeterScore">79%</span>,
 <span class="tMeterScore">79%</span>,
 <span class="tMeterScore">84%</span>,
 <span class="tMeterScore">76%</span>,
 <span class="tMeterScore">74%</span>,
 <span class="tMeterScore">79%</span>,
 <span class="tMeterScore">81%</span>,
 <span class="tMeterScore">76%</span>,
 <span class="tMeterScore

In [59]:
# Extracting the score string
scores = [heading.find("span", class_ = 'tMeterScore').string for heading in headings]
scores

['72%',
 '76%',
 '75%',
 '72%',
 '70%',
 '73%',
 '71%',
 '71%',
 '71%',
 '74%',
 '75%',
 '79%',
 '76%',
 '80%',
 '77%',
 '74%',
 '74%',
 '79%',
 '79%',
 '84%',
 '76%',
 '74%',
 '79%',
 '81%',
 '76%',
 '79%',
 '83%',
 '82%',
 '83%',
 '79%',
 '78%',
 '80%',
 '82%',
 '84%',
 '83%',
 '81%',
 '79%',
 '85%',
 '82%',
 '84%',
 '78%',
 '82%',
 '85%',
 '85%',
 '87%',
 '83%',
 '79%',
 '86%',
 '90%',
 '86%',
 '89%',
 '83%',
 '80%',
 '90%',
 '85%',
 '90%',
 '87%',
 '88%',
 '87%',
 '89%',
 '86%',
 '85%',
 '86%',
 '88%',
 '88%',
 '88%',
 '86%',
 '89%',
 '88%',
 '78%',
 '86%',
 '90%',
 '86%',
 '83%',
 '83%',
 '89%',
 '88%',
 '91%',
 '88%',
 '90%',
 '91%',
 '87%',
 '90%',
 '84%',
 '87%',
 '90%',
 '91%',
 '88%',
 '86%',
 '86%',
 '92%',
 '91%',
 '87%',
 '91%',
 '92%',
 '90%',
 '92%',
 '88%',
 '94%',
 '89%',
 '86%',
 '90%',
 '93%',
 '86%',
 '88%',
 '79%',
 '94%',
 '90%',
 '90%',
 '92%',
 '95%',
 '95%',
 '93%',
 '90%',
 '90%',
 '94%',
 '90%',
 '92%',
 '92%',
 '88%',
 '93%',
 '89%',
 '87%',
 '84%',
 '94%',


In [60]:
# Removing the '%' sign
scores = [s.strip('%') for s in scores]
scores

['72',
 '76',
 '75',
 '72',
 '70',
 '73',
 '71',
 '71',
 '71',
 '74',
 '75',
 '79',
 '76',
 '80',
 '77',
 '74',
 '74',
 '79',
 '79',
 '84',
 '76',
 '74',
 '79',
 '81',
 '76',
 '79',
 '83',
 '82',
 '83',
 '79',
 '78',
 '80',
 '82',
 '84',
 '83',
 '81',
 '79',
 '85',
 '82',
 '84',
 '78',
 '82',
 '85',
 '85',
 '87',
 '83',
 '79',
 '86',
 '90',
 '86',
 '89',
 '83',
 '80',
 '90',
 '85',
 '90',
 '87',
 '88',
 '87',
 '89',
 '86',
 '85',
 '86',
 '88',
 '88',
 '88',
 '86',
 '89',
 '88',
 '78',
 '86',
 '90',
 '86',
 '83',
 '83',
 '89',
 '88',
 '91',
 '88',
 '90',
 '91',
 '87',
 '90',
 '84',
 '87',
 '90',
 '91',
 '88',
 '86',
 '86',
 '92',
 '91',
 '87',
 '91',
 '92',
 '90',
 '92',
 '88',
 '94',
 '89',
 '86',
 '90',
 '93',
 '86',
 '88',
 '79',
 '94',
 '90',
 '90',
 '92',
 '95',
 '95',
 '93',
 '90',
 '90',
 '94',
 '90',
 '92',
 '92',
 '88',
 '93',
 '89',
 '87',
 '84',
 '94',
 '96',
 '93',
 '95',
 '88',
 '93',
 '96',
 '94',
 '88',
 '93',
 '93',
 '97',
 '85',
 '95',
 '97',
 '92',
 '95',
 '98',
 '95',

In [61]:
# Converting each score to an integer
scores = [int(s) for s in scores]
scores

[72,
 76,
 75,
 72,
 70,
 73,
 71,
 71,
 71,
 74,
 75,
 79,
 76,
 80,
 77,
 74,
 74,
 79,
 79,
 84,
 76,
 74,
 79,
 81,
 76,
 79,
 83,
 82,
 83,
 79,
 78,
 80,
 82,
 84,
 83,
 81,
 79,
 85,
 82,
 84,
 78,
 82,
 85,
 85,
 87,
 83,
 79,
 86,
 90,
 86,
 89,
 83,
 80,
 90,
 85,
 90,
 87,
 88,
 87,
 89,
 86,
 85,
 86,
 88,
 88,
 88,
 86,
 89,
 88,
 78,
 86,
 90,
 86,
 83,
 83,
 89,
 88,
 91,
 88,
 90,
 91,
 87,
 90,
 84,
 87,
 90,
 91,
 88,
 86,
 86,
 92,
 91,
 87,
 91,
 92,
 90,
 92,
 88,
 94,
 89,
 86,
 90,
 93,
 86,
 88,
 79,
 94,
 90,
 90,
 92,
 95,
 95,
 93,
 90,
 90,
 94,
 90,
 92,
 92,
 88,
 93,
 89,
 87,
 84,
 94,
 96,
 93,
 95,
 88,
 93,
 96,
 94,
 88,
 93,
 93,
 97,
 85,
 95,
 97,
 92,
 95,
 98,
 95,
 92,
 95,
 96,
 95,
 100,
 90,
 93,
 83,
 95,
 95,
 92,
 96,
 100,
 88,
 97,
 90,
 86,
 95,
 91,
 95,
 99,
 93,
 99,
 100,
 90,
 96,
 90,
 92,
 95,
 98,
 96,
 96,
 96,
 94,
 97,
 96,
 95,
 96,
 98,
 89,
 97,
 98,
 90,
 100,
 98,
 98,
 98,
 96,
 92,
 95,
 97,
 98,
 98,
 93,
 99,
 98,
 

# Extracting the rest of the information

## Critics Consensus

In [62]:
# Getting the 'div' tags containing the critics consensus
consensus = [div.find("div", {"class": "info critics-consensus"}) for div in divs]
consensus

[<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> <em>A Nightmare on Elm Street 3: Dream Warriors</em> offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> No consensus yet.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> No consensus yet.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> No consensus yet.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Tough and unpleasant, <em>It's Alive</em> throttles the viewer with its bizarre mutant baby theatrics.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Even with its disorienting leaps of logic and structure, <em>Jacob's Ladder</em> is an engrossing, nerve-shatt

In [63]:
# Inspecting the text inside these tags
[con.text for con in consensus]

['Critics Consensus: A Nightmare on Elm Street 3: Dream Warriors offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.',
 'Critics Consensus: No consensus yet.',
 'Critics Consensus: No consensus yet.',
 'Critics Consensus: No consensus yet.',
 "Critics Consensus: Tough and unpleasant, It's Alive throttles the viewer with its bizarre mutant baby theatrics.",
 "Critics Consensus: Even with its disorienting leaps of logic and structure, Jacob's Ladder is an engrossing, nerve-shattering experience.",
 'Critics Consensus: A low budget thriller with some intense moments.',
 'Critics Consensus: No consensus yet.',
 "Critics Consensus: With little gore and a lot of creepy visuals, The Ring gets under your skin, thanks to director Gore Verbinski's haunting sense of atmosphere and an impassioned performance from Naomi Watts.",
 'Critics Consensus: No consensus yet.',
 'Critics Consensus: No consensus yet.',
 'Critics Consensus: No c

### Way #1: Text processing

In [64]:

common_phrase = 'Critics Consensus: '

In [65]:

len(common_phrase)

19

In [66]:
consensus[0].text

'Critics Consensus: A Nightmare on Elm Street 3: Dream Warriors offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.'

In [67]:

consensus[0].text[19:]

'A Nightmare on Elm Street 3: Dream Warriors offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.'

In [68]:

common_len = len(common_phrase)

In [69]:
# Cleaning the list of the common phrase
consensus_text = [con.text[common_len:] for con in consensus]
consensus_text

['A Nightmare on Elm Street 3: Dream Warriors offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 "Tough and unpleasant, It's Alive throttles the viewer with its bizarre mutant baby theatrics.",
 "Even with its disorienting leaps of logic and structure, Jacob's Ladder is an engrossing, nerve-shattering experience.",
 'A low budget thriller with some intense moments.',
 'No consensus yet.',
 "With little gore and a lot of creepy visuals, The Ring gets under your skin, thanks to director Gore Verbinski's haunting sense of atmosphere and an impassioned performance from Naomi Watts.",
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'A brutal and effective British hoodie-horror that, despite the clichÃ©s, stays on the right side of scary.',
 'No consensus yet.',
 'With an emphasis on dread over gore and an ending that 

In [70]:
# We can add if-else logic to only truncate the string in case it starts with the common phrase
consensus_text = [con.text[common_len:] if con.text.startswith(common_phrase) else con.text for con in consensus ]
consensus_text

['A Nightmare on Elm Street 3: Dream Warriors offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 "Tough and unpleasant, It's Alive throttles the viewer with its bizarre mutant baby theatrics.",
 "Even with its disorienting leaps of logic and structure, Jacob's Ladder is an engrossing, nerve-shattering experience.",
 'A low budget thriller with some intense moments.',
 'No consensus yet.',
 "With little gore and a lot of creepy visuals, The Ring gets under your skin, thanks to director Gore Verbinski's haunting sense of atmosphere and an impassioned performance from Naomi Watts.",
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'A brutal and effective British hoodie-horror that, despite the clichÃ©s, stays on the right side of scary.',
 'No consensus yet.',
 'With an emphasis on dread over gore and an ending that 

### Way #2: Inspecting the HTML

In [71]:
consensus[0]

<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> <em>A Nightmare on Elm Street 3: Dream Warriors</em> offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.</div>

In [72]:
# When inspecting the HTML we see that the common phrase ("Critics Consensus: ")
# is located inside a span element
# The string we want to obtain follows that

In [73]:
# We can use .contents to obtain a list of all children of the tag
consensus[0].contents

[<span class="descriptor">Critics Consensus:</span>,
 ' ',
 <em>A Nightmare on Elm Street 3: Dream Warriors</em>,
 ' offers an imaginative and surprisingly satisfying rebound for a franchise already starting to succumb to sequelitis.']

In [74]:
# The second element of that list is the text we want
consensus[0].contents[1]

' '

In [75]:
# We can remove the extra whitespace (space at the beginning) with the .strip() method
consensus[0].contents[1].strip()

''

In [76]:
# Processing all texts
consensus_text = [con.contents[1].strip() for con in consensus]
consensus_text

['',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'Tough and unpleasant,',
 'Even with its disorienting leaps of logic and structure,',
 'A low budget thriller with some intense moments.',
 'No consensus yet.',
 'With little gore and a lot of creepy visuals,',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'No consensus yet.',
 'A brutal and effective British hoodie-horror that, despite the clichÃ©s, stays on the right side of scary.',
 'No consensus yet.',
 'With an emphasis on dread over gore and an ending that leaves the door wide open for sequels,',
 "George A. Romero's latest entry in his much-vaunted Dead series is not as fresh as his genre-inventing original, Night of the Living Dead. But Land of the Dead does deliver on the gore and zombies-feasting-on-flesh action.",
 'Valley Girl culture satire',
 '',
 'A deftly crafted tribute to Halloween legends,',
 'No consensus yet.',
 'No consensus yet.',
 "Horror icon Wes Craven's subversive de

In [77]:
# In my opinion, this method is closer to the BeautifulSoup approach

## Directors

In [78]:
# Extracting all director divs
directors = [div.find("div", class_ = 'director') for div in divs]
directors

[<div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/chuck_russell">Chuck Russell</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/dario_argento">Dario Argento</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/francis_ford_coppola">Francis Ford Coppola</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/clive_barker">Clive Barker</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/lawrence_g_cohen">Larry Cohen</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/adrian_lyne">Adrian

In [79]:
# Inspecting a div
directors[0]

<div class="info director">
<span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/chuck_russell">Chuck Russell</a></div>

In [80]:
# The director's name can be found as the string of a link

# Obtaining all director links
[director.find("a") for director in directors]

[<a class="" href="//www.rottentomatoes.com/celebrity/chuck_russell">Chuck Russell</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/dario_argento">Dario Argento</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/francis_ford_coppola">Francis Ford Coppola</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/clive_barker">Clive Barker</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/lawrence_g_cohen">Larry Cohen</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/adrian_lyne">Adrian Lyne</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/chris_kentis">Chris Kentis</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/frank_darabont">Frank Darabont</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/gore_verbinski">Gore Verbinski</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/don_coscarelli_2">Don Coscarelli</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/bill_paxton">Bill Paxton</a>,
 <a class="" href="//

In [81]:
# Notice that one link is None - the director of Iron Man is missing!

# This means we can't simply use .string,
# because None has no string attribute

In [82]:
# Running the line below will raise an error if uncommented

#[director.find("a").string for director in directors]

In [83]:
# We can use if-else to deal with the None value

final_directors = [None if director.find("a") is None else director.find("a").string for director in directors]
final_directors

['Chuck Russell',
 'Dario Argento',
 'Francis Ford Coppola',
 'Clive Barker',
 'Larry Cohen',
 'Adrian Lyne',
 'Chris Kentis',
 'Frank Darabont',
 'Gore Verbinski',
 'Don Coscarelli',
 'Bill Paxton',
 'Neil Marshall',
 'Frank Henenlotter',
 'James Watkins',
 'Bernard Rose',
 'Mike Flanagan',
 'George Romero',
 'Thom Eberhardt',
 'Wes Craven',
 'Michael Dougherty',
 'Joel Schumacher',
 'Veronika Franz',
 'Wes Craven',
 'Roxanne Benjamin',
 'David F. Sandberg',
 'Galder Gaztelu-Urrutia',
 'David Cronenberg',
 'Dario Argento',
 'Anthony Waller',
 'Adam Wingard',
 'David Cronenberg',
 'Park Chan-wook',
 'Takashi Miike',
 'Takashi Miike',
 'David Cronenberg',
 'Wes Craven',
 'Mikael Hafstrom',
 'Kim Jee-woon',
 'Mike Flanagan',
 'Bruce McDonald',
 'Jaume Collet-Serra',
 'E. Elias Merhige',
 'Ti West',
 'Eli Craig',
 'Guy Maddin',
 'Alejandro AmenÃ¡bar',
 'Jack Arnold',
 'Ron Underwood',
 'Patrick Brice',
 'Herk Harvey',
 'Jaume BalaguerÃ³',
 'Oren Peli',
 'James Wan',
 'John Fawcett',
 'Ver

## Cast info

In [84]:
cast_info = [div.find("div", class_ = 'cast') for div in divs]
cast_info

[<div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/heather_langenkamp">Heather Langenkamp</a>, <a class="" href="//www.rottentomatoes.com/celebrity/patricia_arquette">Patricia Arquette</a>, <a class="" href="//www.rottentomatoes.com/celebrity/craig_wasson">Craig Wasson</a>, <a class="" href="//www.rottentomatoes.com/celebrity/larry_fishburne">Larry Fishburne</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/jennifer_connelly">Jennifer Connelly</a>, <a class="" href="//www.rottentomatoes.com/celebrity/donald_pleasence">Donald Pleasence</a>, <a class="" href="//www.rottentomatoes.com/celebrity/dalila_di_lazzaro">Dalila Di Lazzaro</a>, <a class="" href="//www.rottentomatoes.com/celebrity/fausta_avelli">Fausta Avelli</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/cel

In [85]:
cast_info[0]

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/heather_langenkamp">Heather Langenkamp</a>, <a class="" href="//www.rottentomatoes.com/celebrity/patricia_arquette">Patricia Arquette</a>, <a class="" href="//www.rottentomatoes.com/celebrity/craig_wasson">Craig Wasson</a>, <a class="" href="//www.rottentomatoes.com/celebrity/larry_fishburne">Larry Fishburne</a></div>

In [86]:
# Each cast member's name is the string of a link
# There are multiple cast members for a movie

In [87]:
# Let's first practice with a single movie

# Obtain all the links to different cast members
cast_links = cast_info[0].find_all('a')
cast_links

[<a class="" href="//www.rottentomatoes.com/celebrity/heather_langenkamp">Heather Langenkamp</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/patricia_arquette">Patricia Arquette</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/craig_wasson">Craig Wasson</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/larry_fishburne">Larry Fishburne</a>]

In [88]:
# Extract the names from the links
cast_names = [link.string for link in cast_links]
cast_names

['Heather Langenkamp', 'Patricia Arquette', 'Craig Wasson', 'Larry Fishburne']

In [89]:
# OPTIONALLY: We can stitch all names together as one string

# This can be done using the join method
# To use join, pick a string to use as a separator (in our case a comma, followed with a space) and
# pass the list of strings you want to merge to the join method

cast = ", ".join(cast_names)
cast

'Heather Langenkamp, Patricia Arquette, Craig Wasson, Larry Fishburne'

In [90]:
# Now we need to do the above operations for every movie

# We can either use a for loop (clearer), or
# use a nested list compehension (more concise)

### Using a for loop

In [91]:
# Initialize the list of all cast memners
cast = []

# Just put all previous operations inside a for loop
for c in cast_info:
    cast_links = c.find_all('a')
    cast_names = [link.string for link in cast_links]
    
    cast.append(", ".join(cast_names)) # Joining is optional

cast

['Heather Langenkamp, Patricia Arquette, Craig Wasson, Larry Fishburne',
 'Jennifer Connelly, Donald Pleasence, Dalila Di Lazzaro, Fausta Avelli',
 'Gary Oldman, Winona Ryder, Anthony Hopkins, Keanu Reeves',
 'Andrew Robinson, Clare Higgins, Ashley Laurence, Sean Chapman',
 'John P. Ryan, Sharon Farrell, Andrew Duggan, Guy Stockwell',
 'Tim Robbins, Elizabeth PeÃ±a, Danny Aiello, Matt Craven',
 'Blanchard Ryan, Daniel Travis, Saul Stein, Estelle Lau',
 'Thomas Jane, Marcia Gay Harden, Laurie Holden, Andre Braugher',
 'Naomi Watts, Martin Henderson, David Dorfman, Brian Cox',
 'Michael Baldwin, Bill Thornbury, Reggie Bannister, Kathy Lester',
 "Bill Paxton, Matthew McConaughey, Powers Boothe, Matt O'Leary",
 'Sean Pertwee, Kevin McKidd, Emma Cleasby, Liam Cunningham',
 'Kevin Van Hentenryck, Terri Susan Smith, Beverly Bonner',
 'Finn Atkins, Michael Fassbender, James Gandhi, Thomas Gill',
 'Virginia Madsen, Tony Todd, Xander Berkeley, Kasi Lemmons',
 'Karen Gillan, Brenton Thwaites, Kat

### Nested list comprehension

In [92]:
# As you can see this can be done in just one line using nested list comprehension
# However, the code is harded to understand

cast = [", ".join([link.string for link in c.find_all("a")]) for c in cast_info]
cast

['Heather Langenkamp, Patricia Arquette, Craig Wasson, Larry Fishburne',
 'Jennifer Connelly, Donald Pleasence, Dalila Di Lazzaro, Fausta Avelli',
 'Gary Oldman, Winona Ryder, Anthony Hopkins, Keanu Reeves',
 'Andrew Robinson, Clare Higgins, Ashley Laurence, Sean Chapman',
 'John P. Ryan, Sharon Farrell, Andrew Duggan, Guy Stockwell',
 'Tim Robbins, Elizabeth PeÃ±a, Danny Aiello, Matt Craven',
 'Blanchard Ryan, Daniel Travis, Saul Stein, Estelle Lau',
 'Thomas Jane, Marcia Gay Harden, Laurie Holden, Andre Braugher',
 'Naomi Watts, Martin Henderson, David Dorfman, Brian Cox',
 'Michael Baldwin, Bill Thornbury, Reggie Bannister, Kathy Lester',
 "Bill Paxton, Matthew McConaughey, Powers Boothe, Matt O'Leary",
 'Sean Pertwee, Kevin McKidd, Emma Cleasby, Liam Cunningham',
 'Kevin Van Hentenryck, Terri Susan Smith, Beverly Bonner',
 'Finn Atkins, Michael Fassbender, James Gandhi, Thomas Gill',
 'Virginia Madsen, Tony Todd, Xander Berkeley, Kasi Lemmons',
 'Karen Gillan, Brenton Thwaites, Kat

## Adjusted score

In [93]:


# The adjusted scores can be found in a div with class 'info countdown-adjusted-score'
adj_scores = [div.find("div", {"class": "info countdown-adjusted-score"}) for div in divs]
adj_scores

[<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>75042% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>77316% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>79872% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjus

In [94]:
# Inspecting an element
adj_scores[0]

<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>75042% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>

In [95]:
# By inspection we see that the string we are looking for is the second child of the 'div' tag
adj_scores[0].contents[1]  # Note the extra whitespace at the end

'75042% '

In [96]:
# Extracting the string (without '%' sign and extra space)
adj_scores_clean = [score.contents[1].strip('% ') for score in adj_scores]
adj_scores_clean

['75042',
 '77316',
 '79872',
 '75800',
 '70976',
 '77734',
 '77198',
 '76987',
 '77196',
 '76726',
 '78812',
 '78823',
 '77609',
 '80339',
 '83111',
 '80295',
 '80540',
 '80673',
 '81113',
 '83487',
 '81468',
 '84879',
 '83536',
 '83477',
 '85809',
 '83911',
 '84875',
 '82602',
 '82767',
 '84134',
 '82341',
 '84252',
 '85017',
 '84580',
 '85768',
 '85971',
 '86391',
 '86348',
 '89197',
 '86984',
 '91369',
 '86108',
 '88299',
 '88884',
 '88436',
 '89412',
 '83037',
 '88769',
 '89287',
 '86647',
 '89821',
 '91372',
 '94721',
 '90563',
 '90645',
 '90708',
 '89159',
 '91820',
 '91671',
 '90009',
 '90695',
 '93912',
 '92645',
 '91681',
 '92442',
 '89623',
 '92073',
 '93032',
 '91624',
 '97445',
 '92224',
 '93026',
 '91750',
 '95297',
 '91193',
 '94147',
 '94743',
 '92406',
 '90850',
 '92362',
 '92670',
 '88944',
 '92525',
 '102965',
 '94511',
 '93058',
 '93352',
 '94059',
 '89896',
 '92423',
 '95027',
 '96121',
 '94205',
 '95737',
 '94787',
 '94424',
 '94750',
 '93220',
 '95361',
 '95593',

In [97]:
# Converting the strings to numbers
final_adj = [float(score) for score in adj_scores_clean] # Note that this time the scores are float, not int!
final_adj

[75042.0,
 77316.0,
 79872.0,
 75800.0,
 70976.0,
 77734.0,
 77198.0,
 76987.0,
 77196.0,
 76726.0,
 78812.0,
 78823.0,
 77609.0,
 80339.0,
 83111.0,
 80295.0,
 80540.0,
 80673.0,
 81113.0,
 83487.0,
 81468.0,
 84879.0,
 83536.0,
 83477.0,
 85809.0,
 83911.0,
 84875.0,
 82602.0,
 82767.0,
 84134.0,
 82341.0,
 84252.0,
 85017.0,
 84580.0,
 85768.0,
 85971.0,
 86391.0,
 86348.0,
 89197.0,
 86984.0,
 91369.0,
 86108.0,
 88299.0,
 88884.0,
 88436.0,
 89412.0,
 83037.0,
 88769.0,
 89287.0,
 86647.0,
 89821.0,
 91372.0,
 94721.0,
 90563.0,
 90645.0,
 90708.0,
 89159.0,
 91820.0,
 91671.0,
 90009.0,
 90695.0,
 93912.0,
 92645.0,
 91681.0,
 92442.0,
 89623.0,
 92073.0,
 93032.0,
 91624.0,
 97445.0,
 92224.0,
 93026.0,
 91750.0,
 95297.0,
 91193.0,
 94147.0,
 94743.0,
 92406.0,
 90850.0,
 92362.0,
 92670.0,
 88944.0,
 92525.0,
 102965.0,
 94511.0,
 93058.0,
 93352.0,
 94059.0,
 89896.0,
 92423.0,
 95027.0,
 96121.0,
 94205.0,
 95737.0,
 94787.0,
 94424.0,
 94750.0,
 93220.0,
 95361.0,
 95593.0,

## Synopsis

In [98]:


# The synopsis is located inside a 'div' tag with the class 'info synopsis'
synopsis = [div.find('div', class_='synopsis') for div in divs]
synopsis

[<div class="info synopsis"><span class="descriptor">Synopsis:</span> During a hallucinatory incident, young Kristen Parker (Patricia Arquette) has her wrists slashed by dream-stalking monster Freddy Krueger (Robert Englund)....<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> An American (Jennifer Connelly) at a Swiss finishing school calls on insects to help a paralyzed scientist (Donald Pleasence) fight...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/phenomena" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> Adaptation of Bram Stoker's classic vampire novel. Gary Oldman plays Dracula whose lonely soul is determined to reunite with his...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/bram_stokers_dracula" target="_top">

In [99]:
# Inspecting the element
synopsis[0]

<div class="info synopsis"><span class="descriptor">Synopsis:</span> During a hallucinatory incident, young Kristen Parker (Patricia Arquette) has her wrists slashed by dream-stalking monster Freddy Krueger (Robert Englund)....<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/a_nightmare_on_elm_street_3_dream_warriors" target="_top"> [More]</a></div>

In [100]:
# The text is the second child
synopsis[0].contents[1]

' During a hallucinatory incident, young Kristen Parker (Patricia Arquette) has her wrists slashed by dream-stalking monster Freddy Krueger (Robert Englund)....'

In [101]:
# Extracting the text
synopsis_text = [syn.contents[1] for syn in synopsis]
synopsis_text

[' During a hallucinatory incident, young Kristen Parker (Patricia Arquette) has her wrists slashed by dream-stalking monster Freddy Krueger (Robert Englund)....',
 ' An American (Jennifer Connelly) at a Swiss finishing school calls on insects to help a paralyzed scientist (Donald Pleasence) fight...',
 " Adaptation of Bram Stoker's classic vampire novel. Gary Oldman plays Dracula whose lonely soul is determined to reunite with his...",
 ' Sexual deviant Frank (Sean Chapman) inadvertently opens a portal to hell when he tinkers with a box he bought while...',
 ' Leaving their son, Chris (Daniel Holzman), with a family friend (William Wellman Jr.), Frank (John P. Ryan) and Lenore Davis...',
 ' After returning home from the Vietnam War, veteran Jacob Singer (Tim Robbins) struggles to maintain his sanity. Plagued by hallucinations...',
 ' Daniel (Daniel Travis) and Susan (Blanchard Ryan) embark on a tropical vacation with their scuba-diving certifications in tow. During a...',
 ' After a p

# Representing the data in structured form

In [102]:

import pandas as pd

## Creating a Data Frame

In [103]:
# A dataframe is a tabular data type, frequently used in data science

movies_info = pd.DataFrame()
movies_info  # The dataframe is still empty, we need to fill it with the info we gathered

## Populating the dataframe

In [104]:
# Populating the dataframe

movies_info["Movie Title"] = movie_names
movies_info["Year"] = years
movies_info["Score"] = scores
movies_info["Adjusted Score"] = final_adj  # Homework
movies_info["Director"] = final_directors
movies_info["Synopsis"] = synopsis_text    # Homework
movies_info["Cast"] = cast
movies_info["Consensus"] = consensus_text

# Let's see how it looks
movies_info

Unnamed: 0,Movie Title,Year,Score,Adjusted Score,Director,Synopsis,Cast,Consensus
0,A Nightmare on Elm Street 3: Dream Warriors,1987,72,75042.0,Chuck Russell,"During a hallucinatory incident, young Kriste...","Heather Langenkamp, Patricia Arquette, Craig W...",
1,Creepers,1985,76,77316.0,Dario Argento,An American (Jennifer Connelly) at a Swiss fi...,"Jennifer Connelly, Donald Pleasence, Dalila Di...",No consensus yet.
2,Bram Stoker's Dracula,1992,75,79872.0,Francis Ford Coppola,Adaptation of Bram Stoker's classic vampire n...,"Gary Oldman, Winona Ryder, Anthony Hopkins, Ke...",No consensus yet.
3,Hellraiser,1987,72,75800.0,Clive Barker,Sexual deviant Frank (Sean Chapman) inadverte...,"Andrew Robinson, Clare Higgins, Ashley Laurenc...",No consensus yet.
4,It's Alive!,1974,70,70976.0,Larry Cohen,"Leaving their son, Chris (Daniel Holzman), wi...","John P. Ryan, Sharon Farrell, Andrew Duggan, G...","Tough and unpleasant,"
...,...,...,...,...,...,...,...,...
195,Alien,1979,98,109040.0,Ridley Scott,"In deep space, the crew of the commercial sta...","Tom Skerritt, Sigourney Weaver, John Hurt, Ver...","A modern classic,"
196,Us,2019,93,126862.0,Jordan Peele,"Accompanied by her husband, son and daughter,...","Lupita Nyong'o, Winston Duke, Elisabeth Moss, ...",No consensus yet.
197,The Cabinet of Dr. Caligari,1919,99,114088.0,Robert Wiene,"At a carnival in Germany, Francis (Friedrich ...","Werner Krauss, Conrad Veidt, Lil Dagover, Frie...",No consensus yet.
198,Get Out,2017,98,127744.0,Jordan Peele,"Now that Chris and his girlfriend, Rose, have...","Daniel Kaluuya, Allison Williams, Catherine Ke...",No consensus yet.


In [105]:
# By default pandas abbreviates any text beyond a certain length (as seen in the Cast and Consensus columns)

# We can change that by setting the maximum column width to -1,
# which means the column would be as wide as to display the whole text
pd.set_option('display.max_colwidth', -1)
movies_info

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Movie Title,Year,Score,Adjusted Score,Director,Synopsis,Cast,Consensus
0,A Nightmare on Elm Street 3: Dream Warriors,1987,72,75042.0,Chuck Russell,"During a hallucinatory incident, young Kristen Parker (Patricia Arquette) has her wrists slashed by dream-stalking monster Freddy Krueger (Robert Englund)....","Heather Langenkamp, Patricia Arquette, Craig Wasson, Larry Fishburne",
1,Creepers,1985,76,77316.0,Dario Argento,An American (Jennifer Connelly) at a Swiss finishing school calls on insects to help a paralyzed scientist (Donald Pleasence) fight...,"Jennifer Connelly, Donald Pleasence, Dalila Di Lazzaro, Fausta Avelli",No consensus yet.
2,Bram Stoker's Dracula,1992,75,79872.0,Francis Ford Coppola,Adaptation of Bram Stoker's classic vampire novel. Gary Oldman plays Dracula whose lonely soul is determined to reunite with his...,"Gary Oldman, Winona Ryder, Anthony Hopkins, Keanu Reeves",No consensus yet.
3,Hellraiser,1987,72,75800.0,Clive Barker,Sexual deviant Frank (Sean Chapman) inadvertently opens a portal to hell when he tinkers with a box he bought while...,"Andrew Robinson, Clare Higgins, Ashley Laurence, Sean Chapman",No consensus yet.
4,It's Alive!,1974,70,70976.0,Larry Cohen,"Leaving their son, Chris (Daniel Holzman), with a family friend (William Wellman Jr.), Frank (John P. Ryan) and Lenore Davis...","John P. Ryan, Sharon Farrell, Andrew Duggan, Guy Stockwell","Tough and unpleasant,"
...,...,...,...,...,...,...,...,...
195,Alien,1979,98,109040.0,Ridley Scott,"In deep space, the crew of the commercial starship Nostromo is awakened from their cryo-sleep capsules halfway through their journey...","Tom Skerritt, Sigourney Weaver, John Hurt, Veronica Cartwright","A modern classic,"
196,Us,2019,93,126862.0,Jordan Peele,"Accompanied by her husband, son and daughter, Adelaide Wilson returns to the beachfront home where she grew up as a...","Lupita Nyong'o, Winston Duke, Elisabeth Moss, Tim Heidecker",No consensus yet.
197,The Cabinet of Dr. Caligari,1919,99,114088.0,Robert Wiene,"At a carnival in Germany, Francis (Friedrich Feher) and his friend Alan (Rudolf Lettinger) encounter the crazed Dr. Caligari (Werner...","Werner Krauss, Conrad Veidt, Lil Dagover, Friedrich Feher",No consensus yet.
198,Get Out,2017,98,127744.0,Jordan Peele,"Now that Chris and his girlfriend, Rose, have reached the meet-the-parents milestone of dating, she invites him for a weekend...","Daniel Kaluuya, Allison Williams, Catherine Keener, Bradley Whitford",No consensus yet.


## Exporting the data to CSV (comma-separated values) and excel files

In [106]:
# Write data to excel file
movies_info.to_excel("movies_info.xlsx", index = False, header = True)

In [107]:
# or write data to CSV file
movies_info.to_csv("movies_info.csv", index = False, header = True)

In [108]:
# Index is set to False so that the index (0,1,2...) of each movie is not saved to the file (the index is purely internal)
# The header is set to True, so that the names of the columns are saved