#### Set up

In [1]:
pip install requests_html

Note: you may need to restart the kernel to use updated packages.


In [2]:
# load packages
#import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup

In [3]:
# Define the URL of the site
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/2/"

In [4]:
# sending a request to the webpage
#response = requests.get(base_site)
session= HTMLSession()
response = session.get(base_site)
response.status_code

200

In [5]:
html= response.content

#### Choosing Parser

In [6]:
# convert the HTML to a Beautiful Soup object
soup= BeautifulSoup(html, 'html.parser')

In [7]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_HTML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [8]:
#Sometimes html parser doesnot parse properly then we use lxml parser

In [9]:
# convert the HTML to a Beautiful Soup object
soup= BeautifulSoup(html, 'lxml')

In [10]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

##### A word of caution

In [11]:
# Beautiful Soup ranks the lxml parser as the best one.

# If a parser is not explicitly stated in the Beautiful Soup constructor,
# the best one available on the current machine is chosen.

# This means that the same piece of code can give different results on different computers.

#### Finding an element containing all the data

In [12]:
divs = soup.find_all("div", {"class": "col col-left-center col-full-xs article_body 140-essential-action-movies-to-watch-now"})
divs

[<div class="col col-left-center col-full-xs article_body 140-essential-action-movies-to-watch-now" id="article_main_body">
 <div class="panel-rt panel-box article_body">
 <div id="social-tools-widget">
 <a href="#"><img class="social-tools-facebook-like fb_like" src="https://static.rottentomatoes.com/static/images/social/social_fb_like.png"/></a>
 <a href="#"><img class="social-tools-facebook-share" src="https://static.rottentomatoes.com/static/images/social/social_fb_share.png"/></a>
 <a href="#"><img class="social-tools-twitter" src="https://static.rottentomatoes.com/static/images/social/social_twitter.png"/></a>
 <a href="#"><img class="social-tools-googleplus" src="https://static.rottentomatoes.com/static/images/social/social_google.png"/></a>
 </div>
 <div class="articleContentBody">
 <div class="content-image"><img class="aligncenter wp-image-114643 size-full" src="https://s3-us-west-2.amazonaws.com/flx-editorial-wordpress/wp-content/uploads/2019/06/06180032/RT_140_ESSENTIAL_ACT

In [13]:
divs[0].find('h2')

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>

In [14]:
divs[0].find('h2').text

'Running Scared (1986)  60%'

In [15]:
# Extracting all 'h2' tags
headings = divs[0].find_all('h2') 
headings

[<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/equilibrium/">Equilibrium</a> <span class="subtle start-year">(2002)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">40%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/hero/">Hero</a> <span class="subtle start-year">(2004)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/1017666-road_house/">Road House</a> <span class="subtle start-year">(1989)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">39%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/unstoppable-2010/">Unstoppable</a> <span class="subtle st

In [16]:
# Inspecting the text inside the headings
[heading.text for heading in headings]

['Running Scared (1986)  60%',
 'Equilibrium (2002)  40%',
 'Hero (2004)  95%',
 'Road House (1989)  39%',
 'Unstoppable (2010)  86%',
 'Shaft (1971)  88%',
 'The Villainess (Ak-Nyeo) (2017)  84%',
 'Highlander (1986)  69%',
 'Die Hard 2 (1990)  68%',
 'National Treasure (2004)  46%',
 'The Protector (Tom yum goong) (Warrior King) (2005)  53%',
 'Revenge (2018)  92%',
 'El Mariachi (1993)  93%',
 'A Touch of Zen (1969)  96%',
 'Top Gun (1986)  54%',
 'Con Air (1997)  55%',
 'The Expendables 2 (2012)  68%',
 'The Mummy (1999)  60%',
 'Mr. & Mrs. Smith (2005)  60%',
 'Rush Hour (1998)  60%',
 'The Equalizer (2014)  59%',
 'Captain America: Civil War (2016)  91%',
 'Air Force One (1997)  76%',
 'Bloodsport (1988)  40%',
 'Blade (1998)  55%',
 'Bad Boys (1995)  43%',
 'Die Hard: With a Vengeance (1995)  52%',
 'The Running Man (1987)  63%',
 'Code of Silence (1985)  63%',
 "Shoot 'Em Up (2007)  67%",
 'Crank (2006)  61%',
 'Machete (2010)  72%',
 'Drive (2011)  92%',
 'Batman (1989)  72%',

#### Title

In [17]:
title=headings[0].find('a')
title

<a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a>

In [18]:
title= [i.find('a').text for i in headings]
title

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess (Ak-Nyeo)',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector (Tom yum goong) (Warrior King)',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard: With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's The Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad: The Enemy Within',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story (Ging chaat goo si) (Police Force)',
 '

In [19]:
title2= [i.find('a').string for i in headings]
title2

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess (Ak-Nyeo)',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector (Tom yum goong) (Warrior King)',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard: With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's The Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad: The Enemy Within',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story (Ging chaat goo si) (Police Force)',
 '

In [20]:
title= title[:-1]
title

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess (Ak-Nyeo)',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector (Tom yum goong) (Warrior King)',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard: With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's The Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad: The Enemy Within',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story (Ging chaat goo si) (Police Force)',
 '

In [21]:
len(title)

140

#### Year

In [22]:
y= headings[0].find('span', class_ = 'subtle start-year').text
y

'(1986)'

In [23]:
year= [heading.find('span', {'class' : 'start-year'}) for heading in headings]
year= year[:-1]
year

[<span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(2004)</span>,
 <span class="subtle start-year">(1989)</span>,
 <span class="subtle start-year">(2010)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1990)</span>,
 <span class="subtle start-year">(2004)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(2018)</span>,
 <span class="subtle start-year">(1993)</span>,
 <span class="subtle start-year">(1969)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1997)</span>,
 <span class="subtle start-year">(2012)</span>,
 <span class="subtle start-year">(1999)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(1998)</span>,
 <span class="subtle start-year">(2014)<

In [24]:
years= [i.string for i in year]
years

['(1986)',
 '(2002)',
 '(2004)',
 '(1989)',
 '(2010)',
 '(1971)',
 '(2017)',
 '(1986)',
 '(1990)',
 '(2004)',
 '(2005)',
 '(2018)',
 '(1993)',
 '(1969)',
 '(1986)',
 '(1997)',
 '(2012)',
 '(1999)',
 '(2005)',
 '(1998)',
 '(2014)',
 '(2016)',
 '(1997)',
 '(1988)',
 '(1998)',
 '(1995)',
 '(1995)',
 '(1987)',
 '(1985)',
 '(2007)',
 '(2006)',
 '(2010)',
 '(2011)',
 '(1989)',
 '(1992)',
 '(1996)',
 '(1968)',
 '(2008)',
 '(1978)',
 '(1998)',
 '(1988)',
 '(1993)',
 '(2012)',
 '(2007)',
 '(1979)',
 '(1997)',
 '(2011)',
 '(1991)',
 '(1996)',
 '(2014)',
 '(2009)',
 '(2007)',
 '(1994)',
 '(1993)',
 '(2016)',
 '(1985)',
 '(2001)',
 '(2015)',
 '(1997)',
 '(1986)',
 '(2017)',
 '(1995)',
 '(2006)',
 '(1984)',
 '(2005)',
 '(2004)',
 '(2001)',
 '(1981)',
 '(2000)',
 '(2004)',
 '(2011)',
 '(1992)',
 '(1989)',
 '(2005)',
 '(2010)',
 '(2008)',
 '(2018)',
 '(2017)',
 '(1964)',
 '(1976)',
 '(2017)',
 '(1972)',
 '(2014)',
 '(2005)',
 '(1971)',
 '(2015)',
 '(1990)',
 '(1996)',
 '(1971)',
 '(2014)',
 '(2003)',

In [25]:
#This is because the section is rendered by the browser via JavaScript. 
#So when you use requests you only get the HTML content of the page which doesn't have what you need. 
#You should use for example selenium (or requests-html) if you want to parse page with elements rendered by web browser.

In [26]:
years[0].strip('()')

'1986'

In [27]:
years= [i.strip('()') for i in years]
years

['1986',
 '2002',
 '2004',
 '1989',
 '2010',
 '1971',
 '2017',
 '1986',
 '1990',
 '2004',
 '2005',
 '2018',
 '1993',
 '1969',
 '1986',
 '1997',
 '2012',
 '1999',
 '2005',
 '1998',
 '2014',
 '2016',
 '1997',
 '1988',
 '1998',
 '1995',
 '1995',
 '1987',
 '1985',
 '2007',
 '2006',
 '2010',
 '2011',
 '1989',
 '1992',
 '1996',
 '1968',
 '2008',
 '1978',
 '1998',
 '1988',
 '1993',
 '2012',
 '2007',
 '1979',
 '1997',
 '2011',
 '1991',
 '1996',
 '2014',
 '2009',
 '2007',
 '1994',
 '1993',
 '2016',
 '1985',
 '2001',
 '2015',
 '1997',
 '1986',
 '2017',
 '1995',
 '2006',
 '1984',
 '2005',
 '2004',
 '2001',
 '1981',
 '2000',
 '2004',
 '2011',
 '1992',
 '1989',
 '2005',
 '2010',
 '2008',
 '2018',
 '2017',
 '1964',
 '1976',
 '2017',
 '1972',
 '2014',
 '2005',
 '1971',
 '2015',
 '1990',
 '1996',
 '1971',
 '2014',
 '2003',
 '1993',
 '2018',
 '2010',
 '1995',
 '2002',
 '2019',
 '2012',
 '2002',
 '2010',
 '1997',
 '1985',
 '2008',
 '2011',
 '2011',
 '1987',
 '1996',
 '1987',
 '2017',
 '2006',
 '2017',
 

In [28]:
years= [int(i) for i in years]
years

[1986,
 2002,
 2004,
 1989,
 2010,
 1971,
 2017,
 1986,
 1990,
 2004,
 2005,
 2018,
 1993,
 1969,
 1986,
 1997,
 2012,
 1999,
 2005,
 1998,
 2014,
 2016,
 1997,
 1988,
 1998,
 1995,
 1995,
 1987,
 1985,
 2007,
 2006,
 2010,
 2011,
 1989,
 1992,
 1996,
 1968,
 2008,
 1978,
 1998,
 1988,
 1993,
 2012,
 2007,
 1979,
 1997,
 2011,
 1991,
 1996,
 2014,
 2009,
 2007,
 1994,
 1993,
 2016,
 1985,
 2001,
 2015,
 1997,
 1986,
 2017,
 1995,
 2006,
 1984,
 2005,
 2004,
 2001,
 1981,
 2000,
 2004,
 2011,
 1992,
 1989,
 2005,
 2010,
 2008,
 2018,
 2017,
 1964,
 1976,
 2017,
 1972,
 2014,
 2005,
 1971,
 2015,
 1990,
 1996,
 1971,
 2014,
 2003,
 1993,
 2018,
 2010,
 1995,
 2002,
 2019,
 2012,
 2002,
 2010,
 1997,
 1985,
 2008,
 2011,
 2011,
 1987,
 1996,
 1987,
 2017,
 2006,
 2017,
 1994,
 1989,
 2014,
 1973,
 1985,
 1982,
 2015,
 1984,
 2000,
 2003,
 1994,
 1994,
 1994,
 2014,
 2001,
 1987,
 2007,
 1990,
 1982,
 1995,
 2012,
 2018,
 1981,
 1986,
 1992,
 1999,
 1991,
 1988,
 2015]

In [29]:
len(years)

140

#### Rating

In [30]:
rate= [i.find('span', class_ = 'tMeterScore') for i in headings]
rate= rate[:-1]

In [79]:
rate= [i.string for i in rate]
rate

['60%',
 '40%',
 '95%',
 '39%',
 '86%',
 '88%',
 '84%',
 '69%',
 '68%',
 '46%',
 '53%',
 '92%',
 '93%',
 '96%',
 '54%',
 '55%',
 '68%',
 '60%',
 '60%',
 '60%',
 '59%',
 '91%',
 '76%',
 '40%',
 '55%',
 '43%',
 '52%',
 '63%',
 '63%',
 '67%',
 '61%',
 '72%',
 '92%',
 '72%',
 '79%',
 '65%',
 '97%',
 '71%',
 '94%',
 '68%',
 '86%',
 '68%',
 '92%',
 '91%',
 '89%',
 '63%',
 '93%',
 '69%',
 '69%',
 '91%',
 '58%',
 '60%',
 '70%',
 '62%',
 '51%',
 '93%',
 '73%',
 '74%',
 '71%',
 '77%',
 '78%',
 '80%',
 '80%',
 '82%',
 '85%',
 '86%',
 '91%',
 '86%',
 '87%',
 '93%',
 '95%',
 '88%',
 '88%',
 '90%',
 '93%',
 '94%',
 '90%',
 '93%',
 '98%',
 '98%',
 '93%',
 '92%',
 '90%',
 '82%',
 '98%',
 '81%',
 '88%',
 '96%',
 '89%',
 '90%',
 '85%',
 '96%',
 '97%',
 '87%',
 '77%',
 '90%',
 '94%',
 '79%',
 '83%',
 '85%',
 '92%',
 '91%',
 '94%',
 '93%',
 '77%',
 '82%',
 '66%',
 '89%',
 '89%',
 '95%',
 '93%',
 '100%',
 '98%',
 '80%',
 '94%',
 '71%',
 '87%',
 '93%',
 '100%',
 '76%',
 '85%',
 '73%',
 '94%',
 '83%',
 '86%'

In [80]:
r= headings[0].find('span', class_ = 'tMeterScore').text
r

'60%'

In [81]:
rate=[ int(i.strip('%')) for i in rate]
rate

[60,
 40,
 95,
 39,
 86,
 88,
 84,
 69,
 68,
 46,
 53,
 92,
 93,
 96,
 54,
 55,
 68,
 60,
 60,
 60,
 59,
 91,
 76,
 40,
 55,
 43,
 52,
 63,
 63,
 67,
 61,
 72,
 92,
 72,
 79,
 65,
 97,
 71,
 94,
 68,
 86,
 68,
 92,
 91,
 89,
 63,
 93,
 69,
 69,
 91,
 58,
 60,
 70,
 62,
 51,
 93,
 73,
 74,
 71,
 77,
 78,
 80,
 80,
 82,
 85,
 86,
 91,
 86,
 87,
 93,
 95,
 88,
 88,
 90,
 93,
 94,
 90,
 93,
 98,
 98,
 93,
 92,
 90,
 82,
 98,
 81,
 88,
 96,
 89,
 90,
 85,
 96,
 97,
 87,
 77,
 90,
 94,
 79,
 83,
 85,
 92,
 91,
 94,
 93,
 77,
 82,
 66,
 89,
 89,
 95,
 93,
 100,
 98,
 80,
 94,
 71,
 87,
 93,
 100,
 76,
 85,
 73,
 94,
 83,
 86,
 97,
 81,
 92,
 82,
 95,
 86,
 86,
 97,
 95,
 97,
 94,
 87,
 93,
 93,
 97]

In [34]:
rate.append(0)

In [35]:
#rate=[]
#for i in headings:
 #   rate.append(i.find('span', class_ = 'tMeterScore'))

In [73]:
len(rate)

140

In [72]:
rate= rate[:-1]

#### Other data

##### Critics Consensus

In [36]:
con= divs[0].find_all('div', class_= 'col-sm-24')
con

[<div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.188% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.</div>
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/1018009-running_scared/" target="_top"> [

In [37]:
critic= con[0].find('div',{'class':'info critics-consensus'}) 
critic.text

'Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [38]:
critics=[ i.find('div',{'class':'info critics-consensus'}).text for i in con]
critics

['Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Critics Consensus: Equilibrium is a reheated mishmash of other sci-fi movies.',
 'Critics Consensus: With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Critics Consensus: Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "Critics Consensus: As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'Critics Consensus: This is the man that would risk his neck for his brother, man. Can you dig it?',
 'Critics Consensus: The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bl

In [39]:
c= con[0].find('div',{'class':'info critics-consensus'}) 
c.contents

[<span class="descriptor">Critics Consensus:</span>,
 ' Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.']

In [40]:
c=c.contents[1].strip()

In [41]:
c

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [42]:
common_phrase = 'Critics Consensus: '

In [43]:
common_length=len(common_phrase)

In [44]:
critic.text[19:]

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [45]:
critics=[ i.find('div',{'class':'info critics-consensus'}).text[common_length:] for i in con]
critics

['Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Equilibrium is a reheated mishmash of other sci-fi movies.',
 'With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'This is the man that would risk his neck for his brother, man. Can you dig it?',
 'The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bloody niche for itself in modern Korean action cinema.',
 "People hate Highlander because it's cheesy, bombastic, and absurd. And peop

##### Director

In [46]:
director= con[0].find('div',class_='info director')
director

<div class="info director">
<span class="descriptor">Directed By:</span> <a class="" href="/celebrity/peter_hyams/">Peter Hyams</a></div>

In [47]:
director.text

'\nDirected By: Peter Hyams'

In [48]:
directors= [i.find('div',class_='info director').text for i in con]
directors

['\nDirected By: Peter Hyams',
 '\nDirected By: Kurt Wimmer',
 '\nDirected By: Zhang Yimou',
 '\nDirected By: Rowdy Herrington',
 '\nDirected By: Tony Scott',
 '\nDirected By: Gordon Parks',
 '\nDirected By: Jung Byung-gil',
 '\nDirected By: Russell Mulcahy',
 '\nDirected By: Renny Harlin',
 '\nDirected By: Jon Turteltaub',
 '\nDirected By: Prachya Pinkaew',
 '\nDirected By: Coralie Fargeat',
 '\nDirected By: Robert Rodriguez',
 '\nDirected By: King Hu',
 '\nDirected By: Tony Scott',
 '\nDirected By: Simon West',
 '\nDirected By: Simon West',
 '\nDirected By: Stephen Sommers',
 '\nDirected By: Doug Liman',
 '\nDirected By: Brett Ratner',
 '\nDirected By: Antoine Fuqua',
 '\nDirected By: Anthony Russo, Joe Russo',
 '\nDirected By: Wolfgang Petersen',
 '\nDirected By: Newt Arnold, Newton Arnold',
 '\nDirected By: Stephen Norrington',
 '\nDirected By: Michael Bay',
 '\nDirected By: John McTiernan',
 '\nDirected By: Paul Michael Glaser',
 '\nDirected By: Andrew Davis',
 '\nDirected By: Mic

In [49]:
cp='\nDirected By: '

In [50]:
cl=len(cp)

In [51]:
directors= [i.find('div',class_='info director').text[cl:] for i in con]
directors

['Peter Hyams',
 'Kurt Wimmer',
 'Zhang Yimou',
 'Rowdy Herrington',
 'Tony Scott',
 'Gordon Parks',
 'Jung Byung-gil',
 'Russell Mulcahy',
 'Renny Harlin',
 'Jon Turteltaub',
 'Prachya Pinkaew',
 'Coralie Fargeat',
 'Robert Rodriguez',
 'King Hu',
 'Tony Scott',
 'Simon West',
 'Simon West',
 'Stephen Sommers',
 'Doug Liman',
 'Brett Ratner',
 'Antoine Fuqua',
 'Anthony Russo, Joe Russo',
 'Wolfgang Petersen',
 'Newt Arnold, Newton Arnold',
 'Stephen Norrington',
 'Michael Bay',
 'John McTiernan',
 'Paul Michael Glaser',
 'Andrew Davis',
 'Michael Davis',
 'Mark Neveldine, Brian Taylor',
 'Ethan Maniquis, Robert Rodriguez',
 'Nicolas Winding Refn',
 'Tim Burton',
 'Andrew Davis',
 'Roland Emmerich',
 'Peter Yates',
 'Timur Bekmambetov',
 'Richard Donner',
 'John Frankenheimer',
 'John Carpenter',
 '',
 '',
 'Edgar Wright',
 'Walter Hill',
 'Paul Verhoeven',
 'José Padilha',
 'Kathryn Bigelow',
 'Renny Harlin',
 'Adam Wingard',
 'Pierre Morel',
 'Zack Snyder',
 'James Cameron, Tony Sco

In [52]:
len(directors)

140

##### Cast Info

In [53]:
cast= con[0].find('div',class_='info cast')
cast

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="/celebrity/gregory_hines/">Gregory Hines</a>, <a class="" href="/celebrity/billy_crystal/">Billy Crystal</a>, <a class="" href="/celebrity/jimmy_smits/">Jimmy Smits</a>, <a class="" href="/celebrity/steven_bauer/">Steven Bauer</a></div>

In [54]:
cast.text

'\nStarring: Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer'

In [55]:
casts= [i.find('div',class_='info cast').text for i in con]
casts

['\nStarring: Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 '\nStarring: Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 '\nStarring: Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Daoming Chen',
 '\nStarring: Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 '\nStarring: Denzel Washington, Chris Pine, Rosario Dawson, Ethan Suplee',
 '\nStarring: Richard Roundtree, Moses Gunn, Gwen Mitchell, Christopher St. John',
 '\nStarring: Ok-bin Kim, Kim Seo-hyung, Shin Ha-kyun, Bang Sung-jun',
 '\nStarring: Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 '\nStarring: Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 '\nStarring: Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 '\nStarring: Tony Jaa, Petchtai Wongkamlao, Bongkoj Khongmalai, Bongkoo Kongmalai',
 '\nStarring: Matilda Anna Ingrid Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 '\nStarring: Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter M

In [56]:
cp2='\nStarring: '

In [57]:
cl2= len(cp2)

In [58]:
casts= [i.find('div',class_='info cast').text[cl2:] for i in con]
casts

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Daoming Chen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Ethan Suplee',
 'Richard Roundtree, Moses Gunn, Gwen Mitchell, Christopher St. John',
 'Ok-bin Kim, Kim Seo-hyung, Shin Ha-kyun, Bang Sung-jun',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tony Jaa, Petchtai Wongkamlao, Bongkoj Khongmalai, Bongkoo Kongmalai',
 'Matilda Anna Ingrid Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Tien Peng',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkov

##### Adjusted Score

In [59]:
ads= con[0].find('div','info countdown-adjusted-score')
ads.text

'Adjusted Score: 61.188% '

In [60]:
adscore=[i.find('div','info countdown-adjusted-score').text for i in con]
adscore

['Adjusted Score: 61.188% ',
 'Adjusted Score: 41.993% ',
 'Adjusted Score: 100.762% ',
 'Adjusted Score: 41.995% ',
 'Adjusted Score: 91.477% ',
 'Adjusted Score: 92.029% ',
 'Adjusted Score: 86.911% ',
 'Adjusted Score: 71.938% ',
 'Adjusted Score: 72.296% ',
 'Adjusted Score: 50.848% ',
 'Adjusted Score: 55.434% ',
 'Adjusted Score: 97.073% ',
 'Adjusted Score: 94.588% ',
 'Adjusted Score: 98.516% ',
 'Adjusted Score: 59.126% ',
 'Adjusted Score: 58.685% ',
 'Adjusted Score: 72.3% ',
 'Adjusted Score: 63.981% ',
 'Adjusted Score: 67.127% ',
 'Adjusted Score: 63.723% ',
 'Adjusted Score: 66.568% ',
 'Adjusted Score: 107.341% ',
 'Adjusted Score: 78.965% ',
 'Adjusted Score: 41.235% ',
 'Adjusted Score: 60.599% ',
 'Adjusted Score: 46.861% ',
 'Adjusted Score: 55.784% ',
 'Adjusted Score: 65.931% ',
 'Adjusted Score: 63.696% ',
 'Adjusted Score: 72.115% ',
 'Adjusted Score: 63.637% ',
 'Adjusted Score: 77.443% ',
 'Adjusted Score: 100.122% ',
 'Adjusted Score: 77.719% ',
 'Adjusted Sc

In [61]:
cp3='Adjusted Score: '

In [62]:
cl3=len(cp3)

In [63]:
adscore=[i.find('div','info countdown-adjusted-score').text[cl3:] for i in con]
adscore

['61.188% ',
 '41.993% ',
 '100.762% ',
 '41.995% ',
 '91.477% ',
 '92.029% ',
 '86.911% ',
 '71.938% ',
 '72.296% ',
 '50.848% ',
 '55.434% ',
 '97.073% ',
 '94.588% ',
 '98.516% ',
 '59.126% ',
 '58.685% ',
 '72.3% ',
 '63.981% ',
 '67.127% ',
 '63.723% ',
 '66.568% ',
 '107.341% ',
 '78.965% ',
 '41.235% ',
 '60.599% ',
 '46.861% ',
 '55.784% ',
 '65.931% ',
 '63.696% ',
 '72.115% ',
 '63.637% ',
 '77.443% ',
 '100.122% ',
 '77.719% ',
 '80.433% ',
 '68.813% ',
 '100.515% ',
 '78.621% ',
 '101.296% ',
 '71.205% ',
 '90.631% ',
 '71.899% ',
 '105.431% ',
 '97.71% ',
 '91.957% ',
 '66.541% ',
 '93.573% ',
 '73.214% ',
 '71.661% ',
 '94.933% ',
 '63.263% ',
 '67.812% ',
 '72.843% ',
 '64.605% ',
 '56.151% ',
 '94.41% ',
 '76.65% ',
 '83.679% ',
 '74.685% ',
 '80.813% ',
 '92.966% ',
 '83.244% ',
 '83.449% ',
 '85.647% ',
 '88.121% ',
 '92.645% ',
 '93.651% ',
 '90.941% ',
 '87.976% ',
 '100.886% ',
 '98.509% ',
 '61.183% ',
 '93.265% ',
 '96.173% ',
 '44.149% ',
 '104.691% ',
 '90.412%

In [64]:
adscore=[float(i.strip('% ')) for i in adscore]
adscore

[61.188,
 41.993,
 100.762,
 41.995,
 91.477,
 92.029,
 86.911,
 71.938,
 72.296,
 50.848,
 55.434,
 97.073,
 94.588,
 98.516,
 59.126,
 58.685,
 72.3,
 63.981,
 67.127,
 63.723,
 66.568,
 107.341,
 78.965,
 41.235,
 60.599,
 46.861,
 55.784,
 65.931,
 63.696,
 72.115,
 63.637,
 77.443,
 100.122,
 77.719,
 80.433,
 68.813,
 100.515,
 78.621,
 101.296,
 71.205,
 90.631,
 71.899,
 105.431,
 97.71,
 91.957,
 66.541,
 93.573,
 73.214,
 71.661,
 94.933,
 63.263,
 67.812,
 72.843,
 64.605,
 56.151,
 94.41,
 76.65,
 83.679,
 74.685,
 80.813,
 92.966,
 83.244,
 83.449,
 85.647,
 88.121,
 92.645,
 93.651,
 90.941,
 87.976,
 100.886,
 98.509,
 61.183,
 93.265,
 96.173,
 44.149,
 104.691,
 90.412,
 110.634,
 104.218,
 102.296,
 112.475,
 92.809,
 101.765,
 86.608,
 104.292,
 91.225,
 90.531,
 98.254,
 93.658,
 102.588,
 92.174,
 101.933,
 119.301,
 97.936,
 81.942,
 97.346,
 117.365,
 84.563,
 88.285,
 77.821,
 97.199,
 92.946,
 107.214,
 100.563,
 82.955,
 86.273,
 69.339,
 94.327,
 100.331,
 10

In [65]:
len(adscore)

140

##### Synopsis

In [66]:
syn= con[0].find('div',class_='info synopsis')
syn=syn.contents[1]
syn

' Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...'

In [67]:
synopsis=[i.find('div',class_='info synopsis').contents[1] for i in con]
synopsis

[' Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...',
 ' In the nation of Libria, there is always peace among men. The rules of the Librian system are simple. If...',
 " Hero is two-time Academy Award nominee Zhang Yimou's directorial attempt at exploring the concept of a Chinese hero. During the...",
 ' Dalton (Swayze) is a true gentleman with a degree in philosophy from NYU. He also has a flip side -...',
 ' In this action thriller from director Tony Scott, rookie train operator Will (Chris Pine) and grizzled veteran engineer Frank (Denzel...',
 ' Shaft, a highly successful film, spawned an industry of sequels and imitations. The daughter (Sherri Brewer) of Bumpy Jones (Moses...',
 ' Since she was a little girl, Sook-hee was raised to be a deadly assassin. She gladly accepts the chance to...',
 ' Among humans for centuries, an immortal specie existed. Connor MacLeod is a member of this specie. Unaware 

#### Representing the data in structured form

In [68]:
import pandas as pd

In [82]:
movie_info= pd.DataFrame()
movie_info

In [83]:
movie_info['Movie_Title']= title
movie_info['Year']= years
movie_info['Score']=rate
movie_info['Adjusted_Score']= adscore
movie_info['Director']= directors
movie_info['Cast']=casts
movie_info['Synopsis']= synopsis
movie_info['Cnsensus']=critics

In [84]:
len(movie_info)

140

In [85]:
movie_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 8 columns):
Movie_Title       140 non-null object
Year              140 non-null int64
Score             140 non-null int64
Adjusted_Score    140 non-null float64
Director          140 non-null object
Cast              140 non-null object
Synopsis          140 non-null object
Cnsensus          140 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 8.8+ KB


In [86]:
movie_info.head()

Unnamed: 0,Movie_Title,Year,Score,Adjusted_Score,Director,Cast,Synopsis,Cnsensus
0,Running Scared,1986,60,61.188,Peter Hyams,"Gregory Hines, Billy Crystal, Jimmy Smits, Ste...","Distinguished by a sharp, witty dialogue betw...",Running Scared struggles to strike a consisten...
1,Equilibrium,2002,40,41.993,Kurt Wimmer,"Christian Bale, Emily Watson, Taye Diggs, Angu...","In the nation of Libria, there is always peac...",Equilibrium is a reheated mishmash of other sc...
2,Hero,2004,95,100.762,Zhang Yimou,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Da...",Hero is two-time Academy Award nominee Zhang ...,With death-defying action sequences and epic h...
3,Road House,1989,39,41.995,Rowdy Herrington,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ...",Dalton (Swayze) is a true gentleman with a de...,Whether Road House is simply bad or so bad it'...
4,Unstoppable,2010,86,91.477,Tony Scott,"Denzel Washington, Chris Pine, Rosario Dawson,...",In this action thriller from director Tony Sc...,"As fast, loud, and relentless as the train at ..."


#### Exporting the data to CSV (comma-separated values) and excel files

In [88]:
# Write data to excel file
movie_info.to_excel("movies_info.xlsx", index = False, header = True)

In [89]:
# or write data to CSV file
movie_info.to_csv("movies_info.csv", index = False, header = True)

In [None]:
# Index is set to False so that the index (0,1,2...) of each movie is not saved to the file (the index is purely internal)
# The header is set to True, so that the names of the columns are saved