<a href="https://colab.research.google.com/github/elvinaqa/Scraper-Text-Analyzer-/blob/master/Scrape_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import lxml
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from requests import get

In [2]:
url1 = "https://www.imdb.com/search/title?count=100&title_type=feature,tv_series&ref_=nv_wl_img_2"


In [3]:
class IMDB(object):
	"""docstring for IMDB"""
	def __init__(self, url):
		super(IMDB, self).__init__()
		page = get(url)

		self.soup = BeautifulSoup(page.content, 'lxml')

	def articleTitle(self):
		return self.soup.find("h1", class_="header").text.replace("\n","")

	def bodyContent(self):
		content = self.soup.find(id="main")
		return content.find_all("div", class_="lister-item mode-advanced")

	def movieData(self):
		movieFrame = self.bodyContent()
		movieTitle = []
		movieDate = []
		movieRunTime = []
		movieGenre = []
		movieRating = []
		movieScore = []
		movieDescription = []
		movieDirector = []
		movieStars = []
		movieVotes = []
		movieGross = []
		for movie in movieFrame:
			movieFirstLine = movie.find("h3", class_="lister-item-header")
			movieTitle.append(movieFirstLine.find("a").text)
			movieDate.append(re.sub(r"[()]","", movieFirstLine.find_all("span")[-1].text))
			try:
				movieRunTime.append(movie.find("span", class_="runtime").text[:-4])
			except:
				movieRunTime.append(np.nan)
			movieGenre.append(movie.find("span", class_="genre").text.rstrip().replace("\n","").split(","))
			try:
				movieRating.append(movie.find("strong").text)
			except:
				movieRating.append(np.nan)
			try:
				movieScore.append(movie.find("span", class_="metascore unfavorable").text.rstrip())
			except:
				movieScore.append(np.nan)
			movieDescription.append(movie.find_all("p", class_="text-muted")[-1].text.lstrip())
			movieCast = movie.find("p", class_="")

			try:
				casts = movieCast.text.replace("\n","").split('|')
				casts = [x.strip() for x in casts]
				casts = [casts[i].replace(j, "") for i,j in enumerate(["Director:", "Stars:"])]
				movieDirector.append(casts[0])
				movieStars.append([x.strip() for x in casts[1].split(",")])
			except:
				casts = movieCast.text.replace("\n","").strip()
				movieDirector.append(np.nan)
				movieStars.append([x.strip() for x in casts.split(",")])

			movieNumbers = movie.find_all("span", attrs={"name": "nv"})

			if len(movieNumbers) == 2:
				movieVotes.append(movieNumbers[0].text)
				movieGross.append(movieNumbers[1].text)
			elif len(movieNumbers) == 1:
				movieVotes.append(movieNumbers[0].text)
				movieGross.append(np.nan)
			else:
				movieVotes.append(np.nan)
				movieGross.append(np.nan)

		movieData = [movieTitle, movieDate, movieRunTime, movieGenre, movieRating, movieScore, movieDescription,
							movieDirector, movieStars, movieVotes, movieGross]
		return movieData

In [4]:
if __name__ == '__main__':
	site1 = IMDB(url1)
	print("Subject: ", site1.articleTitle())
	data = site1.movieData()
	for i in range(len(data)):
		print(data[i][:]) #Print the data

Subject:  Feature Film/TV Series(Sorted by Popularity Ascending) 
['TENET天能', '眼鏡蛇道館', '黑袍糾察隊', '花木蘭', '黑豹', '魔鬼神探', 'Bill & Ted Face the Music', 'Raised by Wolves', '小子難纏', '雨傘學院', '007：生死交戰', '變種人', 'Sadak 2', '蝙蝠俠', 'Lovecraft Country', "I'm Thinking of Ending Things", 'Yellowstone', 'The Haunting of Bly Manor', 'After We Collided', '冰與火之歌：權力遊戲', '傳奇42號', '超自然檔案', 'Dune', '墮落', '阿比阿弟的冒險', '誓血五人組', '辦公室瘋雲', '實習醫生', 'Away', '闇', 'Project Power', 'Enola Holmes', '絕命毒師', '陰屍路', '地球百子', 'The Suicide Squad', '無恥之徒', '犯罪心理', '暴走曼哈頓', '維京傳奇', '禁錮之慾', 'Strike', '怪奇物語', '富家窮路', '浴血黑幫', '鋒迴路轉', '全面啟動', '追殺艾娃', '六人行', 'Young Wallander', 'Love, Guaranteed', 'Ted Lasso', '小丑', '廢柴聯盟', '復仇者聯盟：終局之戰', '曼達洛人', '激樂人心', '黑錢勝地', '俏妞報到', '冰血暴', '摩登家庭', '星際效應', 'The Legend of Korra', '搏擊王國', 'The Binge', 'Dirty John', '王冠', '美國恐怖故事', '飆風不歸路', 'The Karate Kid Part III', 'The Frozen Ground', '黑暗元素', '黑鏡', '宅男行不行', 'Black Panther II', '黑白正義', "Bill & Ted's Bogus Journey", '屍速列車：感染半島', 'The One and Only Ivan'

In [8]:
from google.colab import files
df.to_csv('filename.csv') 
files.download('filename.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
df = pd.DataFrame(data) 


In [10]:
from bs4 import BeautifulSoup
import requests
import re

# Download IMDB's Top 250 data
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
links = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]

imdb = []

# Store each item into dictionary (data), then put those into a list (imdb)
for index in range(0, len(movies)):
    # Seperate movie into: 'place', 'title', 'year'
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    year = re.search('\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index))-(len(movie))]
    data = {"movie_title": movie_title,
            "year": year,
            "place": place,
            "star_cast": crew[index],
            "rating": ratings[index],
            "vote": votes[index],
            "link": links[index]}
    imdb.append(data)

for item in imdb:
    print(item['place'], '-', item['movie_title'], '('+item['year']+') -', 'Starring:', item['star_cast'])



1 - 刺激1995 (1994) - Starring: Frank Darabont (dir.), Tim Robbins, Morgan Freeman
2 - 教父 (1972) - Starring: Francis Ford Coppola (dir.), Marlon Brando, Al Pacino
3 - 教父第二集 (1974) - Starring: Francis Ford Coppola (dir.), Al Pacino, Robert De Niro
4 - 黑暗騎士 (2008) - Starring: Christopher Nolan (dir.), Christian Bale, Heath Ledger
5 - 十二怒漢 (1957) - Starring: Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb
6 - 辛德勒的名單 (1993) - Starring: Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes
7 - 魔戒三部曲：王者再臨 (2003) - Starring: Peter Jackson (dir.), Elijah Wood, Viggo Mortensen
8 - 黑色追緝令 (1994) - Starring: Quentin Tarantino (dir.), John Travolta, Uma Thurman
9 - 黃昏三鏢客 (1966) - Starring: Sergio Leone (dir.), Clint Eastwood, Eli Wallach
1 -  魔戒首部曲：魔戒現身 (2001) - Starring: Peter Jackson (dir.), Elijah Wood, Ian McKellen
11 - 鬥陣俱樂部 (1999) - Starring: David Fincher (dir.), Brad Pitt, Edward Norton
12 - 阿甘正傳 (1994) - Starring: Robert Zemeckis (dir.), Tom Hanks, Robin Wright
13 - 全面啟動 (2010) - Starring: Chris

In [12]:
df = pd.DataFrame(imdb)

In [13]:
df

Unnamed: 0,movie_title,year,place,star_cast,rating,vote,link
0,刺激1995,1994,1,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.222690756864766,,/title/tt0111161/
1,教父,1972,2,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.148900047198124,,/title/tt0068646/
2,教父第二集,1974,3,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.98117325210481,,/title/tt0071562/
3,黑暗騎士,2008,4,"Christopher Nolan (dir.), Christian Bale, Heat...",8.973070780802956,,/title/tt0468569/
4,十二怒漢,1957,5,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.92999270997998,,/title/tt0050083/
...,...,...,...,...,...,...,...
245,魔鬼終結者,1984,246,"James Cameron (dir.), Arnold Schwarzenegger, L...",8.009104496890966,,/title/tt0088247/
246,紅色情深,1994,247,"Krzysztof Kieslowski (dir.), Irène Jacob, Jean...",8.007454726993224,,/title/tt0111495/
247,阿拉丁,1992,248,"Ron Clements (dir.), Scott Weinger, Robin Will...",8.007131678148278,,/title/tt0103639/
248,橘子收成時,2013,249,"Zaza Urushadze (dir.), Lembit Ulfsak, Elmo Nüg...",8.007099048362166,,/title/tt2991224/
