# Scrape movie data from IMDb

## Import modules

In [1]:
import requests
import bs4
import pandas
import datetime

## Set movies to pull

- `to_pull` must be a multiple of 50; any remainder is dropped when the page count is computed.
- There are 50 movies per results page.

# Total number of movies to scrape; divided by 50 below to get the page count.
to_pull = 500

In [3]:
# Number of result pages to request (50 movies per page). Floor division keeps
# the arithmetic in integers instead of round-tripping through a float.
pages = to_pull // 50

In [4]:
# Display the computed page count.
pages

10

## Set pages list

In [5]:
# 1-based page numbers, one per results page to fetch.
pages_list = [*range(1, pages + 1)]

In [6]:
# Display the page numbers that will be requested.
pages_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [7]:
# Genre slug inserted into the search URL's `genres` query parameter and
# recorded alongside every scraped row.
url_genre = "sci_fi"

## Create dictionary to store data 

In [8]:
# Column-oriented store: each key becomes a DataFrame column and each list
# receives one entry per scraped movie. A comprehension is used so that every
# column gets its own independent list.
movie_dictionary = {
    column: []
    for column in (
        "year",
        "title",
        "index",
        "rating",
        "number_votes",
        "runtime",
        "genre",
        "url_genre",
        "description",
        "date_extracted",
    )
}

In [9]:
# Timestamp taken once, before scraping, so every stored row carries the same
# extraction time. Called without a tz argument, so the value is naive local time.
date = datetime.datetime.now()

In [10]:
# Characters kept when cleaning the scraped year text; every other character
# is stripped before the value is converted to an int.
clean_year_chars = "1234567890"


## Scrape each results page and collect the movie fields

In [11]:
def _text_or_none(tag):
    """Return the stripped text of a parsed tag, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _attr_or_none(tag, attribute):
    """Return the given attribute of a parsed tag, or None when the tag or attribute is missing."""
    return tag.get(attribute) if tag is not None else None


def _parse_year(raw_year):
    """Extract a year from scraped text such as "(2016)".

    Only the first four digits are used, so a range like "(2016–2018)" yields
    2016 instead of the concatenation 20162018. Returns None when the text is
    missing or contains no digits.
    """
    if not raw_year:
        return None
    digits = "".join(char for char in raw_year if char in clean_year_chars)
    return int(digits[:4]) if digits else None


# Fetch every results page and append one row per movie to movie_dictionary.
# NOTE: re-running this cell appends to the existing lists; rebuild
# movie_dictionary first to avoid duplicate rows.
for page_number in pages_list:

    url = (
        "https://www.imdb.com/search/title?pf_rd_m=A2FGELUUNOQJNL"
        "&pf_rd_p=b9121fa8-b7bb-4a3e-8887-aab822e0b5a7"
        "&pf_rd_r=M7EN5FCB22Q4BM2PT32C&pf_rd_s=right-6&pf_rd_t=15506"
        "&pf_rd_i=moviemeter&genres=" + str(url_genre)
        + "&explore=title_type,genres&sort=user_rating,desc&page=" + str(page_number)
        + "&view=advanced&title_type=tvMovie&ref_=adv_explore_rhs"
    )

    # A timeout stops a stalled connection from hanging the notebook, and
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    request = requests.get(url, timeout=30)
    request.raise_for_status()
    soup = bs4.BeautifulSoup(request.text, "lxml")
    movies = soup.find_all("div", {"class": "lister-item"})

    for movie in movies:

        # Each field falls back to None when its markup is absent, so one
        # incomplete listing does not abort the whole scrape.

        #year
        year = _parse_year(
            _text_or_none(movie.find("span", {"class": "lister-item-year"}))
        )

        #title
        header = movie.find("h3", {"class": "lister-item-header"})
        title = _text_or_none(header.find("a") if header is not None else None)

        #index
        rank = _text_or_none(movie.find("span", {"class": "lister-item-index"}))

        #rating (kept as the raw "data-value" string)
        rating = _attr_or_none(
            movie.find("div", {"class": "inline-block"}), "data-value"
        )

        #number votes (kept as the raw "data-value" string)
        votes = _attr_or_none(movie.find("span", {"name": "nv"}), "data-value")

        #runtime
        runtime = _text_or_none(movie.find("span", {"class": "runtime"}))

        #genre
        genre = _text_or_none(movie.find("span", {"class": "genre"}))

        #description: the second "text-muted" paragraph of the listing
        muted_paragraphs = movie.find_all("p", {"class": "text-muted"})
        desc = (
            muted_paragraphs[1].text.strip() if len(muted_paragraphs) > 1 else None
        )

        #store
        movie_dictionary["year"].append(year)
        movie_dictionary["title"].append(title)
        movie_dictionary["index"].append(rank)
        movie_dictionary["rating"].append(rating)
        movie_dictionary["number_votes"].append(votes)
        movie_dictionary["runtime"].append(runtime)
        movie_dictionary["genre"].append(genre)
        movie_dictionary["url_genre"].append(url_genre)
        movie_dictionary["description"].append(desc)
        movie_dictionary["date_extracted"].append(date)

## Store dictionary in pandas dataframe

In [12]:
# Build the table from the column lists; dictionary keys become column names.
data = pandas.DataFrame(data=movie_dictionary)

In [13]:
# Display the resulting DataFrame.
data

Unnamed: 0,year,title,index,rating,number_votes,runtime,genre,url_genre,description,date_extracted
0,2016,Reset,1.,9.6,41,50 min,"Adventure, Drama, Family",sci_fi,When the Earth speaks... who will listen? RESE...,2018-06-14 10:07:42.056943
1,2013,Doctor Who 50th Anniversary Trailer,2.,9.2,12,1 min,Sci-Fi,sci_fi,"A rapid journey through time and space, create...",2018-06-14 10:07:42.056943
2,2011,Valley of the Scorned,3.,9.2,5,10 min,Sci-Fi,sci_fi,A sci-fi action drama that takes place in a po...,2018-06-14 10:07:42.056943
3,2015,The Most Unusual Mr Wood,4.,9.2,5,45 min,"Adventure, Drama, Sci-Fi",sci_fi,Jacob Wood is an ordinary young man with an ex...,2018-06-14 10:07:42.056943
4,1988,The Giftie,5.,9.1,11,60 min,"Comedy, Sci-Fi",sci_fi,What would happen if you could clone yourself?...,2018-06-14 10:07:42.056943
5,2010,"The Rusty Bucket Kids: Lincoln, Journey to 16",6.,9.1,17,,"Adventure, Family, Sci-Fi",sci_fi,All aboard with Steamy the Time-Travel-Express...,2018-06-14 10:07:42.056943
6,2016,The Walking Dead: The Journey So Far,7.,9,263,86 min,"Documentary, Drama, Horror",sci_fi,"From the moment Rick wakes up in the hospital,...",2018-06-14 10:07:42.056943
7,2016,Mars Project,8.,9,5,,"Sci-Fi, Thriller",sci_fi,A team of explorers arrive on Mars to join the...,2018-06-14 10:07:42.056943
8,2017,The Brotherhood,9.,9,10,84 min,"Action, Adventure, Drama",sci_fi,"A teenage operative, who struggles to build a ...",2018-06-14 10:07:42.056943
9,1988,Sobache serdtse,10.,8.9,5255,136 min,"Comedy, Drama, Sci-Fi",sci_fi,Professor Preobrazhensky and his colleague pla...,2018-06-14 10:07:42.056943
