# Scrape movie data from IMDB

## Import modules

In [1]:
import requests
import bs4
import pandas
import datetime

## Set number of movies to pull

- Must be a multiple of 50.
- Each results page lists 50 movies.

In [2]:
# Number of movies to scrape for each genre; expected to be a multiple of 50
# because every results page holds 50 movies.
to_pull = 150

In [3]:
# Number of results pages to request (50 movies per page). Floor division keeps
# the arithmetic in integers instead of round-tripping through a float.
pages = to_pull // 50

In [4]:
# Notebook display: show the computed number of pages.
pages

3

## Create genres list

In [5]:
# Fetch one advanced-search results page and read the links inside its
# "aux-content-widget-2" div; the "genres=..." query parameters found in those
# links become the list of selectable genre parameters.
url = "https://www.imdb.com/search/title?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b9121fa8-b7bb-4a3e-8887-aab822e0b5a7&pf_rd_r=M7EN5FCB22Q4BM2PT32C&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=moviemeter&genres=sci_fi&explore=title_type,genres&sort=user_rating,desc&page=1&title_type=tvMovie&view=advanced&ref_=adv_explore_rhs"

# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = bs4.BeautifulSoup(response.text, 'lxml')
genres = soup.find("div", {"class":"aux-content-widget-2"})
if genres is None:
    # Fail with an explicit message rather than an AttributeError on None.
    raise RuntimeError("Genre widget (div.aux-content-widget-2) not found in the search page")
links = genres.find_all("a")

# Distinct "genres=..." parameters, kept in first-seen order so the indexes
# printed below are stable for a given page.
genre_list = []

for link in links:
    href = link.get("href")
    if href is None:
        # An anchor without an href has no query string to inspect.
        continue
    for item in href.split("&"):
        if "genres=" in item and item not in genre_list:
            genre_list.append(item)

# Print every parameter together with its position in genre_list.
for index, genre_item in enumerate(genre_list):
    print(index, genre_item)

0 genres=sci_fi
1 genres=Sci-Fi
2 genres=Action
3 genres=Drama
4 genres=Thriller
5 genres=Horror
6 genres=Adventure
7 genres=Comedy
8 genres=Fantasy
9 genres=Animation
10 genres=Family
11 genres=Mystery
12 genres=Documentary
13 genres=Romance
14 genres=Crime
15 genres=Musical
16 genres=Music
17 genres=War
18 genres=History
19 genres=Sport
20 genres=Western
21 genres=Biography
22 genres=Reality-TV
23 genres=Talk-Show
24 genres=News
25 genres=Game-Show


## Set pages list

In [6]:
# Page numbers to request, counted from 1 up to and including `pages`.
pages_list = [*range(1, pages + 1)]

In [7]:
# Notebook display: show the list of page numbers.
pages_list

[1, 2, 3]

## Set genre parameters list

In [8]:
# Genres to scrape, named explicitly. Selecting them by position in genre_list
# would silently pick different genres whenever the scraped link order changes;
# spelling them out keeps the selection stable. Each value uses the
# "genres=<Name>" form that is spliced into the search URL.
genres_to_scrape = ("Crime", "Romance", "Comedy")
genre_parameters_list = ["genres=" + name for name in genres_to_scrape]

## Create dictionary to store data 

In [9]:
# Column-oriented store for the scraped rows: one independent, initially empty
# list per output column, in the order the columns should appear.
movie_dictionary = {
    column: []
    for column in (
        "year",
        "title",
        "index",
        "rating",
        "number_votes",
        "runtime",
        "genre",
        "url_genre",
        "description",
        "date_extracted",
    )
}

In [10]:
# Timestamp taken once, before scraping, so every stored row carries the same
# extraction time (naive local time, no tzinfo).
date = datetime.datetime.today()

In [11]:
# Characters kept when cleaning the scraped year text; any other character is
# stripped out before the value is converted to an int.
clean_year_chars = "1234567890"

## Process

In [12]:
# Scrape every requested results page for every selected genre and append one
# entry per movie to each column list in movie_dictionary.
for genre_parameter in genre_parameters_list:
    for page_number in pages_list:

        # Search URL with the genre parameter and page number spliced in.
        url = (
            "https://www.imdb.com/search/title?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b9121fa8-b7bb-4a3e-8887-aab822e0b5a7&pf_rd_r=M7EN5FCB22Q4BM2PT32C&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=moviemeter&"
            + genre_parameter
            + "&explore=title_type,genres&sort=user_rating,desc&page="
            + str(page_number)
            + "&view=advanced&title_type=tvMovie&ref_=adv_explore_rhs"
        )

        # A timeout avoids hanging on a stalled connection; raise_for_status()
        # stops the run on an HTTP error instead of parsing an error page.
        request = requests.get(url, timeout=30)
        request.raise_for_status()
        soup = bs4.BeautifulSoup(request.text, "lxml")
        movies = soup.find_all("div", {"class": "lister-item"})

        for movie in movies:

            #year: keep only the digit characters and use the first four, so
            #text holding several years does not merge into one number; a
            #missing tag or text without digits yields None instead of a crash
            year_tag = movie.find("span", {"class": "lister-item-year"})
            year_text = year_tag.text if year_tag is not None else ""
            year_digits = "".join(char for char in year_text if char in clean_year_chars)
            year = int(year_digits[:4]) if year_digits else None

            #title
            title = movie.find("h3", {"class": "lister-item-header"}).find("a").text.strip()

            #index: text of the item's position marker on the page
            rank = movie.find("span", {"class": "lister-item-index"}).text.strip()

            #rating: taken from the data-value attribute, stored as scraped
            rating = movie.find("div", {"class": "inline-block"}).get("data-value")

            #number votes: taken from the data-value attribute, stored as scraped
            votes = movie.find("span", {"name": "nv"}).get("data-value")

            #runtime: optional on the page, so a missing tag is stored as None
            runtime_tag = movie.find("span", {"class": "runtime"})
            runtime = runtime_tag.text.strip() if runtime_tag is not None else None

            #genre
            genre = movie.find_all("span", {"class": "genre"})[0].text.strip()

            #description: the second "text-muted" paragraph of the item
            desc = movie.find_all("p", {"class": "text-muted"})[1].text.strip()

            #store
            movie_dictionary["year"].append(year)
            movie_dictionary["title"].append(title)
            movie_dictionary["index"].append(rank)
            movie_dictionary["rating"].append(rating)
            movie_dictionary["number_votes"].append(votes)
            movie_dictionary["runtime"].append(runtime)
            movie_dictionary["genre"].append(genre)
            movie_dictionary["url_genre"].append(genre_parameter)
            movie_dictionary["description"].append(desc)
            movie_dictionary["date_extracted"].append(date)

## Store dictionary in pandas dataframe

In [13]:
# Turn the column lists into a DataFrame: one column per dictionary key, one
# row per scraped movie.
data = pandas.DataFrame(data=movie_dictionary)

In [14]:
# Notebook display: render the resulting DataFrame.
data

Unnamed: 0,year,title,index,rating,number_votes,runtime,genre,url_genre,description,date_extracted
0,1954,Zwischenfall im Roxy,1.,10,6,45 min,Crime,genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
1,1964,Usluga tacna i solidna,2.,10,9,,Crime,genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
2,1960,Natjecaj za crnu pricu,3.,9.9,9,,Crime,genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
3,1965,Party im Zwielicht,4.,9.8,5,65 min,"Crime, Drama",genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
4,1953,Kopf oder Zahl,5.,9.6,7,45 min,Crime,genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
5,1964,Ein langer Tag,6.,9.6,5,51 min,Crime,genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
6,1967,Das Attentat - L.D. Trotzki,7.,9.5,13,155 min,"Crime, History",genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
7,1965,Sonata facile,8.,9.5,11,60 min,Crime,genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
8,1966,Die Ermittlung,9.,9.4,22,155 min,"Crime, Drama",genres=Crime,This is a theatrical production on a cbs antho...,2018-06-14 11:43:35.535828
9,1981,Ubohý pan Kufalt,10.,9.4,8,185 min,"Comedy, Crime, Drama",genres=Crime,Add a Plot,2018-06-14 11:43:35.535828
