# Project Overlord: Movie Predictor
### This project is dedicated to the legendary action hero Steven Seagal.
![title](steven.jpg)
## What does it do?
### Predict
- Average rating of movies on IMDb

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

headers = [
        "title",
        "rating",
        "Year",
        "runtime",
        "Mpa",
        "raters",
        "budget",
        #"gross USA Ca gross_US_CA,
        "gross",
        "color",
        "meta score",
        "Director",
        "Writer",
        "main Actor",
        "second Actor",
        "genre",
    ]


In [3]:
start = 1
links = []

for i in range(start):
    url = "https://www.imdb.com/search/title/?groups=top_1000&view=simple&sort=user_rating,desc&count=100&start={start}&ref_=adv_nxt"
    start = start + 100
    page = requests.get(url)
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page)
    search1 = soup.find_all(class_="lister-item-index unbold text-primary")
    link_list = [i.findNext().findChildren()[0]["href"] for i in search1]
    links.extend(link_list)
    start += 100

In [4]:
class IMDBScraper:


   def __init__(self, url):

       self.url = url

       self.download_page()

   def download_page(self):

       # method for downloading the hotel page

       self.page = requests.get(self.url).text

   def scrape_data(self):

       #method for scraping out movie title and description

       soup = BeautifulSoup(self.page, "html.parser")
    #Title
       movie_title = soup.find("h1", {"data-testid": "hero-title-block__title"}).text
    #Rating
       rating = float(soup.find('div', {"data-testid": "hero-rating-bar__aggregate-rating__score"}).text.strip().replace("/10",""))
    #Raters
       raters = soup.find('div', {"data-testid": "hero-rating-bar__aggregate-rating"}).text.split("/")[1]

       if len(raters.replace("M", "")) < len(raters):
        raters = int(float(raters.replace("M", ""))*1000000)
       elif len(raters.replace("K", "")) < len(raters):
        raters = int(float(raters.replace("K", ""))*1000)
    #Gross US and Canada
       try:
        gross_US_CA = int(soup.find('li', {"data-testid": "title-boxoffice-grossdomestic"}).text.strip().replace("$", "").replace(",", "").replace("Gross US & Canada", ""))
       except:
        gross_US_CA = np.nan
    #Gross World wide
       try:
        gross = int(soup.find('li', {"data-testid": "title-boxoffice-cumulativeworldwidegross"}).text.strip().replace("$", "").replace(",", "").replace("Gross worldwide", ""))
       except:
        gross = np.nan
       
    #Release year, Mpa
       yearMpa = soup.find('ul', {"data-testid": "hero-title-block__metadata"}).text
       
       releaseYear = int(yearMpa[0:4:1])

       if len(yearMpa) == 14: #check if Mpa info is avilable
        Mpa = np.nan
       elif yearMpa[9] == 'R': #check if movie is rated R
        Mpa = 18
       elif yearMpa[8] == 'P': #check if movie is rated PG\n",
        Mpa = 13
       elif len(yearMpa) == 16: #check if Mpa is than age 10\n",
        Mpa = yearMpa[9]
       else:
        Mpa = yearMpa[10:12:1]

    #runtime
       runtime = soup.find('li', {"data-testid":"title-techspec_runtime"}).text.replace("Runtime", "").replace("hours ", "").replace("hour ", "").replace("hours", "").replace(" minutes", "")
       
       hoursMinutes = runtime.split(" ")
       try:
        minutes = int(hoursMinutes[0])*60 + int(hoursMinutes[1])
       except:
        minutes = np.nan
        print(movie_title, " didn't have runtime") 
        
    #Budget
       try:
           budget = int(soup.find('li', {"data-testid": "title-boxoffice-budget"}).text.strip().replace("$", "").replace("(estimated)", "").replace("Budget", "").replace(",", ""))

       except:
           budget = np.nan
    #Color
       try:
        color = soup.find('li', {"data-testid": "title-techspec_color"}).text.strip()
       except:
        color = "Color"
    #Meta Score
       try:
          meta_score = int(soup.find('span', {"class" : "score-meta"}).text.strip())
       except:
          meta_score = np.nan
   
   #Genre
       genre = soup.find("div", {"data-testid": "genres"}).find("span", class_ = "ipc-chip__text").text
       
   #Director
       director = soup.find('section', {"data-testid" : "title-cast"}).find('a', class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link").text

      
   #Writer
       try:
        writer = soup.find('section', {"data-testid" : "title-cast"}).find('a', class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link").find_next().text.strip().replace("Writers", "").replace("Writer", "").split("(")[0]
       except:
        writer = soup.find('section', {"data-testid" : "title-cast"}).find('a', class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link").find_next().text.strip().replace("Writers", "").replace("Writer", "")
   #1st Actor
       firstActor = soup.find('a', {"data-testid" : "title-cast-item__actor"}).text

   #2st Actor
       secondActor = soup.find('div', {"data-testid" : "title-cast-item"}).find_next('div', {"data-testid" : "title-cast-item"}).find('a', {"data-testid" : "title-cast-item__actor"}).text
    
   
       return {"title": movie_title,

               "rating" : rating,

               "Year" : releaseYear,

               "runtime" : minutes,

               "Mpa" : Mpa,

               "raters" : raters,

               "budget" : budget,

               #"gross USA Canada" : gross_US_CA,

               "gross" : gross,

               "color" : color,

               "meta score" : meta_score,

               "Director" : director,

               "Writer" : writer,

               "main Actor" : firstActor,

               "second Actor" : secondActor,

               "genre" : genre,

               }


In [5]:

movieInfo = pd.DataFrame(columns=headers)
counter = 0

for url in links:

   url = "https://www.imdb.com" + url
   x = IMDBScraper(url)

   df = pd.DataFrame([x.scrape_data()], columns=headers)

   movieInfo = pd.concat([movieInfo,df[headers]], join='outer', ignore_index=True)
   #print(df)


Old Boy - Hämnden  didn't have runtime


In [6]:
movieInfo.dropna( inplace=True)
movieInfo.reset_index(inplace=True,drop=True)
movieInfo.to_csv("../movieInfoUprising.tsv", sep="\t")

#### Now the data is collected and ready to be transformed

In [7]:
# Read the collected data
movieInfo = pd.read_csv('../movieInfoUprising.tsv', delimiter='\t', usecols=headers)

In [10]:
cols = ["Director", "Writer", "main Actor", "second Actor", "genre"]

directorDummies = pd.get_dummies(movieInfo["Director"])
writerDummies = pd.get_dummies(movieInfo["Writer"])
mainActorDummies = pd.get_dummies(movieInfo["main Actor"])
secondActorDummies = pd.get_dummies(movieInfo["second Actor"])
genreDummies = pd.get_dummies(movieInfo["genre"])


    Alfred Hitchcock  Andrew Stanton  Anthony Russo  Asghar Farhadi  \
0                  0               0              0               0   
1                  0               0              0               0   
2                  0               0              0               0   
3                  0               0              0               0   
4                  0               0              0               0   
..               ...             ...            ...             ...   
62                 0               0              0               0   
63                 0               0              0               0   
64                 0               0              0               0   
65                 0               0              0               1   
66                 0               0              0               0   

    Bob Persichetti  Bong Joon Ho  Bryan Singer  Charles Chaplin  \
0                 0             0             0                0   
1          