In [1]:
# Python Package imports
import concurrent.futures
import glob
import os
import threading

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.misc
from bs4 import BeautifulSoup
from dateutil.parser import parse


In [2]:
# Upper bound on the number of worker threads used for the concurrent downloads
MAX_THREADS: int = 200

In [3]:
# Module-level result buffers: main() appends one entry per scraped movie to
# each of these six lists, and they later become the DataFrame columns.
(
    movie_title_arr,
    movie_year_arr,
    movie_genre_arr,
    movie_synopsis_arr,
    image_url_arr,
    image_id_arr,
) = ([] for _ in range(6))

In [4]:
def getMovieTitle(header):
    """Return the movie title found in a list-item header.

    Parameters
    ----------
    header : sequence
        Result of ``find_all`` for the item's header; only ``header[0]`` is used.

    Returns
    -------
    str
        Text of the first ``<a>`` inside ``header[0]``, or ``'NA'`` when the
        header is empty/missing or has no link.
    """
    try:
        return header[0].find("a").getText()
    except Exception:
        # Best-effort scrape: any lookup failure maps to the 'NA' sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 'NA'

def getReleaseYear(header):
    """Return the release-year text found in a list-item header.

    Parameters
    ----------
    header : sequence
        Result of ``find_all`` for the item's header; only ``header[0]`` is used.

    Returns
    -------
    str
        Text of the ``<span class="lister-item-year text-muted unbold">`` inside
        ``header[0]`` exactly as it appears on the page, or ``'NA'`` when it
        cannot be found.
    """
    try:
        return header[0].find("span",  {"class": "lister-item-year text-muted unbold"}).getText()
    except Exception:
        # Best-effort scrape: any lookup failure maps to the 'NA' sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 'NA'

def getGenre(muted_text):
    """Return the genre text of a movie entry.

    Parameters
    ----------
    muted_text : element or None
        The element that is searched for ``<span class="genre">``.

    Returns
    -------
    str
        Raw text of the genre span (whitespace is not trimmed), or ``'NA'``
        when ``muted_text`` is ``None`` or contains no genre span.
    """
    try:
        return muted_text.find("span",  {"class":  "genre"}).getText()
    except Exception:
        # Best-effort scrape: any lookup failure maps to the 'NA' sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 'NA'

def getsynopsys(movie):
    """Return the synopsis text of a movie entry.

    Parameters
    ----------
    movie : element
        The movie's container element.

    Returns
    -------
    str
        Raw text of the second ``<p class="text-muted">`` inside ``movie``, or
        ``'NA'`` when fewer than two such paragraphs exist or the lookup fails.
    """
    try:
        return movie.find_all("p", {"class":  "text-muted"})[1].getText()
    except Exception:
        # Best-effort scrape: any lookup failure maps to the 'NA' sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 'NA'

def getImage(image):
    """Return the poster URL stored in an image element's ``loadlate`` attribute.

    Parameters
    ----------
    image : element or None
        Object exposing ``get(attribute_name)``.

    Returns
    -------
    str
        The ``loadlate`` attribute value, or ``'NA'`` when ``image`` is ``None``,
        the attribute is absent, or the lookup fails.
    """
    try:
        url = image.get('loadlate')
    except Exception:
        # Best-effort scrape: any lookup failure maps to the 'NA' sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 'NA'
    # A missing attribute yields None from .get(); report it with the same
    # sentinel as every other failure instead of leaking None into the column.
    return 'NA' if url is None else url

def getImageId(image):
    """Return the identifier stored in an image element's ``data-tconst`` attribute.

    Parameters
    ----------
    image : element or None
        Object exposing ``get(attribute_name)``.

    Returns
    -------
    str
        The ``data-tconst`` attribute value, or ``'NA'`` when ``image`` is
        ``None``, the attribute is absent, or the lookup fails.
    """
    try:
        image_id = image.get('data-tconst')
    except Exception:
        # Best-effort scrape: any lookup failure maps to the 'NA' sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 'NA'
    # A missing attribute yields None from .get(); report it with the same
    # sentinel as every other failure instead of leaking None into the column.
    return 'NA' if image_id is None else image_id

In [5]:
# main() runs on many threads at once. Appending a movie's six fields to six
# separate global lists is not atomic as a group, so without this lock two
# threads can interleave and shift the columns against each other.
_RESULTS_LOCK = threading.Lock()


def main(imdb_url):
    """Scrape one search-results page and append its movies to the global lists.

    For every ``div.lister-item.mode-advanced`` on the page, the title, release
    year, genre, synopsis, poster URL and image id are extracted with the
    ``get*`` helpers (each falls back to ``'NA'``) and appended to
    ``movie_title_arr``, ``movie_year_arr``, ``movie_genre_arr``,
    ``movie_synopsis_arr``, ``image_url_arr`` and ``image_id_arr``.

    Parameters
    ----------
    imdb_url : str
        URL of the results page to download.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(imdb_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # One container per movie
    movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})

    # Collect complete rows locally first; the shared lists are only touched
    # once per page, under the lock.
    page_rows = []
    for movie in movies_list:
        header = movie.find_all("h3", {"class":  "lister-item-header"})

        # Entries without a text-muted paragraph or an image block must not
        # abort the whole page; the helpers turn None into 'NA'.
        muted_paragraphs = movie.find_all("p", {"class":  "text-muted"})
        muted_text = muted_paragraphs[0] if muted_paragraphs else None
        imageDiv = movie.find("div", {"class": "lister-item-image float-left"})
        image = imageDiv.find("img", "loadlate") if imageDiv is not None else None

        page_rows.append((
            getMovieTitle(header),     # Movie title
            getReleaseYear(header),    # Movie release year
            getGenre(muted_text),      # Genre of movie
            getsynopsys(movie),        # Movie synopsis
            getImage(image),           # Poster URL
            getImageId(image),         # Image id (data-tconst)
        ))

    with _RESULTS_LOCK:
        for title, year, genre, synopsis, img_url, image_id in page_rows:
            movie_title_arr.append(title)
            movie_year_arr.append(year)
            movie_genre_arr.append(genre)
            movie_synopsis_arr.append(synopsis)
            image_url_arr.append(img_url)
            image_id_arr.append(image_id)

In [6]:
# How many result pages to generate URLs for
MAX_PAGE = 200

# Every search URL that will be queried, in page order
imageArr = []

# Build one URL per page. The first page starts at 0; every later page starts
# at 250*page + 1. Each start offset is echoed as it is generated.
for i in range(MAX_PAGE):
    if i == 0:
        totalRecords = 0
    else:
        totalRecords = (250 * i) + 1
    print(totalRecords)
    imdb_url = f'https://www.imdb.com/search/title/?release_date=2020-01-02,2021-02-01&user_rating=4.0,10.0&languages=en&count=250&start={totalRecords}&ref_=adv_nxt'
    imageArr += [imdb_url]

0
251
501
751
1001
1251
1501
1751
2001
2251
2501
2751
3001
3251
3501
3751
4001
4251
4501
4751
5001
5251
5501
5751
6001
6251
6501
6751
7001
7251
7501
7751
8001
8251
8501
8751
9001
9251
9501
9751
10001
10251
10501
10751
11001
11251
11501
11751
12001
12251
12501


In [7]:
def download_stories(story_urls):
    """Run ``main`` on every URL concurrently using a thread pool.

    The pool size is ``min(MAX_THREADS, number of URLs)``. A failure on one URL
    is reported on stdout and does not stop the remaining downloads.

    Parameters
    ----------
    story_urls : iterable of str
        URLs to pass to ``main``, one call per URL.

    Returns
    -------
    None
    """
    story_urls = list(story_urls)
    if not story_urls:
        # ThreadPoolExecutor raises ValueError for max_workers=0.
        return

    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        pending = {executor.submit(main, url): url for url in story_urls}
        # Retrieve every result: an exception raised inside a worker is only
        # visible when the future is inspected, otherwise it is lost silently.
        for future in concurrent.futures.as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                print(f'Failed to scrape {pending[future]}: {exc!r}')

In [8]:
# Start from empty buffers so that re-running this cell does not append a second
# copy of every movie to the lists that main() fills.
for result_buffer in (movie_title_arr, movie_year_arr, movie_genre_arr,
                      movie_synopsis_arr, image_url_arr, image_id_arr):
    result_buffer.clear()

# Call the download function with the array of URLS called imageArr
download_stories(imageArr)

# Attach all the data to the pandas dataframe, one column per result list
movieDf = pd.DataFrame({
    "Title": movie_title_arr,
    "Release_Year": movie_year_arr,
    "Genre": movie_genre_arr,
    "Synopsis": movie_synopsis_arr,
    "image_url": image_url_arr,
    "image_id": image_id_arr,
})

# Persist the scrape (pandas writes UTF-8 by default), then announce completion
# only once the file actually exists.
movieDf.to_csv('movies_data_test.csv', index=False)

print('--------- Download Complete CSV Formed --------')

movieDf.head()

--------- Download Complete CSV Formed --------


Unnamed: 0,Title,Release_Year,Genre,Synopsis,image_url,image_id
0,The Best of WWE: Royal Rumble Matches of the 2...,(2021 Video),\nSport,"\nThe Women of WWE has make a history, Edge wi...",https://m.media-amazon.com/images/M/MV5BMWUwOT...,tt13919658
1,No deixis que m'enfonsi,(2020),\nDrama,\nAdd a Plot\n,https://m.media-amazon.com/images/M/MV5BMjQwNj...,tt11280926
2,Happy Birthday Coley,(2020),"\nShort, Comedy",\nAdd a Plot\n,https://m.media-amazon.com/images/M/MV5BMmZiZW...,tt12239304
3,Janes Addiction Replay 2020 - Virtual Lollapal...,(2021 Video),"\nShort, Music",\nAdd a Plot\n,https://m.media-amazon.com/images/S/sash/NapCx...,tt14101604
4,The Bargain,(2020),"\nShort, Sci-Fi",\nIndentured in the service of Hue - the creat...,https://m.media-amazon.com/images/M/MV5BZTllOD...,tt14372232


In [None]:
# The CSV is produced by DataFrame.to_csv with its default UTF-8 encoding, so it
# must be decoded as UTF-8; ISO-8859-1 garbles every non-ASCII character.
movies = pd.read_csv("movies_data_test.csv", encoding="utf-8")

#Print the label categories we are working with
print(movies.columns)


#Print the total amount of movies in the dataset
length = len(movies)
print("Total Number of Movies is: " + str(length))

#We will count the number of movies with a specific genre
# Each Genre cell is a comma-separated list with stray newlines/spaces around
# the names. Strip every token so "\nShort", " Comedy" and "Comedy" are not
# counted as different genres, and skip missing cells instead of counting "nan".
genrelist = [
    genre.strip()
    for genres in movies["Genre"].dropna()
    for genre in str(genres).split(",")
    if genre.strip()
]

#We will also display number of Genres
unique_genres = sorted(set(genrelist))
print("Total Number of Genres is: " + str(len(unique_genres)))

In [None]:
# Directory that holds the poster images
path = 'posters/'

# Movie metadata; the CSV is written by DataFrame.to_csv with its default UTF-8
# encoding, so decode it as UTF-8 rather than ISO-8859-1.
data = pd.read_csv("movies_data_test.csv", encoding="utf-8")


# All JPEG posters in the directory; os.path.join avoids the doubled separator
# that plain string concatenation produced ("posters//*.jpg").
image_glob = glob.glob(os.path.join(path, "*.jpg"))

# Maps poster id (file name without extension) -> decoded image array
img_dict = {}


def get_id(filename):
    """Return the poster id encoded in a file path.

    The id is the file name without its directory and without its extension,
    e.g. ``"posters/2.jpg"`` -> ``"2"``.

    Parameters
    ----------
    filename : str
        Path to a poster file.

    Returns
    -------
    str
        The bare file name with the extension removed.
    """
    # os.path handles the platform's separators and any extension casing; the
    # previous manual slicing dropped the last character whenever ".jpg" was
    # not found (e.g. ".JPG") and ignored Windows path separators.
    stem, _extension = os.path.splitext(os.path.basename(filename))
    return stem

# Decode every poster into img_dict, keyed by its id.
# scipy.misc.imread was removed in SciPy 1.2, and the old bare "except: pass"
# hid that failure and left img_dict empty; matplotlib's imread is used instead.
for fn in image_glob:
    try:
        img_dict[get_id(fn)] = plt.imread(fn)
    except Exception as exc:
        # Still best-effort: skip a file that cannot be decoded, but say so.
        print(f"Skipping unreadable poster {fn}: {exc!r}")

def show_img(id):
    """Plot the poster for ``id`` with the movie's title and genre as the caption.

    Parameters
    ----------
    id : str or int
        Poster id. It is compared as text against the ``image_id`` column of
        ``data`` and used as the key into ``img_dict``, so both numeric ids and
        IMDb-style ids such as ``"tt13919658"`` work.

    Raises
    ------
    KeyError
        If no row of ``data`` has this ``image_id`` or no poster was loaded
        for it.
    """
    key = str(id)

    # Compare as strings: int(id) raised ValueError for "tt..." ids and never
    # matched a column that holds string ids.
    matches = data[data["image_id"].astype(str) == key]
    if matches.empty:
        raise KeyError(f"no movie with image_id {key!r} in the metadata")
    if key not in img_dict:
        raise KeyError(f"no poster image loaded for id {key!r}")

    movie = matches.iloc[0]
    plt.imshow(img_dict[key])
    plt.title("{} \n {}".format(movie["Title"], movie["Genre"]))

# Demo: plot the poster stored under id "2", captioned with its title and genre.
show_img("2")