In [None]:
import requests
from bs4 import BeautifulSoup
import random
import pickle
import pandas as pd
import datetime

In [None]:
# HTTP request headers passed to requests.get when the calendar page is fetched.
# The values imitate a top-level page navigation from desktop Firefox 98
# (presumably so the site serves its regular HTML page — TODO confirm).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    # Only gzip and deflate are advertised as acceptable response encodings.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}

In [None]:
# skip if the data was pickled
# Download the IMDb release-calendar page. After this cell `html_file` holds the
# page HTML as a plain string, which is what the pickling and parsing cells use.
url = "https://www.imdb.com/calendar/?ref_=nv_mv_cal"
# requests has no default timeout; without one a stalled connection hangs the cell forever.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error status instead of caching and parsing an error page.
response.raise_for_status()
html_file = response.text

In [None]:
# pickle the data for later use
# `html_file` is already the HTML string (the fetch cell stores the response text
# in it), so it is dumped as-is; calling `.text` on a str raised AttributeError.
with open('picled_imdb.ext', 'wb') as pickle_file:
    pickle.dump(html_file, pickle_file)


In [None]:
# Read the cached page HTML back from disk into `html_file`.
# NOTE: unpickling can execute arbitrary code; only load a cache file this notebook wrote.
with open('picled_imdb.ext', 'rb') as old_file:
    html_file = pickle.loads(old_file.read())

In [None]:
# Parse the page HTML held in `html_file` using the 'html.parser' backend.
soup = BeautifulSoup(html_file, 'html.parser')

In [None]:
# One <article data-testid="calendar-section"> exists per release date. Select by
# that attribute rather than by the "sc-48add019-1 eovPBi" class names, which look
# like generated style hashes and can change between site builds.
movies_bunch_list = soup.find_all('article', attrs={"data-testid": "calendar-section"})

<article class="sc-48add019-1 eovPBi" data-testid="calendar-section">
 <div class="ipc-title ipc-title--base ipc-title--title ipc-title--on-textPrimary" data-testid="release-date">
  <hgroup>
   <h3 class="ipc-title__text">
    Aug 18, 2023
   </h3>
  </hgroup>
 </div>
 <ul class="ipc-metadata-list ipc-metadata-list--dividers-after sc-48add019-2 hqwybd ipc-metadata-list--base" role="presentation">
  <li class="ipc-metadata-list-summary-item ipc-metadata-list-summary-item--click sc-8c2b7f1f-0 bpqYIE" data-testid="coming-soon-entry">
   <div class="ipc-poster ipc-poster--base sc-8c2b7f1f-1 knEssc ipc-sub-grid-item ipc-sub-grid-item--span-2" data-testid="poster" role="group">
    <div class="ipc-media ipc-media--poster-27x40 ipc-image-media-ratio--poster-27x40 ipc-media--base ipc-media--custom ipc-poster__poster-image ipc-media__img" style="width:50px">
     <img alt="Blue Beetle (2023)" class="ipc-image" loading="lazy" sizes="50vw, (min-width: 480px) 34vw, (min-width: 600px) 26vw, (min-w

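### Info about the incoming HTML file
- release date of each group of movies: tag => h3, class => ipc-title__text
- movie name: tag => a, class => ipc-metadata-list-summary-item__t
- movie genres: tag => span, class => ipc-metadata-list-summary-item__li (inside the first inline `ul` of each movie entry)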




In [None]:
def getGenres(movie):
    """Return the genres of one calendar entry as a ", "-separated string.

    `movie` is the element of a single movie entry. The first inline `ul`
    found inside it is read as the genre list. If the entry contains no such
    list, an empty string is returned.
    """
    inline_lists = movie.find_all('ul', class_='ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__tl base')
    if not inline_lists:
        return ""
    # Only the first matching list is used; any later matches are ignored.
    genre_spans = inline_lists[0].find_all('span', class_='ipc-metadata-list-summary-item__li')
    return ", ".join(span.text for span in genre_spans)

In [None]:
# Build one row per movie: [title, link, release date, genres].
dataframe_list = []
for movies_list in movies_bunch_list:
    # Each date section has one heading that applies to all of its movies.
    movies_released_date = movies_list.find("h3", class_="ipc-title__text").text
    # Match the entries by their data-testid attribute instead of the
    # "sc-8c2b7f1f-0 bpqYIE" class names, which look like generated style hashes.
    movies = movies_list.find_all("li", attrs={"data-testid": "coming-soon-entry"})
    for movie in movies:
        genres = getGenres(movie)
        # Look for the title link inside the current entry. Searching the whole
        # section (`movies_list`) always returned the section's first title link,
        # so every row of a date repeated the same movie name and link.
        movie_info = movie.find("a", class_="ipc-metadata-list-summary-item__t")
        dataframe_list.append([movie_info.text, movie_info["href"], movies_released_date, genres])


In [None]:
# Column labels, in the same order as the values appended to each row of dataframe_list.
columns = ["movie_name", "link", "release_date", "Genres"]
df = pd.DataFrame(dataframe_list, columns=columns)
# Last expression of the cell: displays the first rows of the frame.
df.head()

Unnamed: 0,movie_name,link,release_date,Genres
0,Blue Beetle (2023),/title/tt9362930/?ref_=rlm,"Aug 18, 2023","Action, Adventure, Sci-Fi"
1,Blue Beetle (2023),/title/tt9362930/?ref_=rlm,"Aug 18, 2023","Animation, Adventure, Comedy"
2,Blue Beetle (2023),/title/tt9362930/?ref_=rlm,"Aug 18, 2023","Drama, Horror, Mystery"
3,Blue Beetle (2023),/title/tt9362930/?ref_=rlm,"Aug 18, 2023","Adventure, Drama, Sci-Fi"
4,Blue Beetle (2023),/title/tt9362930/?ref_=rlm,"Aug 18, 2023",Comedy


In [None]:
# Parse the scraped date strings (e.g. "Aug 18, 2023") into datetime values so
# they can be compared against date bounds.
df['release_date'] = pd.to_datetime(df['release_date'], format="%b %d, %Y")
# Display the column dtypes to check the conversion.
df.dtypes

movie_name              object
link                    object
release_date    datetime64[ns]
Genres                  object
dtype: object

In [None]:
# Bounds of the release-date window, written in the "%b %d, %Y" layout.
start_date = datetime.datetime.strptime("Sep 01, 2023", "%b %d, %Y")
end_date = datetime.datetime.strptime("Dec 25, 2023", "%b %d, %Y")
start_date, end_date

(datetime.datetime(2023, 9, 1, 0, 0), datetime.datetime(2023, 12, 25, 0, 0))

In [None]:
# Keep the rows whose release date lies between start_date and end_date, both ends included.
filtered_by_date_df = df[df['release_date'].between(start_date, end_date)]
number_of_movies = len(filtered_by_date_df)
print(f"The number of movies released in the interval is: {number_of_movies}", end=' ')

The number of movies released in the interval is: 100 

In [None]:
# Count how many movies in the interval carry each genre.
# - Genres are stored as one ", "-joined string per movie. Splitting on "," alone
#   left a leading space on every genre after the first, so "Drama" and " Drama"
#   were counted as different genres; each piece is therefore stripped.
# - Series.explode takes no column name; the stray 'Cast' argument was only being
#   read as a truthy `ignore_index` flag, so it is dropped.
# - Movies without any genre are stored as "", which must not count as a genre.
count_movies = (
    filtered_by_date_df['Genres']
    .str.split(',')
    .explode()
    .str.strip()
    .loc[lambda genre: genre != ""]
    .value_counts()
)

# value_counts sorts by descending count, so the first index label is the most frequent genre.
if count_movies.empty:
    print("No genres were found for the movies in the interval")
else:
    print(f"The most frequent genre amongst the movies in the interval is: {count_movies.index[0]}")

The most frequent genre amongst the movies in the interval is: Drama
