In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def fetch_imdb_data(url):
    # Sending a request to the IMDb URL and get the response
    response = requests.get(url)
    
    # Parseing the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Finding all movie containers on the page
    movie_containers = soup.find_all('div', class_='lister-item mode-advanced')
    
    # Lists to store data for CSV files
    csv1_data = []
    csv2_data = []

    # Loop through the first 100 movie containers
    for index, movie in enumerate(movie_containers[:100], start=1):
        # Extract movie details
        # Serial Number (sno)
        sno = index

        # Movie Name
        movie_name = movie.h3.a.text

        # Duration and Year
        duration_year = movie.find('span', class_='runtime').text
        duration = int(duration_year.split()[0])
        year = int(movie.find('span', class_='lister-item-year').text.strip('()'))

        # Ratings and Metascore
        ratings = float(movie.strong.text)
        metascore = int(movie.find('span', class_='metascore').text.strip()) if movie.find('span', class_='metascore') else None

        # Director and Stars
        credits = movie.find('p', class_='').text.strip().split('|')
        director = credits[0].strip().replace('Director:', '')
        stars = credits[1].strip().replace('Stars:', '')

        # Votes, Genre, Gross Collection, Popularity, and Certification
        votes = int(movie.find('span', attrs={'name': 'nv'})['data-value'].replace(',', ''))
        genre = movie.find('span', class_='genre').text.strip()
        gross = movie.find('p', class_='sort-num_votes-visible').find_all('span')[-1]['data-value']
        popularity_elem = movie.find('span', class_='global-sprite')
        popularity = float(popularity_elem.text) if (popularity_elem and popularity_elem.text.strip()) else None
        certification = movie.find('span', class_='certificate').text.strip() if movie.find('span', class_='certificate') else None

        # Append data to respective lists
        csv1_data.append([sno, movie_name, director, duration, year, ratings, metascore])
        csv2_data.append([movie_name, stars, votes, genre, gross, popularity, certification])

    # Creating DataFrames for CSV files
    csv1_df = pd.DataFrame(csv1_data, columns=['sno', 'Movie Name', 'Director', 'Duration', 'Year', 'Ratings', 'Metascore'])
    csv2_df = pd.DataFrame(csv2_data, columns=['Movie Name', 'Stars', 'Votes', 'Genre', 'Gross Collection', 'Popularity', 'Certification'])

    # Separating Director1 and Director2
    csv1_df[['Director1', 'Director2']] = csv1_df['Director'].str.split(', ', 1, expand=True)

    # Separating Stars into Star1, Star2, Star3, and Star4
    star_split = csv2_df['Stars'].str.split(', ', expand=True)
    csv2_df['Star1'] = star_split[0]
    csv2_df['Star2'] = star_split[1]
    csv2_df['Star3'] = star_split[2]
    csv2_df['Star4'] = star_split[3]

    # Separating Genre1, Genre2, and Genre3
    genre_split = csv2_df['Genre'].str.split(', ', 2, expand=True)
    csv2_df['Genre1'] = genre_split[0]
    csv2_df['Genre2'] = genre_split[1]
    csv2_df['Genre3'] = genre_split[2]

    # Droping unnecessary columns
    csv1_df.drop(columns=['Director'], inplace=True)
    csv2_df.drop(columns=['Stars', 'Genre'], inplace=True)

    return csv1_df, csv2_df

def save_to_csv(df, file_name):
    """Write ``df`` to ``file_name`` in CSV format, leaving out the index column."""
    include_index = False
    df.to_csv(path_or_buf=file_name, index=include_index)

if __name__ == "__main__":
    # Listing page to scrape; the query string asks for action feature films
    # sorted by user rating (descending) with a num_votes=25000, filter.
    url = 'https://www.imdb.com/search/title/?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=f11158cc-b50b-4c4d-b0a2-40b32863395b&pf_rd_r=XZ8X52H1R40B7KG5SNZ9&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1'
    csv1_df, csv2_df = fetch_imdb_data(url)

    # Persist each table to its own CSV file.
    save_to_csv(df=csv1_df, file_name='IMDbMoviesInfo.csv')
    save_to_csv(df=csv2_df, file_name='IMDbMoviesDetails.csv')

    # Tell the user the export finished.
    print("CSV files 'IMDbMoviesInfo.csv' and 'IMDbMoviesDetails.csv' have been created.")

    # Reload both tables from disk so later cells work on the saved data.
    csv1_df, csv2_df = pd.read_csv('IMDbMoviesInfo.csv'), pd.read_csv('IMDbMoviesDetails.csv')

CSV files 'IMDbMoviesInfo.csv' and 'IMDbMoviesDetails.csv' have been created.


In [3]:
# Show both reloaded tables, each under its own heading (optional inspection step).
print("\nContents of IMDbMoviesInfo.csv:", csv1_df, sep="\n")
print("\nContents of IMDbMoviesDetails.csv:", csv2_df, sep="\n")


Contents of IMDbMoviesInfo.csv:
    sno                                         Movie Name  Duration  Year  \
0     1                                    The Dark Knight       152  2008   
1     2      The Lord of the Rings: The Return of the King       201  2003   
2     3                Spider-Man: Across the Spider-Verse       140  2023   
3     4                                          Inception       148  2010   
4     5  The Lord of the Rings: The Fellowship of the Ring       178  2001   
5     6              The Lord of the Rings: The Two Towers       179  2002   
6     7                                         The Matrix       136  1999   
7     8     Star Wars: Episode V - The Empire Strikes Back       124  1980   
8     9                                    Soorarai Pottru       153  2020   
9    10                         Terminator 2: Judgment Day       137  1991   
10   11                                          Star Wars       121  1977   
11   12                        