# Preparing the IMDb dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import json
import requests
import time

api_key1 = "b58bb13f"
api_key2 = "9bf3218a"

RANDOM_STATE = 1212

cmu_data = pd.read_csv("cmu_dataset_v2.csv")
cmu_backups = pd.read_csv("cmu_movies_backup.csv")

##
asd = pd.DataFrame(columns=["Title", "Year", "Genre", "Plot"])
##

In [2]:
# using the cmu_data dataframe to get the movie names and years, make api requests to build a new dataframe with the new movies.
# if {"Response":"False","Error":"Movie not found!"}, the movie was not found and it will not be added to the new dataframe.
# instead, the movie will be added to a df of movies that were not found.
# if the plot summary contains less than 2 sentences, the movie will not be added to the new dataframe.
# use this code to check the length of the plot summary:
# plot_summaries = plot_summaries[plot_summaries["plot"].apply(lambda x: x.count(".") >= 2)]

# loop through the cmu_data dataframe and make api requests to get the movie data



# If no .csv files exist yet, uncomment the following lines and comment the others
# not_found = pd.DataFrame(columns=["movie_name", "year", "genre"])
# imdb_data = pd.DataFrame(columns=["movie_name", "year", "genre", "plot"])

# If the .csv files already exist, run the following lines
not_found = pd.read_csv("not_found_movies.csv")
imdb_data = pd.read_csv("imdb_data_found.csv")




for index, row in cmu_data.iterrows():
    if row["movie_name"] in not_found["movie_name"].values and row["year"] in not_found["year"].values and row["genre"] in not_found["genre"].values:
        continue
    if row["movie_name"] in imdb_data["movie_name"].values and row["year"] in imdb_data["year"].values and row["genre"] in imdb_data["genre"].values:
        continue

    url = f"http://www.omdbapi.com/?t={row['movie_name']}&y={row['year']}&plot=full&apikey={api_key1}"
    response = requests.get(url)
    data = response.json()

    time.sleep(0.5)

    if data["Response"] == "False":
        if data["Error"] == "Request limit reached!":
            print("Request limit reached!")
            break
        else: 
            not_found.loc[len(not_found)] = [row["movie_name"], int(row["year"]), row['genre']]
    else:
        if data["Plot"].count(".") >= 2:
            imdb_data.loc[len(imdb_data)] = [data['Title'], int(data['Year']), row['genre'], data['Plot']]
        else:
            not_found.loc[len(not_found)] = [row["movie_name"], int(row["year"]), row['genre']]


not_found.to_csv("not_found_movies.csv", index=False)
imdb_data.to_csv("imdb_data_found.csv", index=False)
                              



Request limit reached!


In [8]:
#not_found = pd.read_csv("not_found_movies.csv")

#not_found['year'] = not_found['year'].astype(int)
#not_found.to_csv("not_found_movies.csv", index=False)
#not_found.head()

Unnamed: 0,movie_name,year,genre
0,Andy Colby's Incredible Adventure,1988,Comedy
1,Dog Gone,2008,Comedy
2,Tiny Troubles,1939,Comedy
3,The Case of the Mukkinese Battle Horn,1956,Comedy
4,Byl jednou jeden polda,1995,Comedy


In [3]:
#not_found = pd.read_csv("not_found_movies.csv")
#imdb_data = pd.read_csv("imdb_data_found.csv")

# Removing movies that were not found from the CMU dataset and filling it back up

In [4]:
# Merge the dataframes on movie_name and year
merged = pd.merge(cmu_data, not_found, on=['movie_name', 'year', 'genre'], how='left', indicator=True)

# Filter out rows where movie_name and year match
filtered_data = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

# get the genres that were removed
genres = not_found["genre"].unique()

# get the genres that were removed and the number of items in each genre
genre_counts = not_found["genre"].value_counts()

print(genre_counts)




# merged.shape, filtered_data.shape

genre
Comedy    1306
Name: count, dtype: int64


In [5]:
comedy_movies = filtered_data[filtered_data["genre"].str.contains("Comedy")]
horror_movies = filtered_data[filtered_data["genre"].str.contains("Horror")]
thriller_movies = filtered_data[filtered_data["genre"].str.contains("Thriller")]
drama_movies = filtered_data[filtered_data["genre"].str.contains("Drama")]

# add 1500 items to each genre
# if the genre has less than 1500 items, add items from cmu_backups
# if adding a comedy movie, make sure it has no horror, thriller or drama in the genre

filtered_comedy = cmu_backups[cmu_backups['genre'].apply(lambda x: 'Comedy' in x and not any(genre in x for genre in ['Horror', 'Thriller', 'Drama']))]
comedy_movies = pd.concat([comedy_movies, filtered_comedy])
comedy_movies = comedy_movies.head(1500)

filtered_horror = cmu_backups[cmu_backups['genre'].apply(lambda x: 'Horror' in x and not any(genre in x for genre in ['Comedy', 'Thriller', 'Drama']))]
horror_movies = pd.concat([horror_movies, filtered_horror])
horror_movies = horror_movies.head(1500)

filtered_thriller = cmu_backups[cmu_backups['genre'].apply(lambda x: 'Thriller' in x and not any(genre in x for genre in ['Comedy', 'Horror', 'Drama']))]
thriller_movies = pd.concat([thriller_movies, filtered_thriller])
thriller_movies = thriller_movies.head(1500)

filtered_drama = cmu_backups[cmu_backups['genre'].apply(lambda x: 'Drama' in x and not any(genre in x for genre in ['Comedy', 'Horror', 'Thriller']))]
drama_movies = pd.concat([drama_movies, filtered_drama])
drama_movies = drama_movies.head(1500)

cmu_dataset = pd.concat([comedy_movies, horror_movies, thriller_movies, drama_movies])
cmu_dataset.reset_index(drop=True, inplace=True)

cmu_backups = cmu_backups[~cmu_backups.isin(cmu_dataset)].dropna()

cmu_dataset.to_csv("cmu_dataset_v2.csv", index=False)
cmu_backups.to_csv("cmu_movies_backup.csv", index=False)

cmu_dataset.shape, cmu_backups.shape, comedy_movies.shape, horror_movies.shape, thriller_movies.shape, drama_movies.shape

((6000, 5), (31375, 5), (1500, 5), (1500, 5), (1500, 5), (1500, 5))