In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import re

In [None]:
# Datasets:

# Netflix Prize Dataset 
# https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data?resource=download

# Large Movie Review Dataset
# https://ai.stanford.edu/~amaas/data/sentiment/

# Web data: Amazon movie reviews
# https://snap.stanford.edu/data/web-Movies.html

# MovieLens
# https://grouplens.org/datasets/movielens/

# IMdB
# https://datasets.imdbws.com/

# Metacritic

In [37]:
# CMU corpus
df_cmu = pd.read_csv('./MovieSummaries/MovieSummaries/movie.metadata.tsv', sep='\t', header=None,
               names=['Wikipedia_ID', 'Freebase_ID', 'Movie_name', 'Release_date', 'Box_office', 'Runtime',
                     'Movie_languages', 'Movie_countries', 'Movie_genres'])

In [38]:
def extract_dictionnaries_from_string(country_string):
    if isinstance(country_string, str) and country_string != '{}':
        # Use regex to find all country names (text inside double quotes after the colon)
        return ', '.join(re.findall(r'": "([^"]+)"', country_string))  # Extract country names after ": "
    return np.nan  # Return NaN for empty strings or {}

df_cmu['Cleaned_countries'] = df_cmu['Movie_countries'].apply(extract_dictionnaries_from_string)
df_cmu['Cleaned_genres'] = df_cmu['Movie_genres'].apply(extract_dictionnaries_from_string)
df_cmu['Cleaned_languages'] = df_cmu['Movie_languages'].apply(extract_dictionnaries_from_string)
df_cmu['Release_date'] = pd.to_datetime(df_cmu['Release_date'], errors='coerce')


df_cmu.drop(['Movie_countries'], axis=1, inplace=True)
df_cmu.drop(['Movie_genres'], axis=1, inplace=True)
df_cmu.drop(['Movie_languages'], axis=1, inplace=True)
df_cmu.head()

Unnamed: 0,Wikipedia_ID,Freebase_ID,Movie_name,Release_date,Box_office,Runtime,Cleaned_countries,Cleaned_genres,Cleaned_languages
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,United States of America,"Thriller, Science Fiction, Horror, Adventure, ...",English Language
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,United States of America,"Mystery, Biographical film, Drama, Crime Drama",English Language
2,28463795,/m/0crgdbh,Brun bitter,NaT,,83.0,Norway,"Crime Fiction, Drama",Norwegian Language
3,9363483,/m/0285_cd,White Of The Eye,NaT,,110.0,United Kingdom,"Thriller, Erotic thriller, Psychological thriller",English Language
4,261236,/m/01mrr1,A Woman in Flames,NaT,,106.0,Germany,Drama,German Language


In [39]:
# Movieslens 
df_movielens = pd.read_csv('Movieslens/ml-32m/movies.csv')

In [55]:
# TMdB
df_tmdb = pd.read_csv('TMDB_movie_dataset_v11.csv')
df_tmdb['release_date'] = pd.to_datetime(df_tmdb['release_date'], errors='coerce')


In [41]:
# For movielens 
def remove_date(name):
    # Remove anything in parentheses, including the space before it
    name = re.sub(r'\s*\(.*?\)', '', name)
    return name

In [42]:
df_movielens['cleaned_title'] = df_movielens['title'].apply(remove_date)

In [43]:
def normalize_movie_name(name):
    # Check if the input is not a string
    name = str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove punctuation
    name = re.sub(r'[^\w\s]', '', name)
    # Replace multiple spaces with a single space
    name = re.sub(r'\s+', ' ', name)
    # Strip leading/trailing spaces
    name = name.strip()
    
    return name

In [56]:
df_cmu['normalized_name'] = df_cmu['Movie_name'].apply(normalize_movie_name)
df_movielens['normalized_name'] = df_movielens['cleaned_title'].apply(normalize_movie_name)
df_tmdb['normalized_name'] = df_tmdb['title'].apply(normalize_movie_name)

In [57]:
df_cmu.shape, df_movielens.shape, df_tmdb.shape

((81741, 10), (87585, 5), (1129227, 25))

In [128]:
common_movies_cmu_movielens = pd.merge(df_cmu, df_movielens, on='normalized_name', how='inner')
commom_movies_cmu_tmdb = pd.merge(df_cmu, df_tmdb, on='normalized_name', how='inner')

common_movies_cmu_movielens = common_movies_cmu_movielens.drop_duplicates(subset='normalized_name')
commom_movies_cmu_tmdb = commom_movies_cmu_tmdb.drop_duplicates(subset='normalized_name')

common_movies_cmu_movielens.shape, commom_movies_cmu_tmdb.shape

((27793, 14), (59359, 34))

In [59]:
common_movies_all_together = pd.merge(common_movies_cmu_movielens, commom_movies_cmu_tmdb, on='normalized_name', how='inner')
common_movies_all_together = common_movies_all_together.drop_duplicates(subset='normalized_name')
common_movies_all_together.shape

(27122, 47)

## Finding missing values of original dataset

#### Cleaning the runtime

In [60]:
commom_movies_cmu_tmdb.columns

Index(['Wikipedia_ID', 'Freebase_ID', 'Movie_name', 'Release_date',
       'Box_office', 'Runtime', 'Cleaned_countries', 'Cleaned_genres',
       'Cleaned_languages', 'normalized_name', 'id', 'title', 'vote_average',
       'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult',
       'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path', 'tagline',
       'genres', 'production_companies', 'production_countries',
       'spoken_languages', 'keywords'],
      dtype='object')

In [129]:
commom_movies_cmu_tmdb_cleaned = commom_movies_cmu_tmdb.drop(columns=['id', 'title', 'vote_average', 'vote_count', 'status',
                                                                     'adult', 'backdrop_path', 'homepage', 'original_language', 
                                                                      'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 
                                                                      'genres', 'production_companies', 'production_countries', 'spoken_languages', 'Freebase_ID',
                                                                     'Wikipedia_ID'])

commom_movies_cmu_tmdb_cleaned['Release_date'] = commom_movies_cmu_tmdb_cleaned['Release_date'].dt.year
commom_movies_cmu_tmdb_cleaned['release_date'] = commom_movies_cmu_tmdb_cleaned['release_date'].dt.year

In [130]:
commom_movies_cmu_tmdb_cleaned.head()

Unnamed: 0,Movie_name,Release_date,Box_office,Runtime,Cleaned_countries,Cleaned_genres,Cleaned_languages,normalized_name,release_date,revenue,runtime,budget,imdb_id,keywords
0,Ghosts of Mars,2001.0,14010832.0,98.0,United States of America,"Thriller, Science Fiction, Horror, Adventure, ...",English Language,ghosts of mars,2001.0,14010832,98,28000000,tt0228333,"future, planet mars, anti hero, possession, ho..."
1,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,,95.0,United States of America,"Mystery, Biographical film, Drama, Crime Drama",English Language,getting away with murder the jonbenét ramsey m...,2000.0,0,60,0,tt0245916,"colorado, jonbenet"
2,White Of The Eye,,,110.0,United Kingdom,"Thriller, Erotic thriller, Psychological thriller",English Language,white of the eye,1987.0,0,111,0,tt0094320,"based on novel or book, gas station, psychopat..."
3,A Woman in Flames,,,106.0,Germany,Drama,German Language,a woman in flames,1983.0,0,106,0,tt0083949,"jealousy, eroticism, gigolo, longing, dominatr..."
4,The Gangsters,1913.0,,35.0,United States of America,"Short Film, Silent film, Indie, Black-and-whit...","Silent film, English Language",the gangsters,1913.0,0,35,0,tt0002894,"slapstick comedy, keystone kops"


In [131]:
# Check if each pair of columns are equal and store the results
# Step 1: Replace NaN in 'Release_date', 'Box_office', 'Runtime' with corresponding 'release_date', 'revenue', 'runtime'
commom_movies_cmu_tmdb_cleaned['Release_date'] = commom_movies_cmu_tmdb_cleaned['Release_date'].combine_first(commom_movies_cmu_tmdb_cleaned['release_date'])
commom_movies_cmu_tmdb_cleaned['Box_office'] = commom_movies_cmu_tmdb_cleaned['Box_office'].combine_first(commom_movies_cmu_tmdb_cleaned['revenue'])
commom_movies_cmu_tmdb_cleaned['Runtime'] = commom_movies_cmu_tmdb_cleaned['Runtime'].combine_first(commom_movies_cmu_tmdb_cleaned['runtime'])

# Step 2: Check for equality
commom_movies_cmu_tmdb_cleaned['check_release_date'] = commom_movies_cmu_tmdb_cleaned['release_date'] == commom_movies_cmu_tmdb_cleaned['Release_date']
commom_movies_cmu_tmdb_cleaned['check_revenue'] = commom_movies_cmu_tmdb_cleaned['revenue'] == commom_movies_cmu_tmdb_cleaned['Box_office']
commom_movies_cmu_tmdb_cleaned['check_runtime'] = commom_movies_cmu_tmdb_cleaned['runtime'] == commom_movies_cmu_tmdb_cleaned['Runtime']

# Optional: Check if all columns match for each row
commom_movies_cmu_tmdb_cleaned['all_match'] = commom_movies_cmu_tmdb_cleaned[['check_release_date', 'check_revenue', 'check_runtime']].all(axis=1)

print(commom_movies_cmu_tmdb_cleaned[['check_release_date', 'check_revenue', 'check_runtime', 'all_match']])

        check_release_date  check_revenue  check_runtime  all_match
0                     True           True           True       True
1                     True           True          False      False
2                     True           True          False      False
3                     True           True           True       True
4                     True           True           True       True
...                    ...            ...            ...        ...
218023                True           True           True       True
218024                True          False           True      False
218044                True           True          False      False
218045                True           True          False      False
218047                True           True           True       True

[59359 rows x 4 columns]


In [79]:
(commom_movies_cmu_tmdb_cleaned['check_release_date'] == False).sum()

np.int64(5702)

In [80]:
(commom_movies_cmu_tmdb_cleaned['check_revenue'] == False).sum()

np.int64(4624)

In [81]:
(commom_movies_cmu_tmdb_cleaned['check_runtime'] == False).sum()

np.int64(23021)

In [116]:
sum_of_false = (commom_movies_cmu_tmdb_cleaned['all_match'] == False).sum()

In [108]:
for i, match in enumerate(commom_movies_cmu_tmdb_cleaned['all_match'][:200]):
    if match == False:
        print(commom_movies_cmu_tmdb_cleaned.iloc[i]['Movie_name'])
        print("release date ", commom_movies_cmu_tmdb_cleaned.iloc[i]['Release_date'], commom_movies_cmu_tmdb_cleaned.iloc[i]['release_date'])
        print("box office ", commom_movies_cmu_tmdb_cleaned.iloc[i]['Box_office'], commom_movies_cmu_tmdb_cleaned.iloc[i]['revenue'])
        print("runtime ", commom_movies_cmu_tmdb_cleaned.iloc[i]['Runtime'], commom_movies_cmu_tmdb_cleaned.iloc[i]['runtime'])
        movie_data = get_movie_data_by_id(commom_movies_cmu_tmdb_cleaned.iloc[i]['imdb_id'])
        print(movie_data)
        print("---------------")

Getting Away with Murder: The JonBenét Ramsey Mystery
release date  2000.0 2000.0
box office  0.0 0
runtime  95.0 60
{'title': 'Getting Away with Murder: The JonBenet Ramsey Mystery', 'runtime': ['95'], 'box_office': None, 'release_date': '2000'}
---------------
White Of The Eye
release date  1987.0 1987.0
box office  0.0 0
runtime  110.0 111
{'title': 'White of the Eye', 'runtime': ['110'], 'box_office': None, 'release_date': '1987'}
---------------
The Sorcerer's Apprentice
release date  2010.0 2010.0
box office  215283742.0 215283742
runtime  86.0 109
{'title': "The Sorcerer's Apprentice", 'runtime': ['109'], 'box_office': '$215,283,742', 'release_date': '2010'}
---------------
Alexander's Ragtime Band
release date  1938.0 1938.0
box office  3600000.0 4000000
runtime  106.0 106
{'title': "Alexander's Ragtime Band", 'runtime': ['106'], 'box_office': None, 'release_date': '1938'}
---------------
City of the Dead
release date  2019.0 2019.0
box office  0.0 0
runtime  76.0 57
None
-----

In [132]:
import numpy as np
import pandas as pd
from imdb import IMDb
from tqdm import tqdm

# Initialize IMDb instance
ia = IMDb()

# List to store problematic indices for later manual treatment
problematic_indices = []

# Function to get movie data using IMDb ID
def get_movie_data_by_id(imdb_id):
    # Check if imdb_id is a valid string (not NaN or empty)
    if pd.isna(imdb_id) or not isinstance(imdb_id, str) or imdb_id.strip() == "":
        return None  # Skip invalid IMDb IDs
    
    # Ensure imdb_id is a string and remove 'tt' prefix if present
    imdb_id = str(imdb_id).replace("tt", "")
    
    # Get movie details using the IMDb ID
    try:
        movie = ia.get_movie(imdb_id)
    except IMDb.IMDbError as e:
        print(f"Error fetching movie data for IMDb ID {imdb_id}: {e}")
        return None
    
    # Extract title, runtime, box office, and release date
    title = movie.get('title')
    runtime = movie.get('runtime')  # Runtime in minutes
    box_office = movie.get('box office', {}).get('Cumulative Worldwide Gross')
    
    # Try to get the release date
    release_date = movie.get('year')
    release_date = str(release_date) if release_date else None

    # Return the movie data
    return {
        'title': title,
        'runtime': runtime if runtime else "Data unavailable",
        'box_office': box_office if box_office else "Data unavailable",
        'release_date': release_date if release_date else "Data unavailable"
    }

# Iterate over the movies in your DataFrame
# for i, match in tqdm(enumerate(commom_movies_cmu_tmdb_cleaned['all_match'][:200]), total=200, ):
for i, match in enumerate(commom_movies_cmu_tmdb_cleaned['all_match']):
    if not match:  # If 'all_match' is False
        # Get the values from the relevant columns
        release_date_cmu = commom_movies_cmu_tmdb_cleaned.iloc[i]['Release_date']
        release_date_tmdb = commom_movies_cmu_tmdb_cleaned.iloc[i]['release_date']
        box_office_cmu = commom_movies_cmu_tmdb_cleaned.iloc[i]['Box_office']
        box_office_tmdb = commom_movies_cmu_tmdb_cleaned.iloc[i]['revenue']
        runtime_cmu = commom_movies_cmu_tmdb_cleaned.iloc[i]['Runtime']
        runtime_tmdb = commom_movies_cmu_tmdb_cleaned.iloc[i]['runtime']
        
        # Print the current movie name
        print(f"Movie number: {i}/{sum_of_false}")
        print(commom_movies_cmu_tmdb_cleaned.iloc[i]['Movie_name'])
        
        # Check and print release date
        if release_date_cmu != release_date_tmdb:
            print("release date mismatch:", release_date_cmu, release_date_tmdb)
            movie_data = get_movie_data_by_id(commom_movies_cmu_tmdb_cleaned.iloc[i]['imdb_id'])
            if movie_data:
                print(f"Updated release date from IMDb: {movie_data['release_date']}")
                release_date_imdb = movie_data['release_date']
                commom_movies_cmu_tmdb_cleaned.loc[i, 'Release_date'] = str(release_date_imdb)
                print("Updated release date: ", commom_movies_cmu_tmdb_cleaned.iloc[i]['Release_date'])
            else:
                problematic_indices.append(i)
        
        # Check and print box office
        if box_office_cmu != box_office_tmdb:
            print("box office mismatch:", box_office_cmu, box_office_tmdb)
            movie_data = get_movie_data_by_id(commom_movies_cmu_tmdb_cleaned.iloc[i]['imdb_id'])
            if movie_data:
                print(f"Updated box office from IMDb: {movie_data['box_office']}")
                box_office_imdb = movie_data['box_office']
                commom_movies_cmu_tmdb_cleaned.loc[i, 'Box_office'] = float(box_office_imdb)
                print("Updated box office: ", commom_movies_cmu_tmdb_cleaned.iloc[i]['Box_office'])
            else:
                problematic_indices.append(i)
        
        # Check and print runtime
        if runtime_cmu != runtime_tmdb:
            print("runtime mismatch:", runtime_cmu, runtime_tmdb)
            movie_data = get_movie_data_by_id(commom_movies_cmu_tmdb_cleaned.iloc[i]['imdb_id'])
            if movie_data:
                print(f"Updated runtime from IMDb: {movie_data['runtime']}")
                runtime_imdb = movie_data['runtime']
                commom_movies_cmu_tmdb_cleaned.loc[i, 'Runtime'] = float(runtime_imdb[0])
                print("Updated runtime: ", commom_movies_cmu_tmdb_cleaned.iloc[i]['Runtime'])
            else:
                problematic_indices.append(i)
        
        # Print final updated data
        
        
        
        print("---------------")

# After processing, print problematic indices for manual review
if problematic_indices:
    print("\nList of problematic indices that need manual review:")
    print(np.array(problematic_indices))
else:
    print("\nNo problematic indices found.")
commom_movies_cmu_tmdb_cleaned.to_csv('movies_data_cleaned.csv', index=False)

Movie number: 1/27397
Getting Away with Murder: The JonBenét Ramsey Mystery
runtime mismatch: 95.0 60
Updated runtime from IMDb: ['95']
Updated runtime:  95.0
---------------
Movie number: 2/27397
White Of The Eye
runtime mismatch: 110.0 111
Updated runtime from IMDb: ['110']
Updated runtime:  110.0
---------------
Movie number: 5/27397
The Sorcerer's Apprentice
runtime mismatch: 86.0 109
Updated runtime from IMDb: ['109']
Updated runtime:  86.0
---------------
Movie number: 6/27397
Alexander's Ragtime Band
box office mismatch: 3600000.0 4000000.0
Updated box office from IMDb: Data unavailable


ValueError: could not convert string to float: 'Data unavailable'

In [121]:
from imdb import IMDb

# Initialize IMDb instance
ia = IMDb()

# Function to get movie data
def get_movie_data(title):
    # Search for the movie title
    movies = ia.search_movie(title)
    
    if movies:
        # Get the first movie result
        movie_id = movies[0].movieID
        movie = ia.get_movie(movie_id)
        
        # Get the title, runtime, and box office
        title = movie.get('title')
        runtime = movie.get('runtime')  # Runtime in minutes
        box_office = movie.get('box office', {}).get('Cumulative Worldwide Gross')
        release_date = movie.get('release dates', [])
        
        return {
            'title': title,
            'runtime': runtime,
            'box_office': box_office
        }
    else:
        return None

def get_movie_data_by_id(imdb_id):
    # Remove 'tt' prefix if present
    if pd.isna(imdb_id) or not isinstance(imdb_id, str) or imdb_id.strip() == "":
        return None  # Skip invalid IMDb IDs
    imdb_id = str(imdb_id).replace("tt", "")
    
    # Get movie details using the IMDb ID
    movie = ia.get_movie(imdb_id)
    
    # Extract title, runtime, box office, and release date
    title = movie.get('title')
    runtime = movie.get('runtime')  # Runtime in minutes
    box_office = movie.get('box office', {}).get('Cumulative Worldwide Gross')
    release_date = movie.get('release dates', [])
    
    # Extract the first release date (if available)
    release_date = movie.get('year')
    release_date = str(release_date) if release_date else None
    
    return {
        'title': title,
        'runtime': runtime,
        'box_office': box_office,
        'release_date': release_date
    }

# Example usage
movie_data = get_movie_data("Inception")
print(movie_data)

{'title': 'Inception', 'runtime': ['148'], 'box_office': '$825,532,764, 06 Jan 2011'}


In [127]:
str(movie_data['title'])

'I'

In [20]:
nan_rows = commom_movies_cmu_tmdb[commom_movies_cmu_tmdb['Release_date'].isna() & commom_movies_cmu_tmdb['release_date'].isna()]

In [21]:
nan_rows

Unnamed: 0,Wikipedia_ID,Freebase_ID,Movie_name,Release_date,Box_office,Runtime,Movie_languages,Movie_countries,Movie_genres,normalized_name,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
189,28415406,/m/0crj1f3,The Last Trackers of the Outback,,,,"{""/m/02h40lc"": ""English Language""}","{""/m/0f8l9c"": ""France"", ""/m/0chghy"": ""Australia""}","{""/m/0jtdp"": ""Documentary""}",the last trackers of the outback,...,The Last Trackers of the Outback,"For millenniums, Aborigines used tracking to s...",0.600,,,,,,,
421,32972053,/m/0h541q3,Broken Chains,,,,{},{},"{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D...",broken chains,...,Broken Chains,An unflinching exposé on the racial wealth gap...,0.600,/c2eQ20dPmtJvhYpANcxwHmiBRJD.jpg,,Documentary,,,English,
1034,1393873,/m/04z2kf,Loopy De Loop,,,,{},{},"{""/m/05p553"": ""Comedy film"", ""/m/0hqxf"": ""Fami...",loopy de loop,...,Loopy De Loop,,0.000,,,,,,,
4100,6685838,/m/0gh6t8,Engagement,,,,{},"{""/m/0hzlz"": ""South Africa""}","{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D...",engagement,...,Engagement,,0.000,,,,,,,
4661,20904652,/m/05b4w4s,Prem Vivah,,,150.0,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/02l7c8"": ""Romance Film""}",prem vivah,...,Prem Vivah,,0.000,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210014,32501689,/m/0gmf5cb,The Return of Billy Jack,,,,{},{},"{""/m/02kdv5l"": ""Action""}",the return of billy jack,...,The Return of Billy Jack,Billy Jack returns to bring down a child porno...,1.728,/xOVgUMTlJ84jGd4uP6r1KXE8c6.jpg,,"Drama, Action, Crime",Billy Jack Enterprises,United States of America,English,"child pornography, mafia"
211337,28372388,/m/0crj3bg,Amar Praner Priya,,,160.0,{},{},"{""/m/06cvj"": ""Romantic comedy""}",amar praner priya,...,Amar Praner Priya,Prem is helpful to his friends and solves thei...,0.000,,,,,,,
211653,21901669,/m/05nztkz,Shoebite,,,,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/02l7c8"": ""Romance Film""}",shoebite,...,Shoebite,Shoebite is the story of a man in his early 60...,1.400,/zFyzTICg0EpVy8a5v5pUcF1ZzpE.jpg,,Drama,UTV Motion Pictures,India,Hindi,
215029,22963595,/m/063ych2,Chaitra,,,20.0,"{""/m/055qm"": ""Marathi Language"", ""/m/02h40lc"":...","{""/m/03rk0"": ""India""}","{""/m/02hmvc"": ""Short Film""}",chaitra,...,சைத்ரா,"Chaitra, Kathir’s mentally unstable housewife,...",0.600,/mL2V6glU3pMGR9jHt6VJUPIb1RJ.jpg,The Beginning of the End,"Horror, Thriller",mars productions (in),,Tamil,


In [140]:
movies_name = df_cmu['Movie_name'].unique()
len(movies_name)

75478

In [7]:
#df_netflix = pd.read_csv('./Netflix/movie_titles.csv', encoding="ISO-8859-1", header=None, names=['movie_id', 'year_of_release', 'title'],
#                         parse_dates=['year_of_release'],on_bad_lines='skip')

In [13]:
#movies_name_netflix = df_netflix['title'].unique()
#len(movies_name_netflix)

"""
import pandas as pd

# Assuming your DataFrame is named df and the movie ID column is named 'movie_id'
all_ids = set(range(1, 17771))  # Full range of IDs from 1 to 17770
existing_ids = set(df['movie_id'])  # Existing IDs in the DataFrame

# Find missing IDs
missing_ids = sorted(all_ids - existing_ids)

print("Missing IDs:", missing_ids)
"""

17026

In [142]:
movielens.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [146]:
movielens.head()

Unnamed: 0,movieId,title,genres,cleaned_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II


In [56]:
#df_imdb = pd.read_csv('IMdB/title.basics.tsv', sep='\t', encoding='utf-8', na_values=['\n'])

  df_imdb = pd.read_csv(


In [78]:
#a = pd.DataFrame(df_imdb['originalTitle'].unique(), columns=['originalTitle'])
#a.head()

Unnamed: 0,originalTitle
0,Carmencita
1,Le clown et ses chiens
2,Pauvre Pierrot
3,Un bon bock
4,Blacksmith Scene


In [75]:
#a.shape

(5060993, 1)

In [147]:


common_movies_cmu_movielens = pd.merge(common_movies_cmu_plot_summaries, movielens, on='normalized_name', how='inner')
#common_movies_cmu_netflix = pd.merge(common_movies_cmu_plot_summaries, df_netflix, on='normalized_name', how='inner')
#commom_movies_cmu_imdb = pd.merge(common_movies_cmu_plot_summaries, df_imdb, on='normalized_name', how='inner')
commom_movies_cmu_tmdb = pd.merge(common_movies_cmu_plot_summaries, df_tmdb, on='normalized_name', how='inner')


common_movies_cmu_movielens = common_movies_cmu_movielens.drop_duplicates(subset='normalized_name')
#common_movies_cmu_netflix = common_movies_cmu_netflix.drop_duplicates(subset='normalized_name')
#commom_movies_cmu_imdb = commom_movies_cmu_imdb.drop_duplicates(subset='normalized_name')
commom_movies_cmu_tmdb = commom_movies_cmu_tmdb.drop_duplicates(subset='normalized_name')

common_movies_cmu_movielens2 = pd.merge(df_cmu, movielens, on='normalized_name', how='inner')
#common_movies_cmu_netflix2 = pd.merge(df_cmu, df_netflix, on='normalized_name', how='inner')
#commom_movies_cmu_imdb2 = pd.merge(df_cmu, df_imdb, on='normalized_name', how='inner')
commom_movies_cmu_tmdb2 = pd.merge(df_cmu, df_tmdb, on='normalized_name', how='inner')


common_movies_cmu_movielens2 = common_movies_cmu_movielens2.drop_duplicates(subset='normalized_name')
#common_movies_cmu_netflix2 = common_movies_cmu_netflix2.drop_duplicates(subset='normalized_name')
#commom_movies_cmu_imdb2 = commom_movies_cmu_imdb2.drop_duplicates(subset='normalized_name')
commom_movies_cmu_tmdb2 = commom_movies_cmu_tmdb2.drop_duplicates(subset='normalized_name')

In [149]:
df_cmu.shape, df_netflix.shape, movielens.shape, df_imdb.shape, common_movies_cmu_plot_summaries.shape

((81741, 10), (17434, 4), (87585, 5), (11219540, 10), (42204, 11))

In [155]:
df_tmdb

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.710,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129222,729506,Love Lives,0.000,0,Released,,0,14,False,,...,Love Lives,"Filmed over several days in Sheffield, UK, thi...",0.600,,,,,,,
1129223,729508,Close your eyes to see better,0.000,0,Released,1986-01-01,0,23,False,,...,Die Augen schließen um besser zu sehen,This video was made in 1986 by residents of Ha...,0.600,,,,,,German,
1129224,729509,Men Who Don't Work,0.000,0,Released,2011-09-29,0,12,False,,...,Men Who Don't Work,When an unusual family moves into a house on h...,0.600,/eTVp4OyCrbNkKBsKz30Dtc7sVdG.jpg,,Drama,,United States of America,English,
1129225,729510,Torture in Stammheim prison,0.000,0,Released,2005-01-01,0,45,False,,...,Folter in Stammheim? Die Propaganda der RAF,When a law enforcement officer unravels the ce...,0.600,,,,,,German,


In [151]:
commom_movies_cmu_tmdb.head()

Unnamed: 0,Wikipedia_ID,Freebase_ID,Movie_name,Release_date,Box_office,Runtime,Movie_languages,Movie_countries,Movie_genres,normalized_name,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",ghosts of mars,...,Ghosts of Mars,"In 2176, a Martian police unit is sent to pick...",14.189,/i2zztssCIbahGES1fdfWFmDXian.jpg,Terror is the same on any planet.,"Action, Horror, Science Fiction","Animationwerks, Screen Gems, Storm King Produc...",United States of America,English,"future, planet mars, anti hero, possession, ho..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",white of the eye,...,White of the Eye,"In a wealthy and isolated desert community, a ...",8.297,/aLFoGmQpknOvcyx4imCP7Fuvoip.jpg,No woman is safe… while he is loose!,"Horror, Thriller",Mrs. White's Productions,United Kingdom,English,"based on novel or book, gas station, psychopat..."
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",a woman in flames,...,Die flambierte Frau,"Eva, an upper-class housewife, frustratedly le...",2.801,/iKeCVR0x8vnRXLtjwjZ7EuVFlo7.jpg,,Drama,Dieter Geissler Filmproduktion,Germany,German,"jealousy, eroticism, gigolo, longing, dominatr..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...",the sorcerers apprentice,...,The Sorcerer's Apprentice,Balthazar Blake is a master sorcerer in modern...,28.585,/b5pIUsGll0418NyfNA5eYCI9aoK.jpg,It's The Coolest Job Ever.,"Fantasy, Adventure, Action","Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,English,"witch, mission, magic, mystic, castle, sorcere..."
10,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...",little city,...,Little City,Best friends Adam and Kevin have a lot in comm...,1.556,/fIynQQgyxWCs0kNJ7YHKTRNwaZI.jpg,When your one and only... isn't the only one!,"Comedy, Romance","Bandeira Entertainment, Miramax",,English,


In [None]:
commom_movies_cmu_tmdb.drop(columns=['vote_average', 'vote_count', 'status', 'adult', 'backdrop_path'])

In [None]:
'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [105]:
df_plot_summaries = pd.read_csv('./MovieSummaries/MovieSummaries/plot_summaries.txt', sep='\t', header=None, names=['Wikipedia_ID', 'Plot'])

In [106]:
df_plot_summaries.head()

Unnamed: 0,Wikipedia_ID,Plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [108]:
common_movies_cmu_plot_summaries = pd.merge(df_cmu, df_plot_summaries, on='Wikipedia_ID', how='inner')
common_movies_cmu_plot_summaries = common_movies_cmu_plot_summaries.drop_duplicates(subset='Wikipedia_ID')
common_movies_cmu_plot_summaries.head()

Unnamed: 0,Wikipedia_ID,Freebase_ID,Movie_name,Release_date,Box_office,Runtime,Movie_languages,Movie_countries,Movie_genres,normalized_name,Plot
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",ghosts of mars,"Set in the second half of the 22nd century, th..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",white of the eye,A series of murders of rich young women throug...
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",a woman in flames,"Eva, an upper class housewife, becomes frustra..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...",the sorcerers apprentice,"Every hundred years, the evil Morgana returns..."
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...",little city,"Adam, a San Francisco-based artist who works a..."


In [37]:
# Get reviews
movielens_reviews = pd.read_csv('Movieslens/ml-32m/ratings.csv')

In [38]:
movielens_reviews.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [None]:
test = movielens_reviews.groupby(['userId', 'movieId'])
test

In [41]:
high_ratings = movielens_reviews[movielens_reviews['rating'] >= 4.0]


users_with_high_ratings = (
    high_ratings.groupby('movieId')
    .filter(lambda x: x['userId'].nunique() > 1)
)

print(users_with_high_ratings)

          userId  movieId  rating   timestamp
0              1       17     4.0   944249077
3              1       30     5.0   944249077
4              1       32     5.0   943228858
7              1       80     5.0   944248943
9              1      111     5.0   944249008
...          ...      ...     ...         ...
32000195  200948    72998     5.0  1350423889
32000196  200948    74458     4.5  1350423822
32000197  200948    76093     5.0  1287223498
32000199  200948    79702     4.5  1294412589
32000203  200948    87304     4.5  1350423523

[15921569 rows x 4 columns]


In [137]:
df_tmdb['keywords'][0]

'rescue, mission, dream, airplane, paris, france, virtual reality, kidnapping, philosophy, spy, allegory, manipulation, car crash, heist, memory, architecture, los angeles, california, dream world, subconscious'

In [132]:
df_tmdb.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,normalized_name
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",inception
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",interstellar
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",the dark knight
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",avatar
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",the avengers


## Find missing information in CMU using TMdB dataset 

In [117]:
df_cmu.head()

Unnamed: 0,Wikipedia_ID,Freebase_ID,Movie_name,Release_date,Box_office,Runtime,Movie_languages,Movie_countries,Movie_genres,normalized_name
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",ghosts of mars
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",getting away with murder the jonbenét ramsey m...
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",brun bitter
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",white of the eye
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",a woman in flames


In [119]:
df_cmu.isna().sum()

Wikipedia_ID           0
Freebase_ID            0
Movie_name             0
Release_date        6902
Box_office         73340
Runtime            20450
Movie_languages        0
Movie_countries        0
Movie_genres           0
normalized_name        0
dtype: int64

In [120]:
nan_indices_release_date = df_cmu[df_cmu['Box_office'].isna()].index

In [131]:
for i in nan_indices_release_date[:10]:
    name = normalize_movie_name(df_cmu.iloc[i]['Movie_name'])
    print(df_cmu.iloc[i]['Movie_name'])
    print(df_cmu.iloc[i]['Release_date'])
    print(df_tmdb[df_tmdb['normalized_name'] == name]['release_date'])
    print(df_tmdb[df_tmdb['normalized_name'] == name]['title'])
    print('--------------------')

The Mechanical Monsters
nan
31444    1941-11-28
Name: release_date, dtype: object
31444    The Mechanical Monsters
Name: title, dtype: object
--------------------
Boadicea
nan
385410    1966-09-22
896601    1927-09-05
Name: release_date, dtype: object
385410    Boadicea
896601    Boadicea
Name: title, dtype: object
--------------------
Les Indiens sont encore loin
nan
Series([], Name: release_date, dtype: object)
Series([], Name: title, dtype: object)
--------------------
Donald's Crime
nan
35909    1945-06-29
Name: release_date, dtype: object
35909    Donald's Crime
Name: title, dtype: object
--------------------
The Last Trackers of the Outback
nan
958010    NaN
Name: release_date, dtype: object
958010    The Last Trackers of the Outback
Name: title, dtype: object
--------------------
Mighty Mouse in the Great Space Chase
nan
130344    1982-12-10
Name: release_date, dtype: object
130344    Mighty Mouse in the Great Space Chase
Name: title, dtype: object
--------------------
Ethe Naas

In [124]:
df_tmdb

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,normalized_name
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",inception
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",interstellar
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",the dark knight
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",avatar
4,24428,The Avengers,7.710,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",the avengers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129222,729506,Love Lives,0.000,0,Released,,0,14,False,,...,"Filmed over several days in Sheffield, UK, thi...",0.600,,,,,,,,love lives
1129223,729508,Close your eyes to see better,0.000,0,Released,1986-01-01,0,23,False,,...,This video was made in 1986 by residents of Ha...,0.600,,,,,,German,,close your eyes to see better
1129224,729509,Men Who Don't Work,0.000,0,Released,2011-09-29,0,12,False,,...,When an unusual family moves into a house on h...,0.600,/eTVp4OyCrbNkKBsKz30Dtc7sVdG.jpg,,Drama,,United States of America,English,,men who dont work
1129225,729510,Torture in Stammheim prison,0.000,0,Released,2005-01-01,0,45,False,,...,When a law enforcement officer unravels the ce...,0.600,,,,,,German,,torture in stammheim prison


In [160]:
df_tmdb['overview']

0          Cobb, a skilled thief who commits corporate es...
1          The adventures of a group of explorers who mak...
2          Batman raises the stakes in his war on crime. ...
3          In the 22nd century, a paraplegic Marine is di...
4          When an unexpected enemy emerges and threatens...
                                 ...                        
1129222    Filmed over several days in Sheffield, UK, thi...
1129223    This video was made in 1986 by residents of Ha...
1129224    When an unusual family moves into a house on h...
1129225    When a law enforcement officer unravels the ce...
1129226    In the outskirts of Palawan, a man hunts for i...
Name: overview, Length: 1129227, dtype: object

In [158]:
df_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [161]:
commom_movies_cmu_tmdb2.drop(columns=['vote_average', 'vote_count', 'status', 'adult', 'backdrop_path', 'homepage', 
                                     'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 
                                      'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages'])

Unnamed: 0,Wikipedia_ID,Freebase_ID,Movie_name,Release_date,Box_office,Runtime,Movie_languages,Movie_countries,Movie_genres,normalized_name,id,title,release_date,revenue,runtime,budget,keywords
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",ghosts of mars,10016,Ghosts of Mars,2001-08-24,14010832,98,28000000,"future, planet mars, anti hero, possession, ho..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",getting away with murder the jonbenét ramsey m...,784579,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,0,60,0,"colorado, jonbenet"
2,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",white of the eye,33592,White of the Eye,1987-06-19,0,111,0,"based on novel or book, gas station, psychopat..."
3,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",a woman in flames,11192,A Woman in Flames,1983-05-11,0,106,0,"jealousy, eroticism, gigolo, longing, dominatr..."
4,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",the gangsters,263493,The Gangsters,1913-05-29,0,35,0,"slapstick comedy, keystone kops"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218023,31353375,/m/0gkz70w,The Luck of the Navy,1927-11-22,,110.0,{},"{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/06ppq"": ""Silent ...",the luck of the navy,295963,The Luck of the Navy,1927-11-22,0,110,0,
218024,26482675,/m/0bbwngb,Eşrefpaşalılar,2010-03-05,1847671.0,,{},{},"{""/m/05p553"": ""Comedy film"", ""/m/07s9rl0"": ""Dr...",eşrefpaşalılar,50009,Eşrefpaşalılar,2010-03-05,0,103,0,
218044,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",mermaids the body found,117124,Mermaids: The Body Found,2011-03-19,0,82,0,"mermaid, mockumentary, fake documentary"
218045,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",knuckle,71771,Knuckle,2011-12-09,0,97,0,"sports, bare knuckle boxing, fistfight"


In [None]:
# TO DO 
# 1) Find missing values and correct wrong values using TMdB dataset
# 2) Work with 