# Data extraction

We have seen that data contains only `movieID`, `year` and `title`. It looks why too less to make recommendation for users. That's why we used themoviedb API to get more data related to movies we already have.
Link: https://developer.themoviedb.org/docs/getting-started

In [1]:
import sys, os
import pandas as pd

In [2]:
import requests
from tqdm import tqdm

In [3]:
movies_file = os.path.join(os.getcwd(), 'data', 'movie_titles.csv')

try:
    movie_titles = pd.read_csv(
        movies_file, 
        header=None, 
        names=['movieID', 'year', 'title'], 
        encoding='ISO-8859-1', 
        on_bad_lines='skip'
    )
    print(movie_titles.head(20))
except FileNotFoundError:
    print(f"File not found: {movies_file}")
    print("Please check the path and ensure the file exists.")
except UnicodeDecodeError as e:
    print(f"Encoding error: {e}")
except pd.errors.ParserError as e:
    print(f"Parser error: {e}")

print(movie_titles.head())
print(movie_titles.columns)
print(len(movie_titles))

    movieID    year                                              title
0   movieID    year                                              title
1         1  2003.0                                    Dinosaur Planet
2         2  2004.0                         Isle of Man TT 2004 Review
3         3  1997.0                                          Character
4         4  1994.0                       Paula Abdul's Get Up & Dance
5         5  2004.0                           The Rise and Fall of ECW
6         6  1997.0                                               Sick
7         7  1992.0                                              8 Man
8         8  2004.0                         What the #$*! Do We Know!?
9         9  1991.0                           Class of Nuke 'Em High 2
10       10  2001.0                                            Fighter
11       11  1999.0                     Full Frame: Documentary Shorts
12       12  1947.0                               My Favorite Brunette
13    

In [4]:
TMDB_API_KEY = 'Your secret key'
TMDB_API_URL = 'https://api.themoviedb.org/3'

In [None]:
def fetch_movie_details(title):
    try:
        search_response = requests.get(
            f"{TMDB_API_URL}/search/movie",
            params={"api_key": TMDB_API_KEY, "query": title}
        )
        search_response.raise_for_status()
        search_results = search_response.json().get('results')
        
        if not search_results:
            return {
                'genres': None,
                'runtime': None,
                'original_language': None,
                'popularity': None,
                'adult': None
            }
        
        movie_id = search_results[0].get('id')
        
        details_response = requests.get(
            f"{TMDB_API_URL}/movie/{movie_id}",
            params={"api_key": TMDB_API_KEY}
        )
        details_response.raise_for_status()
        details = details_response.json()
        
        genres = ', '.join([genre['name'] for genre in details.get('genres', [])])
        runtime = details.get('runtime')
        original_language = details.get('original_language')
        popularity = details.get('popularity')
        adult = details.get('adult')
        
        return {
            'genres': genres,
            'runtime': runtime,
            'original_language': original_language,
            'popularity': popularity,
            'adult': adult
        }
    
    except requests.exceptions.RequestException as e:
        print(f"Request error for {title}: {e}")
        return {
            'genres': None,
            'runtime': None,
            'original_language': None,
            'popularity': None,
            'adult': None
        }
        
tqdm.pandas(desc="Fetching movie details")
movie_details = movie_titles['title'].progress_apply(fetch_movie_details)

details_df = pd.DataFrame(movie_details.tolist())

movie_titles = pd.concat([movie_titles, details_df], axis=1)

movie_titles.to_csv('Data/movie_titles_with_details.csv', index=False)

From `tmdb` API we added `genres`,`runtime`,`original_language`,`popularity` and `adult` features.