# Gathering, cleaning, and storing data from the TMDB API

## Importing necessary libraries

In [11]:
import ast

import pandas as pd
import requests

from constants import api_key

## Getting the data

### Getting IDs from the first 100 pages of the **Top Rated Movies** list

In [12]:
def get_toprated(key, page, timeout=10):
    """Fetch one page of TMDB's top-rated movies listing.

    Parameters
    ----------
    key : str
        TMDB API key.
    page : int
        Page number to request.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict or str
        The decoded JSON body when the response status is 200, otherwise
        the string 'Error'.
    """
    # Let requests build and URL-encode the query string.
    response = requests.get(
        'https://api.themoviedb.org/3/movie/top_rated',
        params={'api_key': key, 'language': 'en-US', 'page': page},
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()
    return 'Error'

In [13]:
# Collect the movie id of every entry on the first 100 top-rated pages.
ids = []
for page in range(1, 101):
    toprated = get_toprated(api_key, page)
    if not isinstance(toprated, dict):
        # get_toprated returns the string 'Error' for a non-200 response;
        # indexing that string with 'results' would raise a TypeError.
        print(f'Skipping page {page}: request failed')
        continue
    for result in toprated.get('results', []):
        # Skip entries without an id instead of hiding every possible error.
        if 'id' in result:
            ids.append(result['id'])

In [14]:
len(ids)  # number of ids collected; note this counts entries and does not check for duplicates

2000

### Getting **details** of the 2000 highest-rated movies

In [15]:
def get_data(key, movie_id, timeout=10):
    """Fetch the details record of a single movie from TMDB.

    Parameters
    ----------
    key : str
        TMDB API key.
    movie_id : int
        TMDB id of the movie to look up.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict or str
        The decoded JSON body when the response status is 200, otherwise
        the string 'Error'.
    """
    # Let requests build and URL-encode the query string.
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}',
        params={'api_key': key},
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()
    return 'Error'

In [16]:
# Seed the dataframe with the details of the first id; the remaining movies are added to it later
details = get_data(api_key, ids[0])
df = pd.DataFrame.from_dict(details, orient='index').T
df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/5hNcsnMkwU2LknLoru73c76el3z.jpg,,13200000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,19404,tt0112870,hi,दिलवाले दुल्हनिया ले जायेंगे,...,1995-10-20,100000000,190,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,"Come Fall In love, All Over Again..",Dilwale Dulhania Le Jayenge,False,8.8,3181


In [17]:
# Getting movie details from our id list.
# One-row frames are collected in a list and concatenated once at the end:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row copies it on every iteration.
frames = [df]
failed_ids = []  # ids whose details could not be fetched, kept for inspection
for movie_id in ids[1:]:
    try:
        result = get_data(api_key, movie_id)
    except (requests.RequestException, ValueError):
        # Network failure or undecodable body: note the id and keep going
        failed_ids.append(movie_id)
        continue
    if not isinstance(result, dict):
        # get_data returns the string 'Error' for a non-200 response
        failed_ids.append(movie_id)
        continue
    frames.append(pd.DataFrame.from_dict(result, orient='index').transpose())
df = pd.concat(frames, ignore_index=True)

In [18]:
len(df) == len(ids)  # one details row was stored for every id we requested

True

### Backup the data we got so far

In [19]:
# Snapshot the raw API results to disk before any cleaning
df.to_csv(path_or_buf='dirty2000.csv')

## Let's start cleaning the data

In [20]:
# Reload the backup written by this notebook ('dirty2000.csv'). The previous
# name, 'top2000.csv', is never created here, so a fresh Restart & Run All
# would fail or silently load a stale file.
movies = pd.read_csv('dirty2000.csv')

In [21]:
# Inspect column dtypes and non-null counts before cleaning
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             2000 non-null   int64  
 1   adult                  2000 non-null   bool   
 2   backdrop_path          1998 non-null   object 
 3   belongs_to_collection  400 non-null    object 
 4   budget                 2000 non-null   int64  
 5   genres                 2000 non-null   object 
 6   homepage               782 non-null    object 
 7   id                     2000 non-null   int64  
 8   imdb_id                2000 non-null   object 
 9   original_language      2000 non-null   object 
 10  original_title         2000 non-null   object 
 11  overview               2000 non-null   object 
 12  popularity             2000 non-null   float64
 13  poster_path            2000 non-null   object 
 14  production_companies   2000 non-null   object 
 15  prod

In [22]:
# Keep only non-adult titles and the columns used downstream, indexed and ordered by movie id
keep_columns = [
    'id', 'title', 'genres', 'belongs_to_collection', 'vote_average',
    'vote_count', 'budget', 'revenue', 'popularity', 'production_companies',
    'production_countries', 'release_date', 'runtime', 'spoken_languages',
]
movies = (
    movies.loc[movies['adult'].eq(False), keep_columns]
    .set_index('id')
    .sort_index()
)

## Unpacking values in certain categories

In [23]:
# Some columns have their own dictionaries inside them. Let's unpack them to a readable format
def unpacker(text):
    """Flatten a nested TMDB field into a comma-separated string of names.

    Parameters
    ----------
    text : str, list or dict
        Either the string form of a list of dicts / a single dict (as found
        after a CSV round trip), or the already-parsed list / dict itself.
        Every dict must carry a 'name' key.

    Returns
    -------
    str
        The 'name' values joined with ', '. An empty list gives ''.

    Raises
    ------
    ValueError
        If ``text`` is neither a string, a list nor a dict (for example NaN),
        or if the string is not a valid Python literal.
    """
    if isinstance(text, str):
        # literal_eval only evaluates Python literals, never arbitrary code
        parsed = ast.literal_eval(text)
    elif isinstance(text, (list, dict)):
        # Already parsed, e.g. when the data never went through a CSV file
        parsed = text
    else:
        raise ValueError(
            f'cannot unpack value of type {type(text).__name__}: {text!r}'
        )

    if isinstance(parsed, list):
        names = [entry['name'] for entry in parsed]
    else:
        # A single object (such as a collection) rather than a list of them
        names = [parsed['name']]
    return ', '.join(names)

In [24]:
# A missing "belongs_to_collection" is not really missing data: the movie simply
# is not part of any collection. Substitute a placeholder the unpacker can parse.
fill = "[{'name':'No'}]"
movies['belongs_to_collection'] = movies['belongs_to_collection'].fillna(fill)

# Flatten every nested column into a comma-separated string of names
for column in ['belongs_to_collection', 'production_companies',
               'production_countries', 'spoken_languages', 'genres']:
    movies[column] = movies[column].map(unpacker)

# Store the release date as a datetime column for functionality
movies['release_date'] = movies['release_date'].astype('datetime64[ns]')

In [25]:
# Preview the first rows of the cleaned frame
movies.head()

Unnamed: 0_level_0,title,genres,belongs_to_collection,vote_average,vote_count,budget,revenue,popularity,production_companies,production_countries,release_date,runtime,spoken_languages
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
11,Star Wars,"Adventure, Action, Science Fiction",Star Wars Collection,8.2,16077,11000000,775398007,61.158,"Lucasfilm Ltd., 20th Century Fox",United States of America,1977-05-25,121,English
12,Finding Nemo,"Animation, Family",Finding Nemo Collection,7.8,15507,94000000,940335536,84.129,Pixar,United States of America,2003-05-30,100,English
13,Forrest Gump,"Comedy, Drama, Romance",No,8.5,21346,55000000,677387716,47.654,"Paramount, The Steve Tisch Company",United States of America,1994-07-06,142,English
14,American Beauty,Drama,No,8.0,9666,15000000,356296601,23.156,"Jinks/Cohen Company, DreamWorks Pictures",United States of America,1999-09-15,122,English
15,Citizen Kane,"Mystery, Drama",No,8.0,4073,839727,23218000,17.939,"Mercury Productions, RKO Radio Pictures",United States of America,1941-04-17,119,English


In [26]:
# Persist the cleaned table; 'id' is the index here, so it is written as the first CSV column
movies.to_csv(path_or_buf='cleaned2000.csv')