# Importing Libraries and Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_credits = pd.read_csv('../raw_data/credits.csv')
# low_memory=False makes pandas infer one dtype per column from the whole file instead of
# chunk by chunk, which avoids columns holding a mix of parsed types.
# NOTE(review): the output echoed under this cell looks like a mixed-dtype warning - confirm.
df_movies = pd.read_csv('../raw_data/movies_dataset.csv', low_memory=False)

  df_movies = pd.read_csv('../raw_data/movies_dataset.csv')


# Movies

In [3]:
# Preview the first three rows of the movies table
df_movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
# Inspect the dtype pandas inferred for each column
df_movies.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

## Transforming Columns

In [5]:
import ast

# "belongs_to_collection" holds text; literal_eval parses the Python-literal string of the
# first row into a dictionary (only row 0 is parsed here, as a check).
ast.literal_eval(df_movies["belongs_to_collection"][0]) # parse the first row's string into a dictionary

{'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}

In [6]:
# Parse every non-null "belongs_to_collection" string into a Python object;
# null entries are passed through unchanged.
df_movies["collection"] = [
    ast.literal_eval(raw) if not pd.isnull(raw) else raw
    for raw in df_movies["belongs_to_collection"]
]

In [7]:
df_movies["collection"][0].get("name") # look up the first row's collection name by its "name" key

'Toy Story Collection'

In [8]:
# Keep only the collection name; rows that do not hold a parsed dictionary become NaN.
# This column replaces "belongs_to_collection".
df_movies["collection"] = [
    entry.get("name") if isinstance(entry, dict) else np.nan
    for entry in df_movies["collection"]
]

The following columns are lists of dictionaries, so they will need slightly different treatment.

In [9]:
# Parse every non-null "genres" string into a Python object; null entries pass through unchanged.
df_movies["genres_list"] = [
    ast.literal_eval(raw) if not pd.isnull(raw) else raw
    for raw in df_movies["genres"]
]

In [10]:
# First row after parsing: a list of genre dictionaries
df_movies["genres_list"][0] 

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [11]:
# Name of the first genre in the first row
df_movies["genres_list"][0][0]["name"]

'Animation'

In [12]:
 # For every row, reduce the list of genre dictionaries to a list of genre names.
# Rows whose value is not a list (e.g. NaN) become NaN instead of raising, which matches
# how the other *_list columns in this notebook are built.
df_movies["genres_list"] = [
    [genre["name"] for genre in element] if isinstance(element, list) else np.nan
    for element in df_movies["genres_list"]
]

Repeating this process for other columns with similar structure.

In [13]:
# Parse the remaining stringified columns into Python objects (nulls pass through unchanged).
# Keys are the new columns, values are the raw columns they are parsed from.
parsed_from = {
    "spoken_languages_list": "spoken_languages",
    "production_companies_list": "production_companies",
    "production_countries_list": "production_countries",
}

for target_column, raw_column in parsed_from.items():
    df_movies[target_column] = [
        ast.literal_eval(raw) if not pd.isnull(raw) else raw
        for raw in df_movies[raw_column]
    ]

In [14]:
# Reduce each list of language dictionaries to its "iso_639_1" codes;
# anything that is not a list becomes NaN.
df_movies["spoken_languages_list"] = [
    [language["iso_639_1"] for language in element] if isinstance(element, list) else np.nan
    for element in df_movies["spoken_languages_list"]
]

In [15]:
# Reduce each list of company dictionaries to the company names;
# anything that is not a list becomes NaN.
df_movies["production_companies_list"] = [
    [company["name"] for company in element] if isinstance(element, list) else np.nan
    for element in df_movies["production_companies_list"]
]

In [16]:
# Reduce each list of country dictionaries to its "iso_3166_1" codes;
# anything that is not a list becomes NaN.
df_movies["production_countries_list"] = [
    [country["iso_3166_1"] for country in element] if isinstance(element, list) else np.nan
    for element in df_movies["production_countries_list"]
]

Now we drop the columns we are not going to use.

In [17]:
# Columns removed from df_movies: unused fields plus the raw columns
# that were parsed into new columns above.
discard_columns = [
    "video",
    "imdb_id",
    "adult",
    "original_title",
    "poster_path",
    "homepage",
    "belongs_to_collection",
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
]

In [18]:
# Remove the columns listed in discard_columns
df_movies = df_movies.drop(columns=discard_columns)

## Handling Missing Values and Adding Columns

In [19]:
# Fraction of missing values in each column
df_movies.isna().mean()

budget                       0.000000
id                           0.000000
original_language            0.000242
overview                     0.020983
popularity                   0.000110
release_date                 0.001914
revenue                      0.000132
runtime                      0.005785
status                       0.001914
tagline                      0.551049
title                        0.000132
vote_average                 0.000132
vote_count                   0.000132
collection                   0.901223
genres_list                  0.000000
spoken_languages_list        0.000132
production_companies_list    0.000132
production_countries_list    0.000132
dtype: float64

In [20]:
# Some budget values contain non-digit characters, so the column cannot be cast to float directly.
# na=False counts entries for which .str.contains yields NaN (non-string values) as "no match",
# so the mask is strictly boolean and safe to index with.
df_movies[df_movies['budget'].str.contains(r'\D', na=False)] 

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list
19730,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,1997-08-20,104.0,Released,,1,,,,,,,,,"[Carousel Productions, Vision View Entertainme...",,,
29503,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,2012-09-29,68.0,Released,,12,,,,,,,,,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...",,,
35587,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,2014-01-01,82.0,Released,Beware Of Frost Bites,22,,,,,,,,,"[Odyssey Media, Pulser Productions, Rogue Stat...",,,


In [21]:
def convert_to_float(value):
    """Convert ``value`` to a float, mapping unconvertible input to NaN.

    Parameters
    ----------
    value : object
        Anything accepted by the built-in ``float`` (numbers, numeric strings, ...).

    Returns
    -------
    float
        ``float(value)``, or ``np.nan`` when ``float`` raises ``ValueError`` or
        ``TypeError`` (e.g. non-numeric text or ``None``).
    """
    try:
        number = float(value)
    except (ValueError, TypeError):
        # Unparseable entries are replaced by a null value instead of raising
        number = np.nan
    return number

In [22]:
# Run every budget entry through convert_to_float; entries it cannot convert become NaN
df_movies["budget"] = list(map(convert_to_float, df_movies["budget"]))

In [23]:
# Missing budget / revenue values are replaced by 0.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection is chained in-place assignment, which acts on an intermediate object and is not
# guaranteed to update df_movies (it has no effect under pandas Copy-on-Write).
df_movies["budget"] = df_movies["budget"].fillna(0)
df_movies["revenue"] = df_movies["revenue"].fillna(0)

In [24]:
df_movies['release_date'].sort_values() # sorting exposes entries (e.g. 1, 12) that cannot be converted to dates

19730             1
29503            12
34940    1874-12-09
34937    1878-06-14
41602    1883-11-19
            ...    
45148           NaN
45203           NaN
45338           NaN
45410           NaN
45461           NaN
Name: release_date, Length: 45466, dtype: object

In [37]:
def convert_to_datetime(value): 
    """Parse a ``YYYY-MM-DD`` value into a pandas timestamp.

    Parameters
    ----------
    value : object
        The value to parse, typically a date string such as ``"1995-10-30"``.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``pd.NaT`` when ``pd.to_datetime`` raises
        ``ValueError`` or ``TypeError`` (e.g. the text does not match the format).
    """
    try:
        return pd.to_datetime(value, format="%Y-%m-%d")
    except (ValueError, TypeError):
        # Use the datetime null marker rather than float NaN so the failure value
        # has the same type family as successful results.
        return pd.NaT

In [40]:
# Parse the whole column in one vectorized call. errors="coerce" turns missing values and
# entries that do not match YYYY-MM-DD into NaT instead of raising.
df_movies['release_date'] = pd.to_datetime(df_movies['release_date'], format="%Y-%m-%d", errors="coerce")
# Year component of the parsed date (null where the date could not be parsed)
df_movies["release_year"] = df_movies["release_date"].dt.year

In [32]:
# Remove only the rows whose "release_date" is null; nulls in other columns are kept
df_movies = df_movies.dropna(subset=['release_date'])

In [None]:
# Optional step, left disabled: uncomment if only the date part is needed.
# df_movies['release_date'] = df_movies['release_date'].dt.date

In [61]:
# "return" is revenue divided by budget; rows with a budget of 0 get a return of 0
has_budget = df_movies["budget"] != 0
revenue_per_budget = df_movies["revenue"] / df_movies["budget"]
df_movies["return"] = np.where(has_budget, revenue_per_budget, 0)

# Credits