# Imports

In [1]:
import pandas as pd
import numpy as np
import ast

# Loading the Data

In [3]:
messy_data = pd.read_csv(r"C:\Users\CPL17\OneDrive\Documents\Data\movies_metadata.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
messy_data[:1]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [6]:
messy_data.drop(axis=1,columns=['adult', 'imdb_id', 'original_title', 'video','homepage'],inplace=True)

The following have nested data:

* belongs_to_collection
* genres
* production_companies
* production_countries
* spoken_languages	

In [7]:
json_cols = ['belongs_to_collection','genres', 'production_companies', 'production_countries', 'spoken_languages']

# Transform JSON Columns

### Convert strings to python object

In [None]:
#Example column 
messy_data.genres[0] 

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [8]:
# Evaluate string to convert to python objects 

for col in  ['belongs_to_collection','genres', 'production_companies', 'production_countries', 'spoken_languages']:
  messy_data[col] = messy_data[col].apply(lambda x: ast.literal_eval(x) if isinstance(x,str) else np.nan)

In [11]:
messy_data.loc[0,["belongs_to_collection"]]

belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
Name: 0, dtype: object

### Flatten nested columns

In [None]:
# Structure of for each col in json_cols:
  print(col,type(messy_data[col][0]))

belongs_to_collection <class 'dict'>
genres <class 'list'>
production_companies <class 'list'>
production_countries <class 'list'>
spoken_languages <class 'list'>


In [12]:
## Dict
messy_data.belongs_to_collection = messy_data.belongs_to_collection.apply(lambda x: x['name'] if isinstance(x,dict) else np.nan)

## Lists

json_cols.remove('belongs_to_collection')
for col in json_cols:
  messy_data[col] = messy_data[col].apply(lambda x: "|".join(i['name'] for i in x) if isinstance(x,list) else np.nan)

messy_data.loc[0,["belongs_to_collection","genres"]]

belongs_to_collection       Toy Story Collection
genres                   Animation|Comedy|Family
Name: 0, dtype: object

In [None]:
# Empty strings not marked as missing in production_companies

messy_data.production_companies.value_counts(dropna = False).head(3) 

                             11875
Metro-Goldwyn-Mayer (MGM)      742
Warner Bros.                   540
Name: production_companies, dtype: int64

In [13]:
messy_data.production_companies.replace("", np.nan, inplace = True)

## Cleaning Numerical Columns

In [None]:
messy_data.dtypes

belongs_to_collection     object
budget                    object
genres                    object
id                        object
original_language         object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
vote_average             float64
vote_count               float64
dtype: object

### Deal with incorrectly entered data

In [None]:
# Problems with budget and id 

In [None]:
messy_data[messy_data.budget == '/ff9qCepilowshEtG2GYWwzt2bs4.jpg']

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
19730,,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,Carousel Productions|Vision View Entertainment...,1997-08-20,104.0,Released,,Midnight Man,,,1,,,,,,,,


In [None]:
messy_data.budget.str.contains('jpg').sum()

3

In [None]:
messy_data.id.str.contains('-').sum() 

3

In [19]:
# Assume they are the same entries

s = messy_data.id.str.contains('-')
messy_data = messy_data[~s]

### Convert columns and check dtype

In [30]:
# Converting budget, id and popularity

cols = ["budget","id","popularity"]

for col in cols:
    messy_data[col] = pd.to_numeric(messy_data[col],errors='coerce')
    print(messy_data[col].dtype)

int64
int64
float64


In [36]:
messy_data = messy_data[~(messy_data.popularity.isna())]

## Update values in budget, runtime and popularity with zero values

# Change runtime zeroes to mean of non-zero entries

s = messy_data.runtime
mean = s[s!=0].mean()

messy_data.loc[s == 0,'runtime'] = mean

### Use regression to predict budget and revenue

In [40]:
from sklearn.linear_model import LinearRegression

predictors = ['popularity','vote_count','vote_average']
predictor_df = messy_data[predictors].dropna() #To maintain consistency with response vector lengths

# Get a training df with no-nonzero entries 

for column in predictors:
  predictor_df = predictor_df[predictor_df[column]!=0]

train = predictor_df.sample(5000).values


In [41]:
# Function to predict zeroes of specified column 

def predict_zeros(column:str,data,predictors,n_train):

  col = cata[col]

  train_responses = col[col!=0].dropna().sample(n_train).values
  test = data.loc[col==0,predictors].values

  lm = LinearRegression()
  lm.fit(train,train_responses)

  revenue_pred = lm.predict(test)

  return revenue_pred

In [44]:
# Predict Revenue zero values 

revenue = messy_data['revenue']

train_responses = revenue[revenue!=0].dropna().sample(5000).values
test = messy_data.loc[revenue==0,predictors].values

lm = LinearRegression()
lm.fit(train,train_responses)

revenue_pred = lm.predict(test)

In [45]:
messy_data.loc[revenue==0,'revenue'] = revenue_pred

In [46]:
# Predict Budget 

budget = messy_data['budget']

train_responses = budget[budget!=0].dropna().sample(5000).values
test = messy_data.loc[budget==0,predictors].dropna().values
idx = messy_data.loc[budget==0,predictors].dropna().index

lm = LinearRegression()
lm.fit(train,train_responses)
budget_pred = lm.predict(test)

In [48]:
messy_data.loc[idx ,'budget'] = budget_pred

12. The columns "budget" and "revenue" shall show values in Million USD. __Convert and Overwrite__!

In [49]:
# Convert to units in millions 

messy_data.budget = messy_data.budget / 1000000
messy_data.revenue = messy_data.revenue / 1000000

In [None]:
# Some movies have a vote count of 0, can't have an average of a zero vote. 

messy_data.loc[messy_data.vote_count==0,'vote_average'] = np.nan 

## Cleaning DateTime Columns

In [50]:
messy_data.release_date = pd.to_datetime(messy_data.release_date, errors='coerce')

In [None]:
messy_data.release_date.dtype

dtype('<M8[ns]')

## Cleaning Text / String Columns

15. __Analyze__ the text columns "overview" and "tagline". Try to identify __missing data that is not represented by NaN__ (e.g. "No Data"). __Replace as NaN__ (np.nan)!

In [57]:
# Some missing values are enconded incorrectly 

messy_data.overview.value_counts(dropna = False).head(10) 

NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        1102
Adaptation of the Jane Austen novel.                                                                                                                                                                                                                                                                                                                                                                                                    

In [58]:
messy_data.overview.replace("No overview found.", np.nan, inplace = True)
messy_data.overview.replace("No Overview", np.nan, inplace = True)
messy_data.overview.replace("No movie overview available.", np.nan, inplace = True)
messy_data.overview.replace(" ", np.nan, inplace = True)
messy_data.tagline.replace("-", np.nan, inplace = True)

## Removing Duplicates

In [59]:
messy_data.drop_duplicates(subset = "id", inplace = True)

## Handling Other Missing Values & Removing Observations

In [60]:
# Drop all rows/movies with unknown id or title
messy_data.loc[:,["id", "title"]].dropna(inplace = True)

In [61]:
# Keep rows with less than 10 non-NAN values 
messy_data.dropna(thresh=10,inplace=True)

## Final (Cleaning) Steps

In [62]:
# Only keep already release movies 

df = messy_data.loc[messy_data.status == "Released"].copy()
df.drop(columns = ["status"], inplace = True)

In [63]:
# Rearange order of columns 

cols = ["id", "title", "tagline", "release_date", "genres", "belongs_to_collection", 
"original_language", "budget", "revenue", "production_companies",
"production_countries", "vote_count", "vote_average", "popularity", "runtime",
"overview", "spoken_languages", "poster_path"]

df = df.loc[:,cols]

In [64]:
# Reset index 

df.reset_index(drop = True, inplace =True)

In [67]:
# Update poster path 

base_poster_url = 'http://image.tmdb.org/t/p/w185/'
df.poster_path = "<img src='" + base_poster_url + df.poster_path + "' style='height:100px;'>"

In [71]:
df.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget,revenue,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path
0,862,Toy Story,,1995-10-30,Animation|Comedy|Family,Toy Story Collection,en,30.0,373.554033,Pixar Animation Studios,United States of America,5415.0,7.7,21.946943,81.0,"Led by Woody, Andy's toys live happily in his ...",English,<img src='http://image.tmdb.org/t/p/w185/<img ...
1,8844,Jumanji,Roll the dice and unleash the excitement!,1995-12-15,Adventure|Fantasy|Family,,en,65.0,262.797249,TriStar Pictures|Teitler Film|Interscope Commu...,United States of America,2413.0,6.9,17.015539,104.0,When siblings Judy and Peter discover an encha...,English|Français,<img src='http://image.tmdb.org/t/p/w185/<img ...
2,15602,Grumpier Old Men,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,Romance|Comedy,Grumpy Old Men Collection,en,21.476312,67.282594,Warner Bros.|Lancaster Gate,United States of America,92.0,6.5,11.7129,101.0,A family wedding reignites the ancient feud be...,English,<img src='http://image.tmdb.org/t/p/w185/<img ...
3,31357,Waiting to Exhale,Friends are the people who let you be yourself...,1995-12-22,Comedy|Drama|Romance,,en,16.0,81.452156,Twentieth Century Fox Film Corporation,United States of America,34.0,6.1,3.859495,127.0,"Cheated on, mistreated and stepped on, the wom...",English,<img src='http://image.tmdb.org/t/p/w185/<img ...
4,11862,Father of the Bride Part II,Just When His World Is Back To Normal... He's ...,1995-02-10,Comedy,Father of the Bride Collection,en,21.825495,76.578911,Sandollar Productions|Touchstone Pictures,United States of America,173.0,5.7,8.387519,106.0,Just when George Banks has recovered from his ...,English,<img src='http://image.tmdb.org/t/p/w185/<img ...
