In [47]:
# dependencies
import pandas as pd
import json
import ast
from datetime import datetime
import scipy.stats as st
import numpy as np

### Reading in data
<p>The <a href="https://www.kaggle.com/rounakbanik/the-movies-dataset?select=credits.csv">dataset</a> was obtained from Kaggle and consists of the following files:<p>

* **movies_metadata.csv**: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.

* **keywords.csv**: Contains the movie plot keywords for our MovieLens movies. Available in the form of a stringified JSON Object.

* **credits.csv**: Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.

* **links.csv**: The file that contains the TMDB and IMDB IDs of all the movies featured in the Full MovieLens dataset.

* **links_small.csv**: Contains the TMDB and IMDB IDs of a small subset of 9,000 movies of the Full Dataset.

* **ratings_small.csv**: The subset of 100,000 ratings from 700 users on 9,000 movies.

# Creating Production Table

### Exploring data

In [48]:
credits = pd.read_csv("Data/credits.csv")
keywords = pd.read_csv("Data/keywords.csv")
movies_meta = pd.read_csv("Data/movies_metadata.csv", low_memory=False)
ratings = pd.read_csv("Data/ratings.csv")
ratings_small = pd.read_csv("Data/ratings_small.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Data/credits.csv'

In [49]:
credits.head()

AttributeError: '_Printer' object has no attribute 'head'

In [50]:
keywords.head()

NameError: name 'keywords' is not defined

In [51]:
movies_meta.head()

NameError: name 'movies_meta' is not defined

In [52]:
# Review all available columns in movies metadata table
print(f"Number of columns:\t{len(movies_meta.columns)}")
print("Column names:", end="\t\t")
for column in movies_meta.columns:
    print(column, end=", ")

NameError: name 'movies_meta' is not defined

In [53]:
movies_meta.iloc[:,:11].head()

NameError: name 'movies_meta' is not defined

In [54]:
movies_meta.iloc[:,12:].head()

NameError: name 'movies_meta' is not defined

In [55]:
# See number of records in ratings data table
print(f"Total number of ratings records:\t {ratings.shape[0]}")

# View ratings table
ratings.head()

NameError: name 'ratings' is not defined

### Transforming data (convert ratings timestamp, production)

In [56]:
# Convert ratings timestamp to date

In [57]:
# Create custom function to convert time data
def convert_tstamp_to_date(x):
    y = datetime.fromtimestamp(x).strftime('%Y-%m-%d')
    return y

# Test if the function works
convert_tstamp_to_date(ratings['timestamp'][0])

NameError: name 'ratings' is not defined

In [58]:
ratings['date']=ratings['timestamp'].apply(convert_tstamp_to_date)
ratings.head()

NameError: name 'ratings' is not defined

In [59]:
# From the movie_meta df select and transform only columns 
# that include json format nestedentries

movie_meta_transformed = []

for i, row in movies_meta.iterrows():
    #     print(f"Processing ---  Row {i}")
    
    # Parsing collections data    
    coll_data = row['belongs_to_collection']
    try :
        coll_id   = json.loads(coll_data.replace("'",'"'))['id']
        coll_name = json.loads(coll_data.replace("'",'"'))['name']
        coll_poster = json.loads(coll_data.replace("'",'"'))['poster_path']
    except :
        coll_id=row['belongs_to_collection']
        coll_name=row['belongs_to_collection']
        coll_poster=row['belongs_to_collection']
    
    # Parsing production companies
    prod_comp_data = row['production_companies']
    
    try :
        prod_comp_name = []
        prod_comp_name_str = ""
        counter=0
        for comp_name in json.loads(prod_comp_data.replace("'",'"')):
            name=comp_name['name']
            prod_comp_name.append(name)
            if counter == 0:
                prod_comp_name_str += f"{name}"
                counter += 1
            else :
                prod_comp_name_str += f", {name}"
    except:
        prod_comp_name_str = row['production_companies']
    
    # Parsing production countries
    prod_count_data = row['production_countries']
    
    try :
        prod_count_name = []
        prod_count_name_str = ""
        counter=0
        for count_name in json.loads(prod_count_data.replace("'",'"')):
            name = count_name['name']
            abb  = count_name['iso_3166_1']
            prod_count_name.append(name)
            if counter == 0:
                prod_count_name_str += f"{name}"
                counter += 1
            else :
                prod_count_name_str += f", {name}"
    except :
        prod_count_name_str = row['production_countries']
        abb = ""
    
    # Pull row data into dictionary
    movies_dic={
        "movieId":           row['id'],
        "collectionId":      coll_id,
        "collectionName":    coll_name,
        'productionCompanies':
                             prod_comp_name_str,
        'productionCountries':
                             prod_count_name_str, 
        'countryAbb':        abb,
        # additionally parse out release year
        'releaseYear':       str(row['release_date']).split('-')[0]
    }
    
    movie_meta_transformed.append(movies_dic)

movie_meta_transformed = pd.DataFrame(movie_meta_transformed)
movie_meta_transformed.head()

NameError: name 'movies_meta' is not defined

In [60]:
pd.DataFrame(movie_meta_transformed.describe().loc['unique'])

AttributeError: 'list' object has no attribute 'describe'

In [61]:
# Extract all possible genres
all_genres=[]
for i, row in movies_meta.iterrows():
    #     print(f"Processing ---  Row {i}")
    
    # Parsing genres data
    genres_data = row['genres']
    
    try :
        for genre in json.loads(genres_data.replace("'",'"')):
            name = genre['name']
            
            if name not in all_genres:
                all_genres.append(name)
    except :
        print("hi")

len(all_genres)

NameError: name 'movies_meta' is not defined

In [62]:
# Extract all possible collections
all_collections=[]
for i, row in movies_meta.iterrows():
    #     print(f"Processing ---  Row {i}")
    
    # Parsing genres data
    coll_data = row['belongs_to_collection']
    
    try :
        name = json.loads(coll_data.replace("'",'"'))['name']
        if name not in all_collections:
            all_collections.append(name)
    except :
        all_collections = all_collections

len(all_collections)

NameError: name 'movies_meta' is not defined

# Creating Budget/Revenue Table

In [63]:
# budget and revenue data
data = pd.read_csv('Data/movies_metadata.csv')
data = pd.DataFrame(data)
data = data[['id','title','budget','revenue']]
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data/movies_metadata.csv'

# Creating Rating Table

In [64]:
# ratings file
rating = pd.read_csv('Data/ratings.csv')
rating.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Data/ratings.csv'

In [65]:
rating_df = pd.DataFrame(rating)
rating_df = rating[['movieId','rating']]
rating_df.head()

NameError: name 'rating' is not defined

In [66]:
rating_df.rename(columns={'movieId':'id'}, inplace=True,)
rating_df.head()

NameError: name 'rating_df' is not defined

In [67]:
# Average rating by movie id
rating_df = rating_df.groupby('id')['rating'].mean().reset_index()
rating_df

NameError: name 'rating_df' is not defined

In [68]:
# movies dataset
movies = pd.read_csv('Data/movies_metadata.csv')
movies = pd.DataFrame(movies)
movies = movies[['id','title','budget','revenue']]
movies.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Data/movies_metadata.csv'

In [69]:
# include a column to calculate the profit for each movie
movies["profit"] = ""
movies

NameError: name 'movies' is not defined

In [70]:
# cleaning the budget column
clean_movies = movies[movies["budget"] != "/ff9qCepilowshEtG2GYWwzt2bs4.jpg"]
clean_movies = clean_movies[clean_movies["budget"] != "/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg"]
clean_movies = clean_movies[clean_movies["budget"] != "/zaSf5OG7V8X8gqFvly88zDdRm46.jpg"]
clean_movies

NameError: name 'movies' is not defined

In [71]:
# Budget and revenue as float
clean_movies['budget'] = clean_movies['budget'].astype(float)
clean_movies['revenue'] = clean_movies['revenue'].astype(float)
clean_movies

NameError: name 'clean_movies' is not defined

In [72]:
# obtain the profit as the difference between revenue and budget
clean_movies['profit'] = clean_movies['revenue'] - clean_movies['budget']
clean_movies

NameError: name 'clean_movies' is not defined

In [73]:
# rating_df

In [74]:
# Identify the type
clean_movies.dtypes,rating_df.dtypes

NameError: name 'clean_movies' is not defined

In [75]:
# converting to the same type of variable
clean_movies['id'] = clean_movies['id'].astype('int64')

NameError: name 'clean_movies' is not defined

In [76]:
#same types
clean_movies.dtypes,rating_df.dtypes

NameError: name 'clean_movies' is not defined

In [77]:
# merge both cleanig dataframes

movie_rating = clean_movies.merge(rating_df, on='id', how='left')
movie_rating

NameError: name 'clean_movies' is not defined

In [78]:
print(ratings.count(), ratings['userId'].nunique())

NameError: name 'ratings' is not defined

# Creating Genres and Profit Tables

In [79]:
# Reference to CSV and reading CSV into Pandas DataFrame
csv_path1 = "Data/credits.csv"
csv_path2 = "Data/keywords.csv"
csv_path3 = "Data/links.csv"
csv_path4 = "Data/links_small.csv"
csv_path5 = "Data/movies_metadata.csv"
csv_path6 = "Data/ratings.csv"
csv_path7 = "Data/ratings_small.csv"


# merging final combined CSV meta file


credits_df = pd.read_csv(csv_path1)
keywords_df = pd.read_csv(csv_path2)
links_df = pd.read_csv(csv_path3)
links_small_df = pd.read_csv(csv_path4)
movies_metadata_df = pd.read_csv(csv_path5)
ratings_df = pd.read_csv(csv_path6)
ratings_small_df = pd.read_csv(csv_path7)


FileNotFoundError: [Errno 2] No such file or directory: 'Data/credits.csv'

In [80]:
movie_genres =movies_metadata_df["genres"]
movie_genres.head()


NameError: name 'movies_metadata_df' is not defined

In [81]:
def extract_names(list_genres):
    list_genres = json.loads(list_genres.replace("'",'"'))

    genres_names = []
    for item in list_genres:
        
        genres_names.append(item['name'])
    return ",".join(genres_names)

In [82]:
extract_names(movie_genres[0])

NameError: name 'movie_genres' is not defined

In [83]:
movie_genres.apply(extract_names)

NameError: name 'movie_genres' is not defined

In [84]:
#test = '[{"id": 16, "name": "Animation"}]'
test=movie_genres[0].replace("'",'"')
json.loads(test)


NameError: name 'movie_genres' is not defined

In [85]:
movies_metadata_df["genres"] =movie_genres.apply(extract_names)
movies_metadata_df.head()

NameError: name 'movie_genres' is not defined

In [86]:
movie_vote_df = movies_metadata_df[['id','title','genres','vote_average']]
movie_vote_df


NameError: name 'movies_metadata_df' is not defined

# Creating keyword table

In [87]:
movies = pd.read_csv('Data\movies_metadata.csv')
movies = pd.DataFrame(movies)
movies = movies[['id','title','overview','release_date']]
movies.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data\\movies_metadata.csv'

In [88]:
keywords = pd.read_csv('Data\keywords.csv')
keywords = pd.DataFrame(keywords)
keywords = keywords[['id','keywords']]
keywords.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data\\keywords.csv'

In [89]:
keyword1 = []
keyword2 = []
keyword3 = []
keys = keywords['keywords']
movie_id = list(keywords['id'])

for key in keys:

    words = key
    words = ast.literal_eval(words)
        
    try:
        words1 = words[0]['name']
    except:
        words1 = 'null'
        
    try:
        words2 = words[1]['name']
    except:
        words2 = 'null'
        
    try:
        words3 = words[2]['name']
    except:
        words3 = 'null'
        
    keyword1.append(words1)
    keyword2.append(words2)
    keyword3.append(words3)

keywords_df = {
    'movieid': movie_id,
    'keyword1': keyword1,
    'keyword2': keyword2,
    'keyword3': keyword3
}

keywords_df = pd.DataFrame(keywords_df)

keywords_df.head()

NameError: name 'keywords' is not defined

In [90]:
keywords_df.to_sql(name='keywords', con=engine, if_exists='append', index=False)

NameError: name 'keywords_df' is not defined

In [91]:
keywords_df.to_csv("keywords.csv", index=False)

NameError: name 'keywords_df' is not defined