<center>
<img src="https://www.themoviedb.org/assets/2/v4/logos/v2/blue_long_2-9665a76b1ae401a510ec1e0ca40ddcb3b0cfe45f1d51b77a308fea0845885648.svg" alt="TMDB Logo" class="center" height="50">

# TMDB Project Part 3

</center>

## Creating an SQL Database



### Import Libraries

In [None]:
import os
from urllib.parse import quote

import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

### Data Contributions

This data has been downloaded from The Movie Database (TMDB), which is a community-built movie and TV database.

Link to the source datasets (note: this URL is the IMDb datasets page, not TMDB): [Click Here](https://datasets.imdbws.com/)

### Import Libraries

In [None]:
import pandas as pd
import numpy as np

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

### Upload Data

In [None]:
# Load the compressed title basics file and show its first rows
basics_path = "Data/title_basics.csv.gz"
basics = pd.read_csv(basics_path, low_memory=False)
basics.head()

In [None]:
# Load the compressed title ratings file and show its first rows
ratings_path = "Data/title_ratings.csv.gz"
ratings = pd.read_csv(ratings_path, low_memory=False)
ratings.head()

In [None]:
# Load the combined TMDB API results file and show its first rows
api_results_path = "Data/tmdb_results_combined.csv.gz"
api_results = pd.read_csv(api_results_path, low_memory=False)
api_results.head()

## Filtering Data for Database

### Title Basics Required Information

- Movie ID (tconst)
- Primary Title
- Start Year
- Runtime (in Minutes)
- Genres

In [None]:
# Summarize the columns, dtypes and non-null counts of basics before filtering
basics.info()

In [None]:
# Drop the columns that are not part of the required title_basics information.
basics_columns_todrop = ['titleType', 'originalTitle', 'isAdult', 'endYear']
# drop() is a method call (parentheses, not brackets). Reassigning instead of
# using inplace=True keeps the step explicit, and errors='ignore' lets the cell
# be re-run after the columns are already gone.
basics = basics.drop(columns=basics_columns_todrop, errors='ignore')

In [None]:
# Turn the comma-separated genres string into a list, one entry per genre
genre_lists = basics['genres'].str.split(',')
basics['genres_split'] = genre_lists
basics

In [None]:
# Give every element of each genres_split list its own row
exploded_genres = basics.explode(column='genres_split')
exploded_genres

In [None]:
# Alphabetical list of the distinct genre names.
# Missing genres (NaN) are dropped first: sorted() cannot compare a float NaN
# with str values and would raise TypeError.
unique_genres = sorted(exploded_genres['genres_split'].dropna().unique())

In [None]:
# Title-to-genre link table: keep only the title id and the single genre of each row
link_columns = ['tconst', 'genres_split']
title_genres = exploded_genres[link_columns].copy()
title_genres

In [None]:
## Build the lookup from genre name to integer id (position in unique_genres)
genre_ints = range(len(unique_genres))
genre_map = {genre: genre_id for genre_id, genre in enumerate(unique_genres)}
genre_map

In [None]:
## Make the new integer genre_id column and drop the string genres
title_genres['genre_id'] = title_genres['genres_split'].map(genre_map)
# drop(..., inplace=True) returns None, so assigning its result back would
# replace title_genres with None. Reassign the returned frame instead.
title_genres = title_genres.drop(columns='genres_split')
title_genres.head()

In [None]:
# Lookup table with one row per genre: its name and its integer id.
# The class is pd.DataFrame (capital F). The dict views are wrapped in list()
# because some older pandas versions reject dict_keys/dict_values as column data.
genres = pd.DataFrame({
    'Genre Name': list(genre_map.keys()),
    'Genre ID': list(genre_map.values()),
})
genres.head()

In [None]:
# Drop genres_split and genres from basics; genres are now stored in title_genres.
# drop(..., inplace=True) returns None, so assigning its result back would
# replace basics with None. Reassign the returned frame instead.
basics = basics.drop(columns=['genres_split', 'genres'])
basics.head()

### Ratings Required Information

- Movie ID (tconst)
- Average Movie Rating
- Number of Votes

In [None]:
# Summarize the columns, dtypes and non-null counts of ratings
ratings.info()

### Creating SQL Database

Table Requirements:
- title_basics
- title_ratings
- title_genres
- genres
- tmdb_data

### Create Database

In [None]:
# Create the connection string using credentials, following this format:
# connection = "dialect+driver://username:password@host:port/database"
# Credentials come from the MYSQL_USER / MYSQL_PASSWORD environment variables so
# a real password never has to be saved in the notebook; the fallbacks are the
# original local defaults.
username = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "root")
db_name = "movies"

# Percent-encode the credentials: characters such as "@", ":" or "/" would
# otherwise be read as URL delimiters.
connection = (
    f"mysql+pymysql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@localhost/{db_name}"
)
engine = create_engine(connection)
engine

In [None]:
# Create the database named in the connection string, but only when it is
# missing: creating a database that already exists fails, which would break
# re-running the notebook.
if not database_exists(connection):
    create_database(connection)

In [None]:
# Verify that the database named in the connection string now exists
database_exists(connection)

### Add Tables to Database

In [None]:
# Inspect the column dtypes of basics before choosing SQL column types
basics.dtypes

In [None]:
# Longest tconst value, used to size the SQL string column (missing ids count as length 0)
tconst_filled = basics['tconst'].fillna('')
basics_len = tconst_filled.map(len).max()

In [None]:
# Import only the SQLAlchemy types used below instead of a wildcard import
from sqlalchemy.types import Float, Integer, String, Text

# Longest primaryTitle (missing titles count as length 0). The schema needs this
# value, so it is computed here rather than relying on a name defined elsewhere.
title_len = int(basics['primaryTitle'].fillna('').map(len).max())

## Create a schema dictionary using SQLAlchemy datatype objects
# endYear is omitted: it is listed in basics_columns_todrop, so it is not a
# column of the table being written.
basics_schema = {
    "tconst": String(int(basics_len) + 1),
    "primaryTitle": Text(title_len + 1),
    'startYear': Float(),
    'runtimeMinutes': Integer(),
}

In [None]:
# Write basics to the title_basics table using the explicit column types
basics.to_sql(
    'title_basics',
    con=engine,
    dtype=basics_schema,
    if_exists='replace',
    index=False,
)

In [None]:
# Inspect the column dtypes of ratings before uploading
ratings.dtypes

In [None]:
# Add the ratings table to the database.
# index=False keeps the DataFrame index from being written as an extra "index"
# column, matching how title_basics is uploaded.
ratings.to_sql('title_ratings', engine, if_exists='replace', index=False)

In [None]:
# Inspect the column dtypes of title_genres before uploading
title_genres.dtypes

In [None]:
# Add the title_genres table to the database.
# index=False matters here: after explode() the index repeats for titles with
# several genres, so writing it would add a meaningless "index" column.
title_genres.to_sql('title_genres', engine, if_exists='replace', index=False)

In [None]:
# Inspect the column dtypes of the genres lookup table before uploading
genres.dtypes

In [None]:
# Add the genres lookup table to the database.
# index=False keeps the DataFrame index from being written as an extra "index"
# column, matching how title_basics is uploaded.
genres.to_sql('genres', engine, if_exists='replace', index=False)

In [None]:
# Inspect the column dtypes of the TMDB API results before uploading
api_results.dtypes

In [None]:
# Add the TMDB API results to the database as the tmdb_data table.
# index=False keeps the DataFrame index from being written as an extra "index"
# column, matching how title_basics is uploaded.
api_results.to_sql('tmdb_data', engine, if_exists='replace', index=False)

In [None]:
# List the tables in the database to confirm the uploads
q = "SHOW TABLES;"
pd.read_sql(sql=q, con=engine)