In [1]:
import pandas as pd

import os
from pathlib import Path

import sqlite3 as sql

### Establish Connection to uncleaned (raw) data:

In [2]:
# Locate the raw scraped database: <parent of cwd>/yify_collect_data/data/movie.db
project_path = Path(os.getcwd()).parent.absolute()
raw_data_path = project_path/'yify_collect_data'/'data'/'movie.db'

# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only surface later as a confusing query error. Fail early instead.
if not raw_data_path.is_file():
    raise FileNotFoundError(f"Raw movie database not found: {raw_data_path}")

raw_conn = sql.connect(raw_data_path)
try:
    # Load the whole `movie` table into memory so the connection can be released.
    raw_db = pd.read_sql("SELECT * from movie", raw_conn)
finally:
    # Release the connection even when the query fails.
    raw_conn.close()

In [3]:
# Preview the first rows of the raw table to check that it loaded as expected.
raw_db.head()

Unnamed: 0,title,year,genre,rating,href
0,The Founding of a Republic,2009,Action / Drama / History / War,4.9,https://yts.mx/movies/the-founding-of-a-republ...
1,Nude Tuesday,2022,Comedy / Drama / Romance,6.7,https://yts.mx/movies/nude-tuesday-2022
2,Tell Them Willie Boy Is Here,1969,Action / Drama / Western,6.3,https://yts.mx/movies/tell-them-willie-boy-is-...
3,Christopher Columbus: The Discovery,1992,Action / Adventure / Biography / Drama / History,4.4,https://yts.mx/movies/christopher-columbus-the...
4,Hyena,2014,Action / Crime / Drama / Thriller,6.2,https://yts.mx/movies/hyena-2014


#### About Data:
- title: Name of movie
- year: Year of release
- genre: Genre(s) associated with the movie, separated by " / "
- rating: Overall rating score of the movie
- href: Link to movie on yts site

# Clean raw database:

In [4]:
def get_unique_genre(genre_column):
    """Collect the distinct genre names found in a column of '/'-separated strings.

    Every entry is split on '/', each piece is stripped of surrounding
    whitespace, and the distinct pieces are returned as a list in ascending
    sorted order.
    """
    distinct = {
        piece.strip()
        for entry in genre_column
        for piece in entry.split('/')
    }
    ordered = list(distinct)
    ordered.sort()
    return ordered


def remove_null(data, null_count=0):
    """Return a copy of `data` without the rows that hold too many null values.

    A row is removed when its number of null cells is greater than or equal to
    `null_count`; all other rows are kept in their original order with their
    original index labels. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    null_count : int, default 0
        Minimum number of nulls in a row for it to be removed. Note that with
        the default of 0 every row qualifies, so the result is empty; pass 1
        to drop rows containing any null.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the rows with fewer than `null_count` nulls.
    """
    nulls_per_row = data.isnull().sum(axis=1)
    # Filter with a positional boolean mask rather than dropping by index label:
    # when the index is not unique (e.g. after DataFrame.explode), a label-based
    # drop would also remove clean rows that share a label with a null row.
    keep = (nulls_per_row < null_count).to_numpy()
    return data.loc[keep].copy(deep=True)


# Work on a copy so the raw frame displayed above stays untouched.
df = raw_db.copy()

# Convert genre to a list of strings
df['genre'] = [genre.strip().split('/') for genre in df['genre']]

# Expand genre such that each row has one genre only
# (the other columns and the index label are repeated for every genre).
df = df.explode('genre')

# Normalise genre labels: hyphens become spaces, surrounding whitespace is
# removed and everything is lower-cased.
df['genre'] = [genre.replace('-', ' ').strip().lower() for genre in df['genre']]

# Sorted unique genre values
unique_genre = get_unique_genre(df['genre'])

# Map each genre name to an integer id (its position in the sorted list)
genre_dict = {name: genre_id for genre_id, name in enumerate(unique_genre)}

# Apply map to dataframe and rename the column to reflect its new content
df['genre'] = [genre_dict[genre] for genre in df['genre']]
df = df.rename(columns={'genre': 'genre_id'})

# Handle missing values: the literal string 'None' is treated as missing.
# The dict form is deliberate: with a list/scalar `to_replace` and value=None,
# older pandas versions forward-fill from the previous row instead of
# inserting a null.
# NOTE(review): a genre of 'None' has already been lower-cased and mapped to an
# id above, so it is not caught here. Confirm whether that is intended.
df = df.replace({'None': None})
df = remove_null(df, null_count=1)

# Remove movies that generated a 404 Error
df = df[df['title'] != '404 Error']

# Reset index last so the final frame has a gap-free 0..n-1 index
df = df.reset_index(drop=True)


# -------------------------------------------------------------------------------------------------------------------
# Create new dataframes from cleaned df

# movie: one row per distinct (title, href) pair. The fresh 0..n-1 index
# serves as the movie_id.
movie = df.drop(columns='genre_id')
movie = movie.drop_duplicates(['title', 'href']).reset_index(drop=True)
movie.index.name = "movie_id"

# genre: lookup table pairing each genre_id with its genre name
genre = pd.DataFrame({'genre_id': list(genre_dict.values()),
                      'genre': list(genre_dict.keys())})

# movie_genre_map: one row per (movie, genre) pair.
# The lookup is keyed on (title, href), the same key used to deduplicate
# `movie`. Keying on the title alone would collapse different movies that
# share a title onto a single movie_id.
movie_genre_map_dict = {(movie_title, movie_href): movie_id
                        for movie_id, movie_title, movie_href
                        in zip(movie.index, movie['title'], movie['href'])}
movie_genre_map = df[['genre_id']].copy()
# Direct indexing (rather than .get) so an unmatched row raises instead of
# silently producing a null movie_id.
movie_genre_map['movie_id'] = [movie_genre_map_dict[key]
                               for key in zip(df['title'], df['href'])]
# A movie listed more than once in `df` would otherwise repeat its pairs.
movie_genre_map = movie_genre_map.drop_duplicates().reset_index(drop=True)

---
#### Duplicate title names:
Some are different movies; others are the same movie recorded as two separate entries.  
~Notify yts site

In [6]:
# Flag every movie whose (title, year) pair occurs more than once; keep=False
# marks all members of each group, not just the later occurrences.
is_repeated = movie.duplicated(subset=['title', 'year'], keep=False)
duplicate_titles = movie.loc[is_repeated].sort_values(by='title')
duplicate_titles.head()

Unnamed: 0_level_0,title,year,rating,href
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22422,15+ Coming of Age,2017,5.7,https://yts.mx/movies/15-coming-of-age-2017
15042,15+ Coming of Age,2017,5.7,https://yts.mx/movies/15-coming-of-age-2-2017
27822,A Mouse Tale,2012,4.9,https://yts.mx/movies/a-mouse-tale-2-2012
13859,A Mouse Tale,2012,4.9,https://yts.mx/movies/a-mouse-tale-2012
13784,Apartment 407,2016,5.8,https://yts.mx/movies/apartment-407-2016


---
# Create and insert values into new database:

In [7]:
# The cleaned database is written to movie.db in the current working directory.
path = Path(os.getcwd())

conn = sql.connect(path/'movie.db')
try:
    cursor = conn.cursor()

    # Rebuild from scratch so re-running this cell does not append duplicates.
    # The mapping table is dropped first because it references the other two.
    cursor.execute("DROP TABLE IF EXISTS movie_genre_map;")
    cursor.execute("DROP TABLE IF EXISTS movie;")
    cursor.execute("DROP TABLE IF EXISTS genre;")

    # movie table
    # rating is declared REAL because ratings are decimal values (e.g. 6.7).
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS movie(
        movie_id INTEGER PRIMARY KEY,
        title TEXT NOT NULL,
        year INTEGER,
        rating REAL NOT NULL,
        href TEXT NOT NULL
        );""")

    # genre table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS genre(
        genre_id INTEGER PRIMARY KEY,
        genre TEXT NOT NULL
        );""")

    # movie_genre_map: links each movie to each of its genres
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS movie_genre_map(
        movie_id INTEGER NOT NULL,
        genre_id INTEGER NOT NULL,
        FOREIGN KEY(movie_id) REFERENCES movie(movie_id),
        FOREIGN KEY(genre_id) REFERENCES genre(genre_id)
        );""")

    # Insert values into new database.
    # `movie` writes its index because the index holds the movie_id.
    movie.to_sql('movie', conn, if_exists='append', index=True)
    genre.to_sql('genre', conn, if_exists='append', index=False)
    movie_genre_map.to_sql('movie_genre_map', conn, if_exists='append', index=False)

    # Explicit commit so nothing is left pending when the connection closes.
    conn.commit()
finally:
    # Release the database file even if table creation or an insert fails.
    conn.close()