In [1]:
import pandas as pd

import os
from pathlib import Path

import sqlite3 as sql

### Establish Connection to uncleaned (raw) data:

In [2]:
# Locate the raw database inside the sibling `yify_collect_data` directory
# (one level above the current working directory).
file_path = Path(os.getcwd()).parent.absolute()
raw_data_path = file_path / 'yify_collect_data' / 'data' / 'movie.db'

# sqlite3.connect() silently creates an empty database when the file does not
# exist, which would only surface later as a confusing "no such table: movie"
# error (and leave a stray empty file behind). Fail early with a clear message.
if not raw_data_path.is_file():
    raise FileNotFoundError(f"Raw movie database not found: {raw_data_path}")

raw_conn = sql.connect(raw_data_path)
raw_cursor = raw_conn.cursor()

# Load the whole raw `movie` table into memory for cleaning.
raw_db = pd.read_sql("SELECT * from movie", raw_conn)

---
# Create new cleaned database:

In [3]:
# The cleaned database file lives in the current working directory.
path = Path.cwd()

conn = sql.connect(path / 'movie.db')
cursor = conn.cursor()

In [4]:
# Movies table
# Drop any previous version first so re-running this cell starts from an empty table.
cursor.execute("""
    DROP TABLE IF EXISTS movies;
    """)

# genre_id is INTEGER so that it matches the type of the key it references
# (genres.genre_id), and rating is REAL because ratings are decimal values
# (e.g. 4.9), not integers.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS movies(
    record INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    year INTEGER,
    genre_id INTEGER NOT NULL,
    rating REAL NOT NULL,
    href TEXT NOT NULL,
    FOREIGN KEY(genre_id) REFERENCES genres(genre_id)
    );""")

<sqlite3.Cursor at 0x7fa4ec8e8ea0>

In [5]:
# Genres table
# Drop any previous version first so re-running this cell starts from an empty table.
cursor.execute("""
    DROP TABLE IF EXISTS genres;
    """)

# genres is the parent (lookup) table: movies.genre_id references genres.genre_id.
# It must not declare a foreign key back to movies(genre_id): that column is
# neither a PRIMARY KEY nor UNIQUE, so SQLite would report a "foreign key
# mismatch" as soon as foreign-key enforcement is switched on.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS genres(
    genre TEXT NOT NULL,
    genre_id INTEGER PRIMARY KEY AUTOINCREMENT
    );""")


<sqlite3.Cursor at 0x7fa4ec8e8ea0>

--- 
### Get the unique genres from the `genre` column

In [6]:
def get_unique_genre(genre_column):
    """Return the sorted list of distinct genre names found in ``genre_column``.

    Parameters
    ----------
    genre_column : iterable
        Entries are '/'-separated genre strings such as ``"Action / Drama"``.
        Entries that are not strings (e.g. ``None`` or ``NaN`` for a missing
        value) are skipped instead of raising ``AttributeError``.

    Returns
    -------
    list of str
        Every distinct genre, with surrounding whitespace removed, in sorted
        order.
    """
    # A set gives constant-time de-duplication; the result is sorted at the end.
    unique_genre = set()
    for genres in genre_column:
        if not isinstance(genres, str):
            continue
        unique_genre.update(genre.strip() for genre in genres.split('/'))
    return sorted(unique_genre)
 
    
# Sorted list of the distinct genre labels found in the raw data's `genre` column.
unique_genre = get_unique_genre(raw_db['genre'])

### Inserting clean values into database

In [7]:
# Normalise genre names for storage: hyphens become spaces, everything lower-case.
unique_genre_clean = [genre.replace('-', ' ').lower() for genre in unique_genre]

# Bind the values through a `?` placeholder instead of formatting them into the
# SQL text: a genre containing a quote character can then neither break the
# statement nor inject SQL, and the insert no longer depends on SQLite
# accepting a double-quoted token as a string literal.
cursor.executemany(
    "INSERT INTO genres(genre) VALUES (?);",
    [(genre,) for genre in unique_genre_clean],
)

conn.commit()

In [8]:
# TODO: find a better name for this function
def remove_row(data, null_count=0):
    """Return a copy of ``data`` without the rows that hold too many nulls.

    A row is removed when it contains ``null_count`` or more null values.
    The input frame is never modified.

    Rows are filtered with a boolean mask rather than dropped by index label,
    so a frame with duplicate index labels (for example after
    ``DataFrame.explode``) only loses the rows that actually exceed the
    threshold, not every row sharing their label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    null_count : int, default 0
        Minimum number of nulls in a row for it to be removed. Note that with
        the default of 0 every row qualifies, so the result is empty.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the rows with fewer than ``null_count`` nulls.
    """
    nulls_per_row = data.isnull().sum(axis=1)
    return data.loc[nulls_per_row < null_count].copy()

---
### Cleaning and structuring the database

In [11]:
df = raw_db.copy()

# Convert genre to a list of strings, trimming whitespace around every individual
# genre (not only around the whole cell) so the values agree with the labels
# produced by get_unique_genre().
df['genre'] = [[part.strip() for part in genre.split('/')] for genre in df['genre']]

# Expand genre such that each row has only one genre
df = df.explode('genre')

# Turn literal 'None' strings into real missing values. The dict form is used
# because df.replace('None', None) is version-dependent: older pandas releases
# treat value=None as "not given" and pad-fill from the previous row instead.
df = df.replace({'None': None})
df = remove_row(df, null_count=2)

# Convert genre to genre_id
## Add a dictionary to map genre -> genre_id

Unnamed: 0,title,year,genre,rating,href
0,The Founding of a Republic,2009,Action,4.9,https://yts.mx/movies/the-founding-of-a-republ...
0,The Founding of a Republic,2009,Drama,4.9,https://yts.mx/movies/the-founding-of-a-republ...
0,The Founding of a Republic,2009,History,4.9,https://yts.mx/movies/the-founding-of-a-republ...
0,The Founding of a Republic,2009,War,4.9,https://yts.mx/movies/the-founding-of-a-republ...
1,Nude Tuesday,2022,Comedy,6.7,https://yts.mx/movies/nude-tuesday-2022
...,...,...,...,...,...
43924,The Laramie Project,2002,Drama,7.1,https://yts.mx/movies/the-laramie-project-2002
43924,The Laramie Project,2002,History,7.1,https://yts.mx/movies/the-laramie-project-2002
43925,The Shadow Line,1976,Drama,6.2,https://yts.mx/movies/the-shadow-line-1976
43926,Incantation,2022,Horror,6.2,https://yts.mx/movies/incantation-2022
