# IMPORTS

In [1]:
import os
import csv
import gzip
import pickle
import shelve
import sqlite3

from tinymongo import TinyMongoClient
from tinymongo.errors import DuplicateKeyError

# DATASETS

In [2]:
# Directory passed as `root` to the loader; it is expected to hold the *.tsv.gz dumps.
IMDB = 'datasets/imdb'


# Sub-select spliced (via %-formatting) into the P1 extraction queries:
# the tconst of every non-adult title of type 'movie' whose startYear is 2017.
SUBQUERY = """
    SELECT tconst FROM basics
    WHERE isAdult = 0
    AND startYear = 2017
    AND titleType = 'movie'
"""

In [3]:
def read_gzip(root, file):
    """Stream the data rows of a gzip-compressed, tab-separated file.

    Args:
        root: Directory that contains the file.
        file: File name inside ``root``.

    Yields:
        One list of columns per data row. The first row is treated as a
        header and skipped. A column whose text is exactly a backslash
        followed by ``N`` is yielded as ``None``; every other column is
        yielded unchanged as ``str``. An empty file yields nothing.
    """
    path = os.path.join(root, file)

    # Decode explicitly as UTF-8 rather than with the platform default, and
    # pass newline="" so the csv module does its own line-ending handling.
    with gzip.open(path, "rt", encoding="utf-8", newline="") as r:
        # QUOTE_NONE: quote characters in the data are ordinary text.
        reader = csv.reader(r, delimiter="\t", quoting=csv.QUOTE_NONE)
        # Skip the header row. The None default keeps an empty file from
        # raising StopIteration inside the generator (RuntimeError, PEP 479).
        next(reader, None)

        for row in reader:
            yield [col if col != "\\N" else None for col in row]

# DATABASE

In [4]:
# Main SQLite database; create_tables/insert_values below fill it from the dumps under IMDB.
CONN = sqlite3.connect('datasets/imdb.sqlite')

In [5]:
def create_tables(conn):
    """Configure ``conn`` for bulk loading and create the schema if missing.

    Two steps, each followed by a commit:

    1. Set ``journal_mode`` and ``synchronous`` to ``off`` (faster writes,
       at the cost of crash safety).
    2. Create the tables ``names``, ``basics``, ``akas``, ``ratings``,
       ``episode``, ``writers``, ``directors`` and ``principals`` with
       ``CREATE TABLE IF NOT EXISTS``, so calling this again on an already
       initialised database leaves existing tables untouched.

    Args:
        conn: An open ``sqlite3`` connection.
    """
    print("[*] Setting options ...")
    conn.executescript(
        """
    PRAGMA journal_mode=off;
    PRAGMA synchronous=off;
        """
    )
    conn.commit()

    print("[*] Creating tables ...")
    # akas.title is declared TEXT: the former spelling "TEX" matched none of
    # SQLite's affinity keywords and so gave the column NUMERIC affinity,
    # turning numeric-looking titles into integers. Because of IF NOT EXISTS
    # the corrected type only applies to databases created from now on.
    conn.executescript(
        """
        CREATE TABLE IF NOT EXISTS names (
            nconst TEXT NOT NULL,
            primaryName TEXT NOT NULL,
            birthYear INTEGER,
            deathYear INTEGER,
            primaryProfession TEXT NOT NULL,
            knownForTitles TEXT,
            PRIMARY KEY(nconst)
        );
        CREATE TABLE IF NOT EXISTS basics (
            tconst TEXT NOT NULL,
            titleType TEXT NOT NULL,
            primaryTitle TEXT NOT NULL,
            originalTitle TEXT,
            isAdult BOOLEAN NOT NULL,
            startYear INTEGER,
            endYear INTEGER,
            runtimeMinutes INTEGER,
            genres TEXT,
            PRIMARY KEY(tconst)
        );
        CREATE TABLE IF NOT EXISTS akas (
            titleId TEXT NOT NULL,
            ordering INTEGER NOT NULL,
            title TEXT NOT NULL,
            region TEXT,
            language TEXT,
            types TEXT,
            attributes TEXT,
            isOriginalTitle BOOLEAN,
            PRIMARY KEY(titleId, ordering)
        );
        CREATE TABLE IF NOT EXISTS ratings (
            tconst TEXT NOT NULL,
            averageRating REAL NOT NULL,
            numVotes INTEGER,
            PRIMARY KEY(tconst)
        );
        CREATE TABLE IF NOT EXISTS episode (
            tconst TEXT NOT NULL,
            parentTconst TEXT NOT NULL,
            seasonNumber INTEGER,
            episodeNumber INTEGER,
            PRIMARY KEY(tconst, parentTconst)
        );
        CREATE TABLE IF NOT EXISTS writers (
            tconst TEXT NOT NULL,
            nconst TEXT NOT NULL,
            PRIMARY KEY(tconst, nconst)
        );
        CREATE TABLE IF NOT EXISTS directors (
            tconst TEXT NOT NULL,
            nconst TEXT NOT NULL,
            PRIMARY KEY(tconst, nconst)
        );
        CREATE TABLE IF NOT EXISTS principals (
            tconst TEXT NOT NULL,
            ordering INTEGER NOT NULL,
            nconst TEXT NOT NULL,
            category TEXT NOT NULL,
            job TEXT,
            characters TEXT,
            PRIMARY KEY(tconst, ordering)
        );
        """
    )
    conn.commit()


def insert_values(conn, root):
    """Load the gzip TSV dumps found under ``root`` into the tables of ``conn``.

    Each dump is read with ``read_gzip`` and committed on its own, in this
    order: names, basics, akas, ratings, episode, principals, crew. The
    first six map one row to one table row. The crew dump is split: its
    comma-separated ``writers`` and ``directors`` columns become one
    ``(tconst, nconst)`` row each in the ``writers`` / ``directors`` tables.

    Args:
        conn: An open ``sqlite3`` connection whose tables already exist.
        root: Directory that contains the ``*.tsv.gz`` files.

    Raises:
        sqlite3.Error: Propagated from the INSERT statements, e.g. when a
            primary key is already present or a row has the wrong width.
    """
    # (label used in the progress message, source file, INSERT statement)
    flat_sources = (
        ("names", "name.basics.tsv.gz", "INSERT INTO names VALUES (?,?,?,?,?,?)"),
        ("basics", "title.basics.tsv.gz", "INSERT INTO basics VALUES (?,?,?,?,?,?,?,?,?)"),
        ("akas", "title.akas.tsv.gz", "INSERT INTO akas VALUES (?,?,?,?,?,?,?,?)"),
        ("ratings", "title.ratings.tsv.gz", "INSERT INTO ratings VALUES (?,?,?)"),
        ("episode", "title.episode.tsv.gz", "INSERT INTO episode VALUES (?,?,?,?)"),
        ("principals", "title.principals.tsv.gz", "INSERT INTO principals VALUES (?,?,?,?,?,?)"),
    )

    for label, filename, statement in flat_sources:
        print("[*] Processing %s material ..." % label)
        # executemany consumes the generator lazily, so the file is still
        # streamed row by row without building a list in memory.
        conn.executemany(statement, read_gzip(root, filename))
        conn.commit()

    print("[*] Processing crew material ...")
    for tconst, directors, writers in read_gzip(root, "title.crew.tsv.gz"):
        # read_gzip maps the null marker to None; such a column adds no rows.
        if writers is not None:
            conn.executemany(
                "INSERT INTO writers VALUES (?,?)",
                ((tconst, writer) for writer in writers.split(",")),
            )

        if directors is not None:
            conn.executemany(
                "INSERT INTO directors VALUES (?,?)",
                ((tconst, director) for director in directors.split(",")),
            )
    conn.commit()

In [6]:
# Create the schema in the main database, then load every dump found under IMDB into it.
create_tables(CONN)
insert_values(CONN, IMDB)

[*] Setting options ...
[*] Creating tables ...
[*] Processing names material ...
[*] Processing basics material ...
[*] Processing akas material ...
[*] Processing ratings material ...
[*] Processing episode material ...
[*] Processing principals material ...
[*] Processing crew material ...


# P1. SQL REMINDERS

In [7]:
# Separate SQLite database for part 1; it receives the rows selected through SUBQUERY.
P1 = sqlite3.connect('datasets/p1.sqlite')

In [8]:
def insert_values_p1(inconn, outconn):
    """Copy the rows tied to the titles selected by SUBQUERY from one database to another.

    For each relation (names, basics, akas, ratings, writers, directors,
    principals) the matching rows are fetched from ``inconn`` in full,
    inserted one by one into the same-named table of ``outconn``, and then
    committed. ``names`` keeps the people referenced by the selected titles
    through writers, directors or principals.

    Args:
        inconn: Source ``sqlite3`` connection (read only here).
        outconn: Destination ``sqlite3`` connection whose tables already exist.
    """
    names_select = (
        """
        SELECT * FROM names WHERE nconst IN (
            SELECT nconst FROM writers WHERE tconst IN (%s)
            UNION SELECT nconst FROM directors WHERE tconst IN (%s)
            UNION SELECT nconst FROM principals WHERE tconst IN (%s)
        )
        """
        % (SUBQUERY, SUBQUERY, SUBQUERY)
    )

    # One entry per relation: (progress message, SELECT on the source, INSERT on the target).
    plan = [
        (
            "[*] Processing names relation ...",
            names_select,
            "INSERT INTO names VALUES (?,?,?,?,?,?)",
        ),
        (
            "[*] Processing basics relation ...",
            """SELECT * FROM basics WHERE tconst IN (%s)""" % (SUBQUERY,),
            "INSERT INTO basics VALUES (?,?,?,?,?,?,?,?,?)",
        ),
        (
            "[*] Processing akas relation ...",
            """SELECT * FROM akas WHERE titleId IN (%s)""" % (SUBQUERY,),
            "INSERT INTO akas VALUES (?,?,?,?,?,?,?,?)",
        ),
        (
            "[*] Processing ratings relation ...",
            """SELECT * FROM ratings WHERE tconst IN (%s)""" % (SUBQUERY,),
            "INSERT INTO ratings VALUES (?,?,?)",
        ),
        (
            "[*] Processing writers relation ...",
            """SELECT * FROM writers WHERE tconst IN (%s)""" % (SUBQUERY,),
            "INSERT INTO writers VALUES (?,?)",
        ),
        (
            "[*] Processing directors relation ...",
            """SELECT * FROM directors WHERE tconst IN (%s)""" % (SUBQUERY,),
            "INSERT INTO directors VALUES (?,?)",
        ),
        (
            "[*] Processing principals relation ...",
            """SELECT * FROM principals WHERE tconst IN (%s)""" % (SUBQUERY,),
            "INSERT INTO principals VALUES (?,?,?,?,?,?)",
        ),
    ]

    for message, select_sql, insert_sql in plan:
        print(message)
        # Materialise the result first, then write it out row by row.
        selected = inconn.execute(select_sql).fetchall()
        for record in selected:
            outconn.execute(insert_sql, record)
        outconn.commit()

In [9]:
create_tables(P1)
# insert_values_p1 reads from its first argument and writes to its second:
# the full database (CONN) is the source and the P1 subset is the target.
# With the arguments the other way round P1 stays empty.
insert_values_p1(CONN, P1)

[*] Setting options ...
[*] Creating tables ...
[*] Processing names relation ...
[*] Processing basics relation ...
[*] Processing akas relation ...
[*] Processing ratings relation ...
[*] Processing writers relation ...
[*] Processing directors relation ...
[*] Processing principals relation ...


# P2. SQL INTERNALS

In [10]:
# Binary file handle that receives the pickled part-2 snapshot; it is closed in the final cell.
P2 = open('datasets/p2.pickle', "wb")

In [11]:
def insert_values_p2(inconn, outconn):
    """Pickle a snapshot of seven relations read from ``inconn`` into ``outconn``.

    The snapshot is a dict keyed by relation name (names, basics, akas,
    ratings, writers, directors, principals, in that order); each value is
    the full ``fetchall()`` result of ``SELECT *`` on that relation.

    Args:
        inconn: Source ``sqlite3`` connection.
        outconn: Writable binary file object handed to ``pickle.dump``.
    """
    # (dict key, progress message, query) for every relation in the snapshot.
    steps = (
        ("names", "[*] Processing names relation ...", """SELECT * FROM names"""),
        ("basics", "[*] Processing basics relation ...", """SELECT * FROM basics"""),
        ("akas", "[*] Processing akas relation ...", """SELECT * FROM akas"""),
        ("ratings", "[*] Processing ratings relation ...", """SELECT * FROM ratings"""),
        ("writers", "[*] Processing writers relation ...", """SELECT * FROM writers"""),
        ("directors", "[*] Processing directors relation ...", """SELECT * FROM directors"""),
        ("principals", "[*] Processing principals relation ...", """SELECT * FROM principals"""),
    )

    snapshot = {}
    for key, message, query in steps:
        print(message)
        snapshot[key] = inconn.execute(query).fetchall()

    pickle.dump(snapshot, outconn)

In [12]:
# Snapshot the P1 subset database into the open pickle file.
insert_values_p2(P1, P2)

[*] Processing names relation ...
[*] Processing basics relation ...
[*] Processing akas relation ...
[*] Processing ratings relation ...
[*] Processing writers relation ...
[*] Processing directors relation ...
[*] Processing principals relation ...


# P3. NOSQL DATABASES

In [13]:
def insert_values_p3_mongo(inconn, outconn):
    """Export up to 10000 ``names`` rows as documents into ``outconn['p3'].names``.

    Rows are taken from ``inconn`` ordered by ``birthYear`` descending. Each
    becomes a dict keyed by ``_id`` (the nconst), ``primaryName``,
    ``birthYear``, ``deathYear``, ``knownForTitles`` and
    ``primaryProfession``; the last two are split on commas, or an empty
    list when the column is empty or NULL. All documents are sent in a
    single ``insert_many`` call.

    Args:
        inconn: Source ``sqlite3`` connection.
        outconn: Client object indexable by database name (a TinyMongoClient
            in this notebook).
    """
    database = outconn['p3']

    print("[*] Processing names relation with mongo...")
    people = inconn.execute(
        """SELECT * FROM names ORDER BY birthYear DESC LIMIT 10000"""
    ).fetchall()

    documents = [
        {
            "_id": nconst,
            "primaryName": name,
            "birthYear": born,
            "deathYear": died,
            "knownForTitles": known_for.split(",") if known_for else [],
            "primaryProfession": professions.split(",") if professions else [],
        }
        for nconst, name, born, died, professions, known_for in people
    ]
    database.names.insert_many(documents, bypass_document_validation=True)


def insert_values_p3_shelve(inconn, outconn):
    """Store every ``basics`` row of ``inconn`` in ``outconn`` keyed by its tconst.

    Each value is a dict holding the row's columns, with the tconst under
    ``_id`` and ``genres`` split on commas (an empty list when the column
    is empty or NULL).

    Args:
        inconn: Source ``sqlite3`` connection.
        outconn: Mapping that supports item assignment (a shelve in this notebook).
    """
    print("[*] Processing basics relation with shelve ...")
    titles = inconn.execute("""SELECT * FROM basics""").fetchall()

    for (tconst, kind, primary, original, adult, start, end, runtime, genres) in titles:
        if genres:
            genre_list = genres.split(",")
        else:
            genre_list = []

        document = {
            "_id": tconst,
            "titleType": kind,
            "primaryTitle": primary,
            "originalTitle": original,
            "isAdult": adult,
            "startYear": start,
            "endYear": end,
            "runtimeMinutes": runtime,
            "genres": genre_list,
        }
        outconn[tconst] = document

In [14]:
# Part-3 stores: a TinyMongo client on the datasets folder and a shelve opened with flag "c".
P3_MONGO = TinyMongoClient('datasets')
P3_SHELVE = shelve.open('datasets/p3.shelve', "c")

# NOTE(review): both exports read from CONN (the full database), not from P1 — confirm this is intended.
insert_values_p3_mongo(CONN, P3_MONGO)
insert_values_p3_shelve(CONN, P3_SHELVE)

[*] Processing names relation with mongo...
[*] Processing basics relation with shelve ...


In [16]:
# Close the main database first, then every part-specific connection, file and store.
CONN.close()

P1.close()
P2.close()
P3_MONGO.close()
P3_SHELVE.close()

# Bare expression: shown as the cell's output once all the close calls have returned.
"DONE"

'DONE'