In [1]:
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from pgvector.psycopg2 import register_vector
import pandas as pd

In [None]:
DB_NAME = "***"
USER = "**"
PASSWORD = "***"
HOST = "localhost"
PORT = "5432"

In [None]:
conn = psycopg2.connect(dbname="***", user=USER, password=PASSWORD, host=HOST, port=PORT)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()

In [4]:
# Create database if it does not exist
cur.execute(f"SELECT 1 FROM pg_database WHERE datname = '{DB_NAME}';")
exists = cur.fetchone()

In [5]:
if not exists:
    cur.execute(f"CREATE DATABASE {DB_NAME};")
    print(f"Database '{DB_NAME}' created.")
else:
    print(f"Database '{DB_NAME}' already exists.")

Database 'Nitec' already exists.


In [6]:
cur.close()
conn.close()

In [7]:
conn = psycopg2.connect(dbname=DB_NAME, user=USER, password=PASSWORD, host=HOST, port=PORT)
cur = conn.cursor()

In [8]:
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
conn.commit()

In [9]:
register_vector(conn)

print("Connected to database and pgvector extension is set up!")

Connected to database and pgvector extension is set up!


In [10]:
cur.execute("""
    CREATE TABLE IF NOT EXISTS movies (
        id SERIAL PRIMARY KEY,
        title TEXT,
        plot TEXT,
        genres TEXT,
        movie_cast TEXT,
        fullplot TEXT,
        countries TEXT,
        directors TEXT,
        rated TEXT,
        lastupdated TEXT,
        type TEXT,
        runtime TEXT,
        released BIGINT,
        awards_wins INT,
        awards_nominations INT,
        year INT,
        poster TEXT,
        languages TEXT,
        writers TEXT,
        merged_rating TEXT,
        merged_plot TEXT,
        embeddings VECTOR(384)
    );
""")
conn.commit()
print("Table 'movies' created successfully!")

Table 'movies' created successfully!


In [None]:
# conn.rollback()


In [11]:
df=pd.read_parquet('./processed_mflix_movies_embedded.parquet')
len(df['embeddings'][0])

384

In [12]:
insert_query = """
    INSERT INTO movies (title, plot, genres, movie_cast, fullplot, countries, 
                        directors, rated, lastupdated, type, runtime, released, 
                        awards_wins, awards_nominations, year, poster, languages, 
                        writers, merged_rating, merged_plot, embeddings)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""

In [13]:
df.fillna({
    "runtime.$numberInt": 0,
    "released.$date.$numberLong": 0,
    "awards.wins.$numberInt": 0,
    "awards.nominations.$numberInt": 0,
    "year.$numberInt": 0
}, inplace=True)

In [14]:
df

Unnamed: 0,plot,genres,cast,title,fullplot,countries,directors,rated,lastupdated,type,...,released.$date.$numberLong,awards.wins.$numberInt,awards.nominations.$numberInt,year.$numberInt,poster,languages,writers,merged_rating,merged_plot,embeddings
0,Three men hammer on an anvil and pass a bottle...,[Short],"[Charles Kayser, John Ott]",Blacksmith Scene,A stationary camera looks at a large anvil wit...,[USA],[William K.L. Dickson],UNRATED,2015-08-26 00:03:50.133000000,movie,...,-2418768000000,1,0,1893,,,,6.2,A stationary camera looks at a large anvil wit...,"[-0.06101669743657112, 0.0607670433819294, -0...."
1,A group of bandits stage a brazen train hold-u...,"[Short, Western]","[A.C. Abadie, Gilbert M. 'Broncho Billy' Ander...",The Great Train Robbery,Among the earliest existing films in American ...,[USA],[Edwin S. Porter],TV-G,2015-08-13 00:27:59.177000000,movie,...,-2085523200000,1,0,1903,https://m.media-amazon.com/images/M/MV5BMTU3Nj...,[English],,7.4,Among the earliest existing films in American ...,"[-0.05368781089782715, -0.01406814344227314, -..."
2,"A young boy, opressed by his mother, goes on a...","[Short, Drama, Fantasy]","[Martin Fuller, Mrs. William Bechtel, Walter E...",The Land Beyond the Sunset,"Thanks to the Fresh Air Fund, a slum child esc...",[USA],[Harold M. Shaw],UNRATED,2015-08-29 00:27:45.437000000,movie,...,-1804377600000,1,0,1912,https://m.media-amazon.com/images/M/MV5BMTMzMD...,[English],[Dorothy G. Shore],7.1,"Thanks to the Fresh Air Fund, a slum child esc...","[0.02496330812573433, 0.06163919344544411, -4...."
3,"A greedy tycoon decides, on a whim, to corner ...","[Short, Drama]","[Frank Powell, Grace Henderson, James Kirkwood...",A Corner in Wheat,"A greedy tycoon decides, on a whim, to corner ...",[USA],[D.W. Griffith],G,2015-08-13 00:46:30.660000000,movie,...,-1895097600000,1,0,1909,,[English],,6.6,"A greedy tycoon decides, on a whim, to corner ...","[-0.05462929233908653, 0.0717046782374382, -0...."
4,"Cartoon figures announce, via comic strip ball...","[Animation, Short, Comedy]",[Winsor McCay],"Winsor McCay, the Famous Cartoonist of the N.Y...",Cartoonist Winsor McCay agrees to create a lar...,[USA],"[Winsor McCay, J. Stuart Blackton]",,2015-08-29 01:09:03.030000000,movie,...,-1853539200000,1,0,1911,https://m.media-amazon.com/images/M/MV5BYzg2Nj...,[English],"[Winsor McCay (comic strip ""Little Nemo in Slu...",7.3,Cartoonist Winsor McCay agrees to create a lar...,"[-0.04025116562843323, -0.046722412109375, -0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23442,,,,,,,,,,,...,0,0,0,0,,,,,,[]
23443,,,,,,,,,,,...,0,0,0,0,,,,,,[]
23444,,,,,,,,,,,...,0,0,0,0,,,,,,[]
23445,,,,,,,,,,,...,0,0,0,0,,,,,,[]


In [15]:
df.isna().sum()

plot                             1074
genres                           1147
cast                             1402
title                            1074
fullplot                         1450
countries                        1084
directors                        1329
rated                            9866
lastupdated                      1074
type                             1074
runtime.$numberInt                  0
released.$date.$numberLong          0
awards.wins.$numberInt              0
awards.nominations.$numberInt       0
year.$numberInt                     0
poster                           3645
languages                        1297
writers                          2245
merged_rating                    1607
merged_plot                      1074
embeddings                          0
dtype: int64

In [16]:
df.fillna('-', inplace=True)

In [17]:
data = [
    (
        row["title"], row["plot"], row["genres"], row["cast"], row["fullplot"],
        row["countries"], row["directors"], row["rated"], row["lastupdated"],
        row["type"], row["runtime.$numberInt"], int(row["released.$date.$numberLong"]),
        int(row["awards.wins.$numberInt"]), int(row["awards.nominations.$numberInt"]),
        int(row["year.$numberInt"]), row["poster"], row["languages"], row["writers"],
        row["merged_rating"], row["merged_plot"], row["embeddings"]
    )
    for _, row in df.iterrows()
]

In [128]:
import ast
import numpy as np

def convert_embedding(embedding):
    if isinstance(embedding, str):
        try:
            return np.array(ast.literal_eval(embedding), dtype=np.float32)
        except Exception as e:
            print(f"Error converting string embedding: {embedding} -> {e}")
            return np.zeros(384, dtype=np.float32)
    elif isinstance(embedding, (list, np.ndarray)):
        try:
            return np.array(embedding, dtype=np.float32)
        except Exception as e:
            print(f"Error converting list/ndarray embedding: {embedding} -> {e}")
            return np.zeros(384, dtype=np.float32)
    else:
        print(f"Invalid embedding type: {type(embedding)}")
        return np.zeros(384, dtype=np.float32)

df["embeddings"] = df["embeddings"].apply(convert_embedding)

In [145]:
invalid_embeddings = df[df['embeddings'].apply(lambda x: not isinstance(x, (np.ndarray, list)))]
print(invalid_embeddings[['title', 'embeddings']])

Empty DataFrame
Columns: [title, embeddings]
Index: []


In [90]:
# df.iloc[21299:]

In [18]:
for i, row in df.iterrows():
    try:
        data = (
            row["title"], row["plot"], row["genres"], row["cast"], row["fullplot"],
            row["countries"], row["directors"], row["rated"], row["lastupdated"],
            row["type"], row["runtime.$numberInt"], int(row["released.$date.$numberLong"]),
            int(row["awards.wins.$numberInt"]), int(row["awards.nominations.$numberInt"]),
            int(row["year.$numberInt"]), row["poster"], row["languages"], row["writers"],
            row["merged_rating"], row["merged_plot"], row["embeddings"]
        )
        cur.execute(insert_query, data)
    except Exception as e:
        print(f"Error inserting row {i} (Title: {row['title']}): {e}")
        continue  # Skip to next row

conn.commit()
print("Data inserted successfully!")

Error inserting row 0 (Title: Blacksmith Scene): could not convert string to float: 'Short'
Error inserting row 1 (Title: The Great Train Robbery): could not convert string to float: 'Short'
Error inserting row 2 (Title: The Land Beyond the Sunset): could not convert string to float: 'Short'
Error inserting row 3 (Title: A Corner in Wheat): could not convert string to float: 'Short'
Error inserting row 4 (Title: Winsor McCay, the Famous Cartoonist of the N.Y. Herald and His Moving Comics): could not convert string to float: 'Animation'
Error inserting row 5 (Title: Traffic in Souls): could not convert string to float: 'Crime'
Error inserting row 6 (Title: Gertie the Dinosaur): could not convert string to float: 'Animation'
Error inserting row 7 (Title: In the Land of the Head Hunters): could not convert string to float: 'Drama'
Error inserting row 8 (Title: The Perils of Pauline): could not convert string to float: 'Action'
Error inserting row 9 (Title: The Birth of a Nation): could no

In [138]:
conn.rollback()

In [None]:
df.to_parquet('database_movies.parquet', index=False)