First, install the pgvector extension for PostgreSQL (see the installation guide below):
https://github.com/pgvector/pgvector?tab=readme-ov-file#installation


-- Enable pgvector and add the columns the notebook cells below read and write.
-- IF NOT EXISTS on every statement makes this script safe to re-run.
CREATE EXTENSION IF NOT EXISTS vector;

-- 2-D projection of each embedding (populated by the UMAP cell).
ALTER TABLE localnews.articles ADD COLUMN IF NOT EXISTS vector_2d vector(2);

-- Full sentence embedding of the article text.
-- NOTE(review): the declared dimension must equal the output size of the
-- embedding model used to populate this column -- confirm against that model.
ALTER TABLE localnews.articles ADD COLUMN IF NOT EXISTS vector vector(768);

-- JSON list of similar articles. Named similar_articles because that is the
-- column the per-year similarity cell updates.
ALTER TABLE localnews.articles ADD COLUMN IF NOT EXISTS similar_articles jsonb;


In [16]:
import psycopg2
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

# Embed every article with a sentence-transformer model and store the result
# in localnews.articles.vector.
#
# NOTE(review): credentials are hard-coded here; consider PGPASSWORD / ~/.pgpass.
conn = psycopg2.connect(
    dbname="decodeMT",
    user="postgres",
    password="rufy100",
)
cursor = conn.cursor()

try:
    # Fetch articles (rows without text are skipped: they cannot be encoded)
    cursor.execute(
        'select entryid, article from localnews.articles '
        'where article is not null order by parseddate'
    )
    rows = cursor.fetchall()

    # Load the sentence transformer model
    # NOTE(review): the all-MiniLM-L6-v2 model card documents 384-dimensional
    # embeddings, while the schema at the top of this file declares
    # vector(768) -- confirm which model / dimension is intended.
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Encode and write in batches: encoding a list lets the model batch the
    # forward passes, and one commit per batch avoids ~500k tiny transactions.
    batch_size = 256
    with tqdm(total=len(rows)) as progress:
        for start in range(0, len(rows), batch_size):
            batch = rows[start:start + batch_size]
            encodings = model.encode([article for _, article in batch])

            # psycopg2 adapts a Python list to a SQL ARRAY, which pgvector
            # casts to the vector column type on assignment.
            cursor.executemany(
                'update localnews.articles set vector = %s where entryid = %s',
                [
                    (encoding.tolist(), entryid)
                    for (entryid, _), encoding in zip(batch, encodings)
                ],
            )

            # Commit per batch so an interruption loses at most one batch
            conn.commit()
            progress.update(len(batch))
finally:
    # Always release the cursor and connection, even if encoding or an
    # update fails part-way through.
    cursor.close()
    conn.close()


100%|██████████| 503235/503235 [4:53:32<00:00, 28.57it/s]   


In [3]:
import psycopg2
from tqdm import tqdm
import numpy as np
from sklearn.decomposition import PCA
import ast
import umap


# Project every stored embedding down to 2-D with UMAP and write the result
# to localnews.articles.vector_2d.
conn = psycopg2.connect(
    dbname="decodeMT",
    user="postgres",
    password="rufy100",
)
cursor = conn.cursor()

try:
    # Only rows that already have an embedding can be projected; a NULL
    # vector would make ast.literal_eval raise. The ORDER BY fixes the input
    # order so the seeded UMAP run sees the same matrix on every execution.
    cursor.execute(
        'select entryid, vector from localnews.articles '
        'where vector is not null order by entryid;'
    )
    rows = cursor.fetchall()

    if rows:
        entry_ids = [row[0] for row in rows]
        # The vector column comes back as text like '[0.1,0.2,...]'
        vectors = np.array([ast.literal_eval(row[1]) for row in rows], dtype=float)

        # UMAP gives the 2-D layout; PCA(n_components=2).fit_transform(vectors)
        # is a linear alternative that was tried previously.
        umap_reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
        reduced_vectors = umap_reducer.fit_transform(vectors)

        # Write each 2-D point back; total= lets tqdm show a real progress bar
        for entryid, vector in tqdm(zip(entry_ids, reduced_vectors), total=len(entry_ids)):
            cursor.execute(
                'update localnews.articles set vector_2d = %s where entryid = %s',
                (vector.tolist(), entryid),
            )

        # One commit for the whole projection: the layout is only meaningful
        # as a complete set, and this avoids one transaction per row.
        conn.commit()
finally:
    # Close the connection even when fitting or an update fails
    cursor.close()
    conn.close()


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
503235it [02:36, 3207.23it/s]


In [1]:
import psycopg2
import logging
from tqdm import tqdm
from multiprocessing import Pool
import json

# --- Settings for the parallel similarity job ---

# PostgreSQL credentials used by every worker process
DB_NAME, DB_USER, DB_PASSWORD = "decodeMT", "postgres", "rufy100"

# Pool size and the number of rows handled per chunk
num_workers, chunk_size = 4, 10_000

# Emit INFO-level progress messages from the root logger
logging.basicConfig(level=logging.INFO)

def process_chunk(worker_id):
    """Compute and store similar-article lists for one chunk of articles.

    The chunk is the slice of embedded articles (ordered by entryid) starting
    at ``worker_id * chunk_size`` and containing at most ``chunk_size`` rows.
    For each article in the chunk, every other embedded article whose cosine
    similarity is >= 0.7 is collected and written as JSON to the article's
    ``similar_articles`` column.

    Args:
        worker_id: Zero-based index selecting which chunk this call handles.

    Returns:
        None. Results are written to the database and committed once the
        whole chunk has been processed.
    """
    # Each call opens its own connection: psycopg2 connections must not be
    # shared across processes.
    conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD)
    try:
        with conn.cursor() as cursor:
            offset = worker_id * chunk_size
            # ORDER BY makes LIMIT/OFFSET paging deterministic, so different
            # chunks never overlap or skip rows. Only the key is fetched; the
            # similarity query looks the vector up itself.
            cursor.execute(
                'SELECT entryid FROM localnews.articles WHERE vector IS NOT NULL '
                'ORDER BY entryid LIMIT %s OFFSET %s',
                (chunk_size, offset),
            )
            entry_ids = [record[0] for record in cursor.fetchall()]

            logging.info(f'Worker {worker_id}: Processing {len(entry_ids)} records.')

            for entryid in tqdm(entry_ids):
                # <=> is pgvector's cosine distance, so 1 - distance is the
                # cosine similarity.
                cursor.execute("""
                    WITH temp_table as (
                    SELECT entryid, vector_2d, vector <=> (SELECT vector FROM localnews.articles WHERE entryid = %s) AS similarity
                    FROM localnews.articles
                    WHERE vector is not null
                    )
                    SELECT entryid, vector_2d, (1-similarity) as "similar"
                    FROM temp_table
                    WHERE (1-similarity)>=0.7
                    AND entryid != %s
                    ORDER BY "similar" DESC
                    """, (entryid, entryid))

                processed_result = [
                    {
                        "to": other_id,
                        # vector_2d arrives as text like '[1.5,2.5]', which is
                        # valid JSON; it may be NULL if not yet projected.
                        "xy": json.loads(xy) if xy is not None else None,
                        "val": round(score, 2)
                    }
                    for other_id, xy, score in cursor.fetchall()
                ]

                # Update the database
                cursor.execute(
                    'UPDATE localnews.articles SET similar_articles = %s WHERE entryid = %s',
                    (json.dumps(processed_result), entryid),
                )

        # Commit the chunk's updates
        conn.commit()
    finally:
        # Closing without a commit rolls back, so a failed chunk leaves no
        # partial writes behind.
        conn.close()

    logging.info(f'Worker {worker_id}: Finished processing.')


# Get the total number of records.
# psycopg2's `with connection:` only ends the transaction, it does not close
# the connection, so the connection is closed explicitly here.
conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD)
try:
    with conn.cursor() as cursor:
        cursor.execute('SELECT COUNT(*) FROM localnews.articles WHERE vector IS NOT NULL')
        total_records = cursor.fetchone()[0]
finally:
    conn.close()

logging.info(f'Total records to process: {total_records}')

# One task per chunk (ceiling division) so that every record is covered,
# not just the first num_workers * chunk_size rows.
num_chunks = -(-total_records // chunk_size)

# Use parallel workers; chunksize=1 hands out one chunk at a time so the
# workers stay evenly loaded.
with Pool(num_workers) as pool:
    pool.map(process_chunk, range(num_chunks), chunksize=1)



INFO:root:Total records to process: 503235


In [1]:
import psycopg2
from tqdm import tqdm
import numpy as np
from sklearn.decomposition import PCA
import ast
import json


# For each publication year, find for every article the other articles of
# the same year with cosine similarity >= 0.7 and store them as JSON in
# localnews.articles.similar_articles.

# Years already processed in earlier runs; they are skipped here.
# Empty this list to (re)process every year.
completed_years = [
    2001, 2002, 2003, 2004, 2005, 2008, 2009,
    2011, 2012, 2014, 2019, 2020, 2021,
]

# Connect to the PostgreSQL database: one connection for reading the work
# list, and an autocommit connection so each article's result is persisted
# as soon as it is written.
conn = psycopg2.connect(
    dbname="decodeMT",
    user="postgres",
    password="rufy100",
)

conn_sim = psycopg2.connect(
    dbname="decodeMT",
    user="postgres",
    password="rufy100",
)
conn_sim.autocommit = True

try:
    cursor = conn.cursor()
    cursor_sim = conn_sim.cursor()

    # Fetch the years still to be processed. The WHERE clause must be a
    # boolean predicate; a bare date_part(...) is rejected by PostgreSQL.
    cursor.execute(
        "select distinct date_part('year', parseddate)::int as article_year "
        "from localnews.articles "
        "where parseddate is not null "
        "and date_part('year', parseddate)::int <> all(%s::int[]) "
        "order by article_year",
        (completed_years,),
    )
    years = [record[0] for record in cursor.fetchall()]

    for year in years:
        print(year)
        # Only the key is needed; the similarity query looks the vector up itself
        cursor.execute(
            "SELECT entryid FROM localnews.articles WHERE date_part('year', parseddate)=%s",
            (year,),
        )
        entry_ids = [record[0] for record in cursor.fetchall()]

        for entryid in tqdm(entry_ids):
            # <=> is pgvector's cosine distance, so 1 - distance is the
            # cosine similarity. Candidates are limited to the same year.
            cursor_sim.execute("""
                    WITH temp_table as (
                    SELECT entryid, vector_2d, vector <=> (SELECT vector FROM localnews.articles WHERE entryid = %s) AS similarity
                    FROM localnews.articles
                    WHERE vector is not null AND date_part('year', parseddate)=%s
                    )
                    SELECT entryid, vector_2d, (1-similarity) as "similar"
                    FROM temp_table
                    WHERE (1-similarity)>=0.7
                    AND entryid != %s
                    ORDER BY "similar" DESC
                    """, (entryid, year, entryid))

            processed_result = [
                {
                    "to": other_id,
                    # vector_2d arrives as text like '[1.5,2.5]'; it may be
                    # NULL if the article has not been projected yet.
                    "xy": ast.literal_eval(xy) if xy is not None else None,
                    "val": round(score, 2)
                }
                for other_id, xy, score in cursor_sim.fetchall()
            ]

            cursor_sim.execute(
                'update localnews.articles set similar_articles = %s where entryid = %s',
                (json.dumps(processed_result), entryid),
            )
finally:
    # Release both connections even if a query fails part-way through
    conn_sim.close()
    conn.close()



2006


100%|██████████| 21682/21682 [2:37:36<00:00,  2.29it/s]  


2007


100%|██████████| 21355/21355 [2:27:47<00:00,  2.41it/s]  


2010


100%|██████████| 28987/28987 [4:32:34<00:00,  1.77it/s]  


2013


100%|██████████| 26731/26731 [2:44:31<00:00,  2.71it/s]  


2015


100%|██████████| 29506/29506 [3:02:52<00:00,  2.69it/s]  


2016


100%|██████████| 25512/25512 [2:37:16<00:00,  2.70it/s]  


2017


100%|██████████| 23005/23005 [2:17:44<00:00,  2.78it/s]  


2018


100%|██████████| 27147/27147 [3:15:22<00:00,  2.32it/s]  


2022


100%|██████████| 24856/24856 [2:37:06<00:00,  2.64it/s]  
