In [None]:
import re
import os
import pandas as pd
import duckdb
from datetime import datetime

def initialize_metrics_db():
    """Create and seed the metrics database if it doesn't exist.

    When ``metrics.duckdb`` is absent, the file is created with a
    ``reviewer_metrics`` table keyed on "MS Number" and every row of
    ``merged_file.csv`` is loaded into it. When the file is already
    present the function returns immediately without reading the CSV.
    """
    if os.path.exists('metrics.duckdb'):
        return

    # Read the seed CSV only when it is actually going to be inserted.
    df = pd.read_csv('merged_file.csv')

    with duckdb.connect('metrics.duckdb') as con:
        # Create your table(s) with proper constraints
        con.execute("""
            CREATE TABLE IF NOT EXISTS reviewer_metrics (
                Name STRING,
                "MS Number" STRING PRIMARY KEY,  -- Add PRIMARY KEY to prevent duplicates
                Version STRING,
                Year INTEGER,
                Editor STRING,
                Journal STRING,
                "Date Invited" DATE,
                "Date Completed" DATE
            )
        """)

        # `df` in the SQL refers to the local DataFrame above; SELECT * feeds
        # its columns into the table positionally, so the CSV column order
        # must match the table definition.
        con.execute("INSERT INTO reviewer_metrics SELECT * FROM df")

        # Report success only once the table has actually been populated.
        print("Metrics database initialized.")

def get_existing_data():
    """Load every row of ``reviewer_metrics`` from ``metrics.duckdb``.

    The database is opened read-only. Any failure is reported with a
    printed message and an empty DataFrame is returned in place of the
    table contents.
    """
    query = "SELECT * FROM reviewer_metrics"
    try:
        with duckdb.connect('metrics.duckdb', read_only=True) as connection:
            metrics = connection.execute(query).fetchdf()
            print("Data fetched successfully from metrics database.")
            return metrics
    except Exception as err:
        print(f"Error fetching from metrics database: {err}")
        return pd.DataFrame()

def get_local_data():
    """Load every row of ``reviews`` from ``reviews.duckdb``.

    The database is opened read-only. Any failure is reported with a
    printed message and an empty DataFrame is returned in place of the
    table contents.
    """
    query = "SELECT * FROM reviews"
    try:
        with duckdb.connect('reviews.duckdb', read_only=True) as connection:
            reviews = connection.execute(query).fetchdf()
            print("Data fetched successfully from reviews database.")
            return reviews
    except Exception as err:
        print(f"Error fetching from reviews database: {err}")
        return pd.DataFrame()

def check_for_new_data():
    """Return the review rows whose "MS Number" is not yet in the metrics table.

    Rows come from ``get_local_data()`` (with the ``MS_Number`` column
    renamed to ``MS Number``) and are compared against the "MS Number"
    values returned by ``get_existing_data()``.

    Returns
    -------
    pandas.DataFrame
        The new rows with a fresh 0..n-1 index and the same columns as the
        renamed reviews frame. Empty when there are no reviews or none of
        them are new.
    """
    # Rename the column from 'MS_Number' to 'MS Number' so both sources agree
    reviews_df = get_local_data().rename(columns={'MS_Number': 'MS Number'})
    existing_df = get_existing_data()

    # An empty reviews frame may have no columns at all (the fetch helpers
    # return a bare DataFrame on error), so bail out before indexing it.
    if reviews_df.empty:
        return pd.DataFrame(columns=reviews_df.columns)

    # An empty metrics frame means every review counts as new.
    existing_ms_numbers = [] if existing_df.empty else existing_df['MS Number']

    # Single vectorised membership test instead of growing a frame row by row;
    # this also keeps the original column dtypes intact.
    is_new = ~reviews_df['MS Number'].isin(existing_ms_numbers)
    return reviews_df.loc[is_new].reset_index(drop=True)

def format_data():
    """Append new review rows to ``reviewer_metrics`` and return the full table.

    New rows come from ``check_for_new_data()``. Their "Date Invited" and
    "Date Completed" columns are converted with ``pd.to_datetime`` and the
    rows are sorted by "Year" (descending) before being inserted.

    Returns
    -------
    pandas.DataFrame or None
        * The complete ``reviewer_metrics`` table, read back after the
          insert, when new rows were appended. Both success paths therefore
          return the whole table, not just the appended rows.
        * The result of ``get_existing_data()`` when there was nothing new.
        * ``None`` when the database work raised; the error is printed.
    """
    df = check_for_new_data()

    if df.empty:
        print("No new rows to append.")
        return get_existing_data()

    # Build a new frame rather than mutating the one returned by the helper.
    new_df = df.assign(**{
        'Date Invited': pd.to_datetime(df['Date Invited']),
        'Date Completed': pd.to_datetime(df['Date Completed']),
    }).sort_values(by='Year', ascending=False)

    try:
        with duckdb.connect('metrics.duckdb') as con:
            # `new_df` in the SQL refers to the local DataFrame above.
            # NOTE: this is a plain INSERT with no conflict clause; any
            # constraint violation raised by the table aborts the append
            # and is reported by the handler below.
            con.execute("INSERT INTO reviewer_metrics SELECT * FROM new_df")

            # Read the whole table back so the caller gets old + new rows.
            combined_df = con.execute("SELECT * FROM reviewer_metrics").fetchdf()
    except Exception as e:
        print(f"Error inserting data: {e}")
        return None

    print(f"Appended {len(new_df)} new rows")
    return combined_df

def main():
    """Run the append step and write its result to ``combined_df.csv``.

    Nothing is written when ``format_data()`` returns ``None``.
    """
    combined_df = format_data()

    # Guard clause: no frame came back, so leave the CSV untouched.
    if combined_df is None:
        print("No changes made to the data.")
        return

    combined_df.to_csv('combined_df.csv', index=False)
    print("Data successfully combined and saved.")


# Only run the pipeline when executed directly, not when imported.
if __name__ == "__main__":
    main()


In [15]:
import re
import os
import pandas as pd
import duckdb
from datetime import datetime

def initialize_metrics_db():
    """Rebuild ``reviewer_metrics`` in ``metrics.duckdb`` from ``merged_file.csv``.

    Any existing ``reviewer_metrics`` table is dropped and recreated, then
    every row of the CSV is inserted. A sample of the CSV is printed first
    as a debugging aid.
    """
    # Load the CSV file and ensure date columns are parsed correctly
    df = pd.read_csv('merged_file.csv', parse_dates=['Date Invited', 'Date Completed'])

    # Debugging: Print the first few rows to verify the data
    print("CSV Data Sample:")
    print(df.head())

    # The context manager closes the connection even if a statement raises.
    with duckdb.connect('metrics.duckdb') as con:
        # Drop the existing table if it exists
        con.execute("DROP TABLE IF EXISTS reviewer_metrics")

        # Create the table with the correct schema
        # NOTE(review): "Date Completed" is declared STRING although the CSV
        # column is parsed as a date above - confirm this is intentional.
        con.execute("""
        CREATE TABLE reviewer_metrics (
            Name STRING,
            "MS Number" STRING,
            Version STRING,
            Year INTEGER,
            Editor STRING,
            Journal STRING,
            "Date Invited" DATE,
            "Date Completed" STRING
        )
        """)

        # `df` in the SQL refers to the local DataFrame loaded above.
        con.execute("INSERT INTO reviewer_metrics SELECT * FROM df")

    print("Data imported successfully into DuckDB.")

def fetch_data():
    """Return every row of ``reviewer_metrics`` from ``metrics.duckdb``.

    A sample of the fetched rows is printed as a debugging aid before the
    DataFrame is returned. Errors from the database are not caught here.
    """
    # The context manager closes the connection even if the query raises.
    with duckdb.connect('metrics.duckdb') as con:
        df = con.execute("SELECT * FROM reviewer_metrics").fetchdf()

    # Debugging: Print the first few rows to verify the data
    print("Fetched Data Sample:")
    print(df.head())

    return df

def main():
    """Rebuild the metrics table, read it back, and export it to ``combined_df.csv``."""
    initialize_metrics_db()

    # Write the freshly fetched table straight to disk, without the index column.
    fetch_data().to_csv('combined_df.csv', index=False)
    print("Data successfully combined and saved.")


# Only run the pipeline when executed directly, not when imported.
if __name__ == "__main__":
    main()

CSV Data Sample:
                                             Name             MS Number  \
0       EUONCO-D-24-00985R2 (Dunn) 2025-03-24.pdf   EUONCO-D-24-00985R2   
1  EUONCO-D-24-00971R1 (MacLennan) 2025-02-02.pdf   EUONCO-D-24-00971R1   
2   EUFOCUS-D-24-00753R3 (Vickers) 2025-01-24.pdf  EUFOCUS-D-24-00753R3   
3   EUFOCUS-D-24-00842R1 (Vickers) 2025-01-24.pdf  EUFOCUS-D-24-00842R1   
4      EURUROL-D-25-00025R0 (Dunn) 2025-01-23.pdf  EURUROL-D-25-00025R0   

  Version  Year     Editor  Journal Date Invited Date Completed  
0      R2  2025       Dunn   EUONCO   2025-03-11     2025-03-11  
1      R1  2025  MacLennan   EUONCO   2025-01-21     2025-01-22  
2      R3  2025    Vickers  EUFOCUS   2025-01-15     2025-01-21  
3      R1  2025    Vickers  EUFOCUS   2025-01-15     2025-01-21  
4      R0  2025       Dunn  EURUROL   2025-01-17     2025-01-19  
Data imported successfully into DuckDB.
Fetched Data Sample:
                                             Name             MS Number  \
