In [134]:
import pandas as pd
import sqlite3
import json
import os
import select
from sqlite3 import connect
from pandas import read_sql_query
from json import load 
from pandas import DataFrame
from pandas import read_csv
from rdflib import Graph, URIRef, Literal, RDF 
from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore

In [84]:
class Handler(object):
    """Root class of the handler hierarchy; all other handlers derive from it.

    The only state it keeps is ``dbPathOrUrl``, the location of the database
    the handler works on.
    """

    def __init__(self, dbPathOrUrl : str):
        """Remember the database location given at construction time."""
        self.dbPathOrUrl = dbPathOrUrl

    def setDbPathOrUrl(self, pathOrUrl : str):
        """Replace the stored database location with ``pathOrUrl``.

        Returns:
            bool: always ``True``.
        """
        self.dbPathOrUrl = pathOrUrl
        return True

    def getDbPathOrUrl(self):
        """Return the database location currently stored on the handler."""
        return self.dbPathOrUrl

class UploadHandler(Handler):
    """Generic upload handler that dispatches a file to the handler matching its extension."""

    def pushDataToDb(self, path: str):
        """Upload the file at ``path`` into the database at ``self.dbPathOrUrl``.

        Args:
            path: path of the input file; the extension (case-insensitive)
                selects the concrete handler.

        Returns:
            Whatever the delegated handler returns for ``.csv`` / ``.json``
            files, or ``False`` when the extension is not supported.
        """
        lowered_path = path.lower()

        if lowered_path.endswith(".csv"):
            # NOTE(review): JournalUploadHandler is not defined in this part of
            # the notebook; confirm that it really exposes `journalUpload`.
            journal_handler = JournalUploadHandler(self.dbPathOrUrl)
            return journal_handler.journalUpload(path)

        if lowered_path.endswith(".json"):
            # CategoryUploadHandler implements the upload in its own
            # pushDataToDb override; it has no `categoryUpload` method.
            category_handler = CategoryUploadHandler(self.dbPathOrUrl)
            return category_handler.pushDataToDb(path)

        return False

# First case: the input path points to the JSON file, whose data is loaded into the relational database

In [86]:
# Single-table version of the code: everything is stored in one table
class CategoryUploadHandler(UploadHandler):
    """Upload handler that flattens the categories JSON file into one SQLite table.

    The JSON file is read as a sequence of items. Each item may carry the
    optional keys ``identifiers`` (list), ``categories`` (list of dicts read
    through their ``id`` and ``quartile`` keys) and ``areas`` (list). The three
    lists are joined on a generated per-item id and written to the ``info``
    table of the database at ``self.dbPathOrUrl``.
    """

    # Explicit column layouts: an empty record list must still produce a frame
    # that owns the join key, otherwise the merge below raises KeyError.
    _IDENTIFIER_COLUMNS = ["item_internal_id", "identifiers"]
    _CATEGORY_COLUMNS = ["item_internal_id", "category_id", "category_quartile"]
    _AREA_COLUMNS = ["item_internal_id", "area"]

    @staticmethod
    def _flatten_items(json_data):
        """Turn the JSON items into three flat record lists (identifiers, categories, areas)."""
        identifier_rows = []
        category_rows = []
        area_rows = []

        for position, item in enumerate(json_data):
            # Internal id shared by every record generated from the same item.
            item_internal_id = "item_" + str(position)

            # `or []` also covers a key that is present but set to null.
            for identifier in item.get("identifiers") or []:
                identifier_rows.append({
                    "item_internal_id": item_internal_id,
                    "identifiers": identifier,
                })

            for category in item.get("categories") or []:
                category_rows.append({
                    "item_internal_id": item_internal_id,
                    "category_id": category.get("id"),
                    # A missing quartile is stored as an empty string.
                    "category_quartile": category.get("quartile", ""),
                })

            for area in item.get("areas") or []:
                area_rows.append({
                    "item_internal_id": item_internal_id,
                    "area": area,
                })

        return identifier_rows, category_rows, area_rows

    def pushDataToDb(self, path: str):
        """Read the JSON file at ``path`` and (re)create the ``info`` table.

        Args:
            path: path of the JSON file to upload.

        Returns:
            bool: ``True`` once the table has been written.

        Raises:
            OSError: if the JSON file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            sqlite3.Error: if the database cannot be opened or written.
        """
        with open(path, "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        identifier_rows, category_rows, area_rows = self._flatten_items(json_data)

        identifiers_df = pd.DataFrame(identifier_rows, columns=self._IDENTIFIER_COLUMNS)
        categories_df = pd.DataFrame(category_rows, columns=self._CATEGORY_COLUMNS)
        areas_df = pd.DataFrame(area_rows, columns=self._AREA_COLUMNS)

        # Inner joins: one row per (identifier, category, area) combination of
        # an item. NOTE(review): an item lacking any of the three lists is
        # dropped entirely; switch to how="outer" if such items must be kept.
        info_df = (
            identifiers_df
            .merge(categories_df, on="item_internal_id")
            .merge(areas_df, on="item_internal_id")
        )

        # sqlite3.connect creates the database file when it does not exist yet.
        con = connect(self.dbPathOrUrl)
        try:
            # No explicit con.commit() is needed: used as a context manager the
            # connection commits on success and rolls back on error. It does
            # NOT close the connection, hence the try/finally.
            with con:
                info_df.to_sql("info", con, if_exists="replace", index=False)
        finally:
            con.close()

        return True
            
# Second case: the input path points to the CSV file, whose data is loaded into the graph database


In [87]:
class QueryHandler:
    """Base class for query handlers: stores the database location and declares ``getById``."""

    def __init__(self, dbPathOrUrl: str = ""):
        """Store the database location.

        Args:
            dbPathOrUrl: location of the database; defaults to an empty string,
                so the handler can still be created without arguments and
                configured later through ``setDbPathOrUrl``.
        """
        self.dbPathOrUrl = dbPathOrUrl

    def getDbPathOrUrl(self):
        """Return the database location currently stored on the handler."""
        return self.dbPathOrUrl

    def setDbPathOrUrl(self, path):
        """Replace the stored database location with ``path``.

        Returns:
            bool: always ``True``, mirroring ``Handler.setDbPathOrUrl``.
        """
        self.dbPathOrUrl = path
        return True

    def getById(self, id):
        """
        Look up an identifiable entity by its ID in the database.

        Raises:
            NotImplementedError: always; subclasses are expected to override
                this method.
        """
        raise NotImplementedError("Questo metodo deve essere implementato nelle sottoclassi.")
    

In [131]:
class CategoryQueryHandler:
    """Read-only queries over the ``info`` table of a SQLite database.

    Every public method returns a pandas DataFrame. When the query fails, a
    message is printed and an empty DataFrame with the expected column names
    is returned instead of raising.
    """

    # Shared SELECT statements, so every method filters NULLs the same way.
    _CATEGORY_COLUMNS = ["category_id", "category_quartile"]
    _AREA_COLUMNS = ["area"]
    _SELECT_CATEGORIES = (
        "SELECT DISTINCT category_id, category_quartile FROM info "
        "WHERE category_id IS NOT NULL"
    )
    _SELECT_AREAS = "SELECT DISTINCT area FROM info WHERE area IS NOT NULL"

    def __init__(self, dbPathOrUrl: str):
        """Store the path of the SQLite database to query."""
        self.dbPathOrUrl = dbPathOrUrl

    @staticmethod
    def _placeholders(values) -> str:
        """Return a comma-separated list of ``?`` markers, one per value."""
        return ",".join("?" for _ in values)

    def _execute_query(self, query: str, params: tuple = None, columns: list = None):
        """Run ``query`` and return its result as a DataFrame.

        Args:
            query: SQL text, using ``?`` placeholders for values.
            params: values bound to the placeholders; ``None`` or empty means
                the query is executed without parameters.
            columns: column names of the empty DataFrame returned on failure.

        Returns:
            pd.DataFrame: the query result, or an empty frame (with
            ``columns``, when given) if anything goes wrong.
        """
        con = None
        try:
            con = sqlite3.connect(self.dbPathOrUrl)
            return pd.read_sql_query(query, con, params=params if params else None)
        except (sqlite3.Error, pd.errors.DatabaseError) as e:
            # pandas wraps driver errors raised while executing the SQL in
            # pandas.errors.DatabaseError, so both types must be caught to
            # reach the "missing table" message.
            if "no such table: info" in str(e):
                print(f"Database error: The table 'info' does not exist in {self.dbPathOrUrl}")
            else:
                print(f"Database error during query execution: {e}")
            return pd.DataFrame(columns=columns)
        except Exception as e:
            print(f"An unexpected error occurred during query execution: {e}")
            return pd.DataFrame(columns=columns)
        finally:
            # A sqlite3 connection used in a `with` block is not closed by it,
            # so close it explicitly.
            if con is not None:
                con.close()

    def getAllCategories(self):
        """Return the distinct non-null category ids (column ``category_id``)."""
        query = "SELECT DISTINCT category_id FROM info WHERE category_id IS NOT NULL"
        return self._execute_query(query, columns=["category_id"])

    def getAllAreas(self):
        """Return the distinct non-null areas (column ``area``)."""
        return self._execute_query(self._SELECT_AREAS, columns=self._AREA_COLUMNS)

    def getCategoriesWithQuartile(self, quartiles: list):
        """Return the distinct (category_id, category_quartile) pairs for the given quartiles.

        Args:
            quartiles: quartile values to keep; they are compared as strings.
                ``None``/NaN entries select categories without a quartile. An
                empty collection selects every category.

        Returns:
            pd.DataFrame: columns ``category_id`` and ``category_quartile``.
        """
        if not quartiles:
            return self._execute_query(self._SELECT_CATEGORIES, columns=self._CATEGORY_COLUMNS)

        wanted = []
        include_missing = False
        for quartile in quartiles:
            if quartile is None or pd.isna(quartile):
                include_missing = True
            else:
                wanted.append(str(quartile))

        conditions = []
        if wanted:
            conditions.append(f"category_quartile IN ({self._placeholders(wanted)})")
        if include_missing:
            # The uploader in this file stores a missing quartile as an empty
            # string, so "no quartile" has to match both NULL and ''.
            conditions.append("category_quartile IS NULL OR category_quartile = ''")

        query = f"{self._SELECT_CATEGORIES} AND ({' OR '.join(conditions)})"
        return self._execute_query(query, tuple(wanted), columns=self._CATEGORY_COLUMNS)

    def getCategoriesAssignedToAreas(self, areas: list):
        """Return the distinct (category_id, category_quartile) pairs found in the given areas.

        Args:
            areas: area names to keep; an empty collection selects every category.

        Returns:
            pd.DataFrame: columns ``category_id`` and ``category_quartile``.
        """
        if not areas:
            return self._execute_query(self._SELECT_CATEGORIES, columns=self._CATEGORY_COLUMNS)

        values = tuple(areas)
        query = f"{self._SELECT_CATEGORIES} AND area IN ({self._placeholders(values)})"
        return self._execute_query(query, values, columns=self._CATEGORY_COLUMNS)

    def getAreasAssignedToCategories(self, categories: list) -> pd.DataFrame:
        """Return the distinct areas in which the given categories appear.

        Args:
            categories: category ids to keep; an empty collection selects every area.

        Returns:
            pd.DataFrame: a single column ``area``.
        """
        if not categories:
            return self._execute_query(self._SELECT_AREAS, columns=self._AREA_COLUMNS)

        values = tuple(categories)
        query = f"{self._SELECT_AREAS} AND category_id IN ({self._placeholders(values)})"
        return self._execute_query(query, values, columns=self._AREA_COLUMNS)


In [97]:
# Load the "scimago.json" file into the SQLite database "dummy_relational.db".
handler = CategoryUploadHandler("dummy_relational.db")
handler.pushDataToDb("scimago.json")


In [133]:
# Query the same database file and show the distinct areas it contains.
query = CategoryQueryHandler("dummy_relational.db")
query.getAllAreas()  # last expression of the cell, so the resulting DataFrame is displayed

Unnamed: 0,area
0,Medicine
1,Computer Science
2,"Biochemistry, Genetics and Molecular Biology"
3,"Economics, Econometrics and Finance"
4,"Pharmacology, Toxicology and Pharmaceutics"
5,Energy
6,Materials Science
7,"Business, Management and Accounting"
8,Multidisciplinary
9,Chemical Engineering
