In [3]:
import os
import csv
import math
import random
import requests
import constants as const 
from authentication import AuthObject
from CitesphereConnector import CitesphereConnector

In [4]:
# Authenticate against Citesphere with an OAuth access token.
auth_object = AuthObject()
auth_object.authType  = 'oauth'
# The access token is a secret, so it is read from the environment instead of
# being stored in the notebook. Export CITESPHERE_ACCESS_TOKEN before starting
# the kernel.
# NOTE(review): the token that used to be hard-coded here has been committed
# with the notebook and should be treated as compromised and revoked.
auth_object.access_token = os.environ.get("CITESPHERE_ACCESS_TOKEN")
if not auth_object.access_token:
    raise RuntimeError(
        "Set the CITESPHERE_ACCESS_TOKEN environment variable to your "
        "Citesphere OAuth access token before running this notebook."
    )
citesphere_api_url = const.CITESPHERE_API_URL
connector = CitesphereConnector(citesphere_api_url, auth_object)
#default max number of items displayed on a collection items page in citesphere
max_size=const.MAX_SIZE

def get_file(file_id:str)-> str:
    """Build the Giles content URL for the file with the given id.

    The result is const.GILES_URL followed by "<file_id>/content".
    """
    content_path = f"{file_id}/content"
    return const.GILES_URL + content_path

"""
groups is a list of dictionaries containing information about the different groups of collections,
with the following properties:

{'id': ,
  'name': '',
  'version': ,
  'created': <date>,
  'lastModified': <date>,
  'numItems': ,
  'owner': ,
  'type': '',
  'description': '',
  'url': '',
  'syncInfo': }
  
"""

In [5]:
# Fetch the groups through the connector; the result is passed to process_groups below.
groups=connector.get_groups()

"""

this following function "download_files(folder_path:str,ids:set, citesphere_token:str) -> list":

has following parameters:

1. folder_path:str = location of the folder where the user wants to save the downloaded files

2. ids:set = set of tuples, containing pair of file ids and file names

3. citesphere_token:str = token to access data from giles

function:
this function iterates through the ids set of tuples, plugging each file id into the giles url string to get the
file url from giles for downloading the file to the dedicated folder.

Append the path to the path_list.

If the response to the get request has status 200,
it writes the file with the appropriate content, saves it to the download folder and *returns* the list of the paths
for the downloaded files, so that it can be saved to the csv file's path field.

"""

In [16]:
#download files from the collection items
def download_files(folder_path:str,ids:set, citesphere_token:str) -> list:
    """Download files from Giles into ``folder_path`` and return the saved paths.

    Parameters:
        folder_path: Folder the files are written to; it is created if missing.
        ids: Collection of ``(file_id, file_name)`` pairs.
        citesphere_token: Token sent as a Bearer token in the Authorization header.

    Returns:
        List with the path of every file that was actually written, i.e. every
        file whose request returned status 200. Files that could not be
        retrieved are left out, so the list never points at a missing file.
    """

    #stores paths to downloaded files
    path_list = []

    # The target folder and the request headers are the same for every file,
    # so they are prepared once instead of once per iteration.
    os.makedirs(folder_path, exist_ok=True)
    headers = {
        "Authorization": f"Bearer {citesphere_token}",
        "Content-Type": "application/pdf;charset=UTF-8"
        }

    #iterating through the (file id, file name) pairs
    for (file_id, file_name) in ids:

        # getting the file url using the giles file id
        giles_url = get_file(file_id)
        filename = os.path.join(folder_path, f"{file_name}")

        response = requests.get(giles_url, headers=headers)

        #saving the file if retrieved successfully
        if response.status_code == 200:
            with open(filename, "wb") as file:
                file.write(response.content)
            # Only record the path once the file really exists on disk.
            path_list.append(filename)
    return path_list

"""

The following function "get_set(items_list: list) -> set:" 
Get a set of file IDs and names from the collection items.

Parameters:
items_list (list): List of items containing information about files.

Returns:
set: Set of tuples containing pairs of file IDs and file names.

Functionality:
This function iterates through the 'items_list' and extracts file IDs and names from different fields such as
'uploadedFile', 'extractedText', 'pages', 'image', 'text', 'ocr', and 'additionalFiles'. It creates a set of
tuples with file IDs and names and returns the resulting set.

"""

"""# Get a set of file IDs and names from the collection items
def get_set(items_list: list) -> set:
    ids = set()

    for values in items_list:
        # Getting the file IDs in uploadedFile, extractedText, pages, image, text, ocr, additionalFiles
        ids.add((values["uploadedFile"]["id"], values["uploadedFile"]["filename"]))

        # Check if extractedText is present and not equal to "None"
        if values["extractedText"] and values["extractedText"] != "None":
            ids.add((values["extractedText"]["id"], values["extractedText"]["filename"]))

        # Check if pages is present and not equal to "None"
        if values["pages"] and values["pages"] != "None":
            for value in values["pages"]:
                ids.add((value["image"]["id"], value["image"]["filename"]))
                ids.add((value["text"]["id"], value["text"]["filename"]))
                ids.add((value["ocr"]["id"], value["ocr"]["filename"]))

                # Iterate through additionalFiles in pages
                for file in value["additionalFiles"]:
                    ids.add((file["id"], file["filename"]))

    return ids"""

"""
The following function write_to_csv(csv_name: str, item: list, flag: int) -> None:
Write metadata to a CSV file.

Parameters:
1. csv_name (str): Name of the CSV file.
2. item (dict): Dictionary containing the metadata and file paths of one collection item.
3. flag (int): Flag to determine whether to write the fields at the top of the file.

Returns:
None

Functionality:
This function writes metadata to a CSV file. It uses a flag to decide whether to write the fields at the top of
the file. If the flag is 0, it first writes the field names as a header row; in every case it then writes the
item's values as a row. It appends the data to the existing CSV file.
"""


In [17]:
# Create the CSV file with all the metadata and file path of the downloaded files
def write_to_csv(csv_name: str, item: dict, flag: int) -> None:
    """Append one collection item to the CSV file as a row.

    Parameters:
        csv_name: Path of the CSV file; it is opened in append mode.
        item: Mapping of field name to value for a single item. Its keys are
            used for the header row and its values for the data row.
        flag: 0 to write the header row (the item's keys) before the values;
            any other value writes only the values.

    Returns:
        None
    """

    # The encoding is fixed to UTF-8 so that non-ASCII metadata is written the
    # same way on every platform instead of depending on the locale default.
    with open(csv_name, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Check if it's the first time writing to the file
        if flag == 0:
            fields = list(item.keys())
            writer.writerow(fields)

        # Write the values to the CSV file
        writer.writerow(list(item.values()))


"""
The following function add_to_csv(csv_name: str, folder_name: str, items: list, csv_dict: dict, flag: int) -> int:
Add items to a dictionary object 'csv_dict' that contains all the collection items, with metadata and
downloaded file path.

Parameters:
1. csv_name (str): Name of the CSV file.
2. folder_name (str): Name of the folder where the files will be downloaded.
3. items (dict): Page of results whose "items" entry is the list of collection items.
4. csv_dict (dict): Dictionary containing collection items with metadata and file paths.
5. flag (int): Flag to determine whether to write the fields at the top of the file.

Returns:
int: Updated flag value.

Functionality:
This function iterates through the collection items in 'items'. For each item, it checks if the item's key
already exists in 'csv_dict'. If not, it adds the item to 'csv_dict' with a unique key and sets the 'paths'
property to an empty string. If 'gilesUploads' are present in the item, it gets the set of file IDs and names,
downloads the files, and updates the 'paths' property accordingly. The function then writes the item to the
CSV file using the 'write_to_csv' function and updates the flag. Finally, it returns the updated flag value.
"""

In [24]:
def _collect_giles_files(uploads) -> set:
    """Return the set of (file id, file name) pairs referenced by the Giles uploads."""
    found = set()

    def _add(entry) -> None:
        # An entry may be missing, None or the string "None"; those are skipped.
        if entry and entry != "None":
            found.add((entry["id"], entry["filename"]))

    for upload in uploads:
        _add(upload.get("uploadedFile"))
        _add(upload.get("extractedText"))

        pages = upload.get("pages")
        if not pages or pages == "None":
            continue

        for page in pages:
            # The per-page files get the same null checks as the upload-level
            # ones, so a page without text or ocr no longer raises a TypeError.
            _add(page.get("image"))
            _add(page.get("text"))
            _add(page.get("ocr"))

            extras = page.get("additionalFiles")
            if extras and extras != "None":
                for extra in extras:
                    _add(extra)

    return found


def add_to_csv(csv_name: str, folder_name: str, items: dict, csv_dict: dict, flag: int) -> int:
    """Download the files of each new collection item and write the item to the CSV.

    Parameters:
        csv_name: Name of the CSV file the items are written to.
        folder_name: Folder the Giles files are downloaded into.
        items: Page of results; its "items" entry is the list of collection items.
        csv_dict: Items handled so far, keyed by item["key"]. Items whose key is
            already present are skipped; new items are added in place.
        flag: Header flag handed to write_to_csv (0 writes the header row first).

    Returns:
        The updated flag: 1 once at least one item has been written, otherwise
        the value that was passed in.

    Each written item gets a "paths" entry: the list returned by download_files
    when the item references Giles files, or an empty string when it does not.
    """

    for item in items["items"]:
        if item["key"] in csv_dict:
            continue
        item["paths"] = ""

        # Check if gilesUploads are present
        uploads = item.get("gilesUploads")
        if uploads and uploads != "None":
            giles_ids = _collect_giles_files(uploads)

            if giles_ids:
                # store paths of the downloaded files to the path attribute
                item["paths"] = download_files(folder_name, giles_ids, auth_object.access_token)

        # Add the item to csv_dict and write it to the CSV file
        csv_dict[item["key"]] = item
        write_to_csv(csv_name, item, flag)
        flag = 1

    return flag


"""
The following function process_groups(csv_name: str, folder_path: str, groups: list, connector, max_size: int) -> dict:
Downloads the files and generates a CSV file containing all the group items information.

Parameters:
1. csv_name (str): Name of the CSV file.
2. folder_path (str): Name of the folder where the files will be downloaded.
3. groups (list): List of group information.
4. connector: Connector object to interact with the data source.
5. max_size (int): Maximum size for each page of items.

Returns:
dict: Dictionary containing collection items with metadata and file paths.

Functionality:
This function iterates through the 'groups'. For each group, it gets the collections and then retrieves items
from each collection. It calls the 'add_to_csv' function to add the items to a dictionary 'csv_dict' and write
them to the CSV file. The function returns 'csv_dict' containing all the collection items with metadata and
file paths.

"""

In [25]:
# Downloads and generates a CSV file containing all the group items information
def process_groups(csv_name: str, folder_path: str, groups: list, connector, max_size: int) -> dict:
    """Download the files of every group and write the item metadata to a CSV.

    Parameters:
        csv_name: Name of the CSV file the items are appended to.
        folder_path: Folder the files are downloaded into.
        groups: List of group dictionaries; only each group's "id" is read here.
        connector: Object providing get_collections(group_id) and
            get_collection_items(group_id, collection_key, page).
        max_size: Number of items per page, used to work out how many pages
            each collection has.

    Returns:
        Dictionary of all collection items that were processed, keyed by item key.
    """

    csv_dict = {}

    # Rows are appended to the CSV, so the header row is only requested
    # (flag 0) when the file is new or empty. Starting at 0 for a file that
    # already holds rows would put a second header in the middle of the data
    # when the notebook is run again.
    has_rows = os.path.exists(csv_name) and os.path.getsize(csv_name) > 0
    flag = 1 if has_rows else 0

    #Iterate over the groups
    for group in groups:
        group_id = group["id"]
        collections = connector.get_collections(group_id)

        #Iterate over the collections in the respective group
        for collection in collections["collections"]:
            num_pages = math.ceil(collection["numberOfItems"] / max_size)

            #Iterating over the pages (page numbers start at 1)
            for page in range(1, num_pages + 1):
                items = connector.get_collection_items(group_id, collection["key"], page)
                flag = add_to_csv(csv_name, folder_path, items, csv_dict, flag)

    return csv_dict


In [None]:
# Output locations: the CSV that receives the item metadata and the folder
# that receives the downloaded files.
csv_filename = "citesphere_csv.csv"
folder_path  = "Files"

# Walk every group, download its files and write the metadata CSV.
process_groups(
    csv_name=csv_filename,
    folder_path=folder_path,
    groups=groups,
    connector=connector,
    max_size=max_size,
)
