In [16]:
import json

# Local settings file that provides the API endpoint, credentials and the
# folder whose files will be uploaded.
local_config_file = "./local.settings.json"

# JSON text is UTF-8 by specification. Pin the encoding so the platform
# default (for example cp1252 on Windows) cannot garble non-ASCII values.
with open(local_config_file, encoding="utf-8") as config_stream:
    config = json.load(config_stream)

# Every API setting lives under DATA -> API, so resolve that section once.
_api_settings = config["DATA"]["API"]

api_path = _api_settings["Path"]
api_key = _api_settings["Key"]
user_id = _api_settings["UserID"]
organization_id = _api_settings["OrganizationID"]
user_name = _api_settings["UserName"]
process_files = _api_settings["Methods"]["ProcessFiles"]
query_files = _api_settings["Methods"]["QueryFiles"]

local_files_folder = config["Local"]["FilesFolder"]

# Destination of the CSV export. When the file name is left as None, a
# timestamped name is generated at export time.
output_csv_file_path = "./output/"
output_csv_file_name = None

#print(local_files_folder)

In [17]:
import os

# Recursively collect the path of every ".txt" file below the configured folder.
files_to_upload = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(local_files_folder)
    for name in names
    if name.endswith(".txt")
]

In [18]:
from typing import List
    
def get_mime_type(file_path:str) -> str:
    """Return the MIME type to send for ``file_path``, chosen from its extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is treated
    the same as ``report.pdf``.

    Args:
        file_path: Path or file name of the document.

    Returns:
        The MIME type for ``.pdf``, ``.docx``, ``.doc`` and ``.txt`` files, or
        ``"application/octet-stream"`` for any other name.
    """
    known_types = (
        (".pdf", "application/pdf"),
        (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        (".doc", "application/msword"),
        (".txt", "text/plain"),
    )

    # Compare against a lower-cased copy so upper- or mixed-case extensions
    # are recognised instead of falling through to the generic type.
    lowered = file_path.lower()
    for suffix, mime_type in known_types:
        if lowered.endswith(suffix):
            return mime_type

    return "application/octet-stream"

class FileProcessingInputData:
    """Metadata sent as the JSON part of a file-processing upload.

    The fields below are annotations only; the class defines no defaults.
    ``upload_document`` serializes an instance with
    ``json.dumps(..., default=vars)``, so only attributes that were actually
    assigned on the instance end up in the payload.
    """
    # Filled by get_file_data from the "UserID" entry of the API config.
    UserId:str
    # Filled by get_file_data from the "OrganizationID" entry of the API config.
    OrganizationId:str
    # Filled by get_file_data from the "UserName" entry of the API config.
    UserName:str
    # Base name of the uploaded file; also used as the multipart field name.
    FileId:str
    # get_file_data always sets this to True.
    # NOTE(review): server-side meaning is not visible in this notebook.
    ScoreMarginalDocuments:bool
    # Not assigned by get_file_data, so absent from the payload unless set by the caller.
    Tags:List[str]
    # Not assigned by get_file_data, so absent from the payload unless set by the caller.
    Properties:dict

def get_file_data(file_path:str) -> "FileProcessingInputData":
    """Build the upload metadata for a single local file.

    Args:
        file_path: Path of the file to upload; its base name becomes ``FileId``.

    Returns:
        A ``FileProcessingInputData`` carrying the configured user,
        organization and user name, the file's base name, and
        ``ScoreMarginalDocuments`` set to ``True``.
    """
    payload = FileProcessingInputData()

    # Attributes are assigned in this order, which is also the order in which
    # they appear when the object is serialized through ``vars``.
    attribute_values = (
        ("UserId", user_id),
        ("OrganizationId", organization_id),
        ("UserName", user_name),
        ("FileId", os.path.basename(file_path)),
        ("ScoreMarginalDocuments", True),
    )
    for attribute, value in attribute_values:
        setattr(payload, attribute, value)

    return payload

In [19]:
import requests
from io import StringIO
from datetime import datetime

# Full URL of the "process files" endpoint that uploads are posted to.
api_process_files_path = f'{api_path}/{process_files}'

# Remember when this run began; the formatted timestamp is later used to
# restrict the results query to documents uploaded from this point on.
# NOTE(review): datetime.now() is naive local time - confirm the API compares
# dateUploaded in the same time zone.
process_start = datetime.now()
process_start_str = datetime.strftime(process_start, "%Y-%m-%dT%H:%M:%S")

print(f"Process started at {process_start_str}")

def upload_document(file_data:FileProcessingInputData, file_path:str):
    """Upload one file and its metadata to the file-processing endpoint.

    Sends a multipart POST to ``api_process_files_path`` with two parts: a
    ``json`` part holding ``file_data`` serialized through ``vars``, and a part
    named after ``file_data.FileId`` holding the raw file bytes. The response
    status code, reason and body are printed; nothing is returned.

    Args:
        file_data: Metadata describing the upload; ``FileId`` must be set.
        file_path: Path of the local file whose bytes are uploaded.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
        requests.RequestException: If the HTTP request itself fails.
    """
    url = api_process_files_path
    headers = {"Data-Subscription-Key": api_key}

    data_json = StringIO(json.dumps(file_data, default=vars))

    # Read through a context manager so the handle is closed immediately
    # rather than staying open until the garbage collector gets to it.
    with open(file_path, "rb") as source:
        file_bytes = source.read()

    files = {
        "json": (None, data_json, "application/json; charset=UTF-8"),
        file_data.FileId: (file_data.FileId, file_bytes, get_mime_type(file_path)),
    }

    response = requests.post(url, headers=headers, files=files)
    print(str(response.status_code) + " " + response.reason + " " + response.text)


Process started at 2023-04-19T15:43:25


In [20]:
# Upload every collected file together with freshly built metadata for it.
for path in files_to_upload:
    upload_document(get_file_data(path), path)

202 Accepted PROCESSING
202 Accepted PROCESSING
202 Accepted PROCESSING
202 Accepted PROCESSING
202 Accepted PROCESSING
202 Accepted PROCESSING


In [21]:
from python_graphql_client import GraphqlClient

# Full URL of the GraphQL "query files" endpoint.
api_query_files_path = f'{api_path}/{query_files}'

# The GraphQL client is configured with the endpoint URL and the
# subscription key header.
graphql_client = GraphqlClient(
    endpoint=api_query_files_path,
    headers={"Data-Subscription-Key": api_key},
)

In [28]:
# Query the current user's non-deleted documents uploaded since this run
# started, newest first, at most 100 of them.
# NOTE(review): user_id and process_start_str are spliced in without escaping;
# both come from local configuration / this notebook.
graphql_query = (
    'query{mydata(userId: "' + user_id + '", '
    'orderBy: {path: "DateUploaded", descending: true},'
    'where:[{groupedExpressions: ['
    '{path: "hasMyDocumentDeleted", comparison: equal, value: "false"},'
    '{ path: "dateUploaded", comparison: greaterThanOrEqual, value: "' + process_start_str + '" }'
    ']}],'
    'first:100)'
    '{totalCount,'
    'edges{node{id,fullPath,hasMyDocumentScored,error,documentExtension,dateUploaded,name,scorablePageCount,documentSize,'
    'myDocumentScore{dataScore,deceptiveFragmentsCount,deceptiveFragmentsPercentage}}},'
    'pageInfo{startCursor,endCursor,hasPreviousPage,hasNextPage}}}'
)

print(graphql_query)

query{mydata(userId: "4f540842-9d85-4bbd-8271-aa1505b888f7", orderBy: {path: "DateUploaded", descending: true},where:[{groupedExpressions: [{path: "hasMyDocumentDeleted", comparison: equal, value: "false"},{ path: "dateUploaded", comparison: greaterThanOrEqual, value: "2023-04-19T15:43:25" }]}],first:100){totalCount,edges{node{id,fullPath,hasMyDocumentScored,error,documentExtension,dateUploaded,name,scorablePageCount,documentSize,myDocumentScore{dataScore,deceptiveFragmentsCount,deceptiveFragmentsPercentage}}},pageInfo{startCursor,endCursor,hasPreviousPage,hasNextPage}}}


In [29]:
graphql_result = await graphql_client.execute_async(query=graphql_query)

print('ok')

try:
    print(graphql_result["data"]["mydata"]["totalCount"])
except Exception as ex:
    print(graphql_result)
    print(ex)

ContentTypeError: 0, message='Attempt to decode JSON with unexpected mimetype: ', url=URL('https://d-a-t-a.azure-api.net/my-data-files/v1.0')

In [None]:
import pandas as pd
from datetime import datetime

def _flatten_node(node: dict) -> dict:
    """Map one GraphQL document node to a flat row for the CSV export."""
    # NOTE(review): assumes myDocumentScore can be null/absent for documents
    # that have not been scored yet; treat that as "no score values".
    score = node.get("myDocumentScore") or {}
    return {
        "ID": node["id"],
        "Name": node["name"],
        "Full Path": node["fullPath"],
        "Scored": node["hasMyDocumentScored"],
        "Error": node["error"],
        "Extension": node["documentExtension"],
        "Date Uploaded": node["dateUploaded"],
        # The query in this notebook does not select companyDataScore, so a
        # plain subscript raised KeyError. Keep the column, left empty when the
        # field is absent.
        # NOTE(review): the "Fiscal Quarter" label does not match the field name - confirm.
        "Fiscal Quarter": node.get("companyDataScore"),
        "Scorable Page Count": node["scorablePageCount"],
        "Document Bytes": node["documentSize"],
        "DATA Score": score.get("dataScore"),
        "Deceptive Fragments Count": score.get("deceptiveFragmentsCount"),
        "Deceptive Fragments Percentage": score.get("deceptiveFragmentsPercentage"),
    }


# One flat row per returned document.
flattened_results = [
    _flatten_node(edge["node"])
    for edge in graphql_result["data"]["mydata"]["edges"]
]

# Build the frame straight from the records instead of round-tripping through JSON text.
df = pd.DataFrame(flattened_results)

# Use the configured file name when given, otherwise a timestamped default.
if output_csv_file_name is not None:
    file_path = output_csv_file_path + output_csv_file_name
else:
    file_path = output_csv_file_path + "output-" + str(datetime.now().timestamp()) + ".csv"

# Make sure the target folder exists so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)

df.to_csv(file_path, encoding='utf-8', index=False)

print("Wrote " + str(len(flattened_results)) + " to csv")