### Upload input data to MinIO

In [15]:
import os
import boto3

# S3 client for the MinIO deployment.
# Credentials are read from the environment so that no secret is stored in the
# notebook: export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting
# the kernel (a missing variable raises KeyError here instead of failing later
# with an opaque authentication error).
# MINIO_ENDPOINT_URL is optional; other endpoints used during development:
#   http://127.0.0.1:61403
#   http://127.0.0.1:30080
s3 = boto3.resource('s3',
                    endpoint_url=os.environ.get(
                        'MINIO_ENDPOINT_URL',
                        'https://minio-api.digitalhub-dev.smartcommunitylab.it/'),
                    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
                    aws_session_token=None,
                    config=boto3.session.Config(signature_version='s3v4'))

In [None]:
# Upload every .json/.csv file found under <input_folder>/<year> to the "ipzs"
# bucket, using the path relative to input_folder as the object key.
ipzs_bucket = s3.Bucket("ipzs")
# Local root of the IPZS input data; set IPZS_INPUT_FOLDER to override the
# default location without editing the notebook.
input_folder = os.environ.get(
    "IPZS_INPUT_FOLDER",
    "/Users/erica/document-classification/input-folder/ipzs",
)
years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021"]

for year in years:
    for root, subfolders, files in os.walk(os.path.join(input_folder, year)):
        for item in files:
            if item.endswith((".json", ".csv")):
                fileNamePath = os.path.join(root, item)
                # Build the key from the relative path rather than by string
                # replacement, and always use "/" so keys are the same on
                # every operating system.
                object_key = os.path.relpath(fileNamePath, input_folder).replace(os.sep, "/")
                ipzs_bucket.upload_file(fileNamePath, object_key)

In [9]:
# Walk the whole input folder and delete every file whose name starts with
# ".DS" (e.g. .DS_Store), printing each path before it is removed.
for folder, _dirs, names in os.walk(input_folder):
    for name in names:
        if not name.startswith(".DS"):
            continue
        fileNamePath = str(os.path.join(folder, name))
        print(fileNamePath)
        os.remove(fileNamePath)

### Configure MLRun

In [10]:
import mlrun

In [None]:
# Point the MLRun client at the API server to use.
# MLRun API running on Minikube:
#mlrun.set_environment("http://127.0.0.1:30070")
# MLRun API running on Kubernetes:
mlrun.set_environment("https://mlrun-api.digitalhub-dev.smartcommunitylab.it", username="digitalhub-dev")

In [33]:
# Display the value MLRun resolves for MLRUN_DBPATH.
# NOTE(review): the saved output shows the Minikube address, so this cell was
# presumably last run before the Kubernetes endpoint was configured - re-run to confirm.
mlrun.get_secret_or_env("MLRUN_DBPATH")

'http://127.0.0.1:30070'

### Create a project

In [34]:
# Alternative that creates a new project with overwrite=True:
#project = mlrun.new_project("document-classification", context="./", overwrite=True, init_git=False, user_project=False)
# Get the "document-classification" project, creating it if needed; the current
# directory is used as the project context and no git repository is initialised.
project = mlrun.get_or_create_project("document-classification", context="./", init_git=False, user_project=False)

> 2023-03-30 11:35:45,930 [info] loaded project document-classification from MLRun DB


### Register the pre-processing function and run it

In [5]:
# Register the pre-processing step on the project: a "job" function whose
# handler is parse_ipzs in 01-preprocessing_handlers.py.
preproc_fn = project.set_function(
    func="01-preprocessing_handlers.py",
    name="pre-processing",
    kind="job",
    handler="parse_ipzs",
    image="mlrun/mlrun",  # includes sklearn, pandas, numpy
    # requirements=[]  # list or path to a requirements.txt
)

In [6]:
# Save the project after registering the pre-processing function
# (the call returns the MlrunProject object, shown as the cell output).
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [6]:
# Run the registered "pre-processing" function and keep the run object so that
# later cells can read its "preprocessed_data" output.
preproc_run = project.run_function(
    function="pre-processing",
    # local=False,
    outputs=["preprocessed_data"],
    params={
        "bucket_name": "ipzs",
        "idPrefix": "ipzs-",
        "limit": 10,
        "max_documents": 250,
    },
)

> 2023-03-29 14:49:04,981 [info] starting run pre-processing-parse_ipzs uid=ffd459a885e04e1cb0859b40332e18e9 DB=http://127.0.0.1:30070
> 2023-03-29 14:49:05,342 [info] Job is running in the background, pod: pre-processing-parse-ipzs-5r99f
INFO:root:s3_endpoint_url: http://minio.mlrun.svc.cluster.local:9000
INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
ERROR:root:20210623_21G00103 not in index
ERROR:root:20210623_21G00097 not in index
ERROR:root:20210623_21G00108 not in index
ERROR:root:20210622_21G00096 not in index
ERROR:root:20210622_21G00107 not in index
ERROR:root:20210621_21G00104 not in index
ERROR:root:20210618_21G00095 not in index
ERROR:root:20210617_21G00092 not in index
ERROR:root:20210616_21G00091 not in index
ERROR:root:20210615_21G00094 not in index
ERROR:root:20210614_21G00098 not in index
ERROR:root:20210611_21G00089 not in index
ERROR:root:20210609_21G00093 not in index
ERROR:

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...2e18e9,0,Mar 29 12:49:11,completed,pre-processing-parse_ipzs,kind=jobowner=ericamlrun/client_version=1.2.1host=pre-processing-parse-ipzs-5r99f,,bucket_name=ipzsidPrefix=ipzs-limit=10max_documents=250,,preprocessed_data





> 2023-03-29 14:49:18,544 [info] run executed, status=completed


In [7]:
# Display where the "preprocessed_data" output of the pre-processing run was stored.
preproc_run.outputs["preprocessed_data"]

's3://mlrun/projects/document-classification/artifacts/pre-processing-parse_ipzs/0/preprocessed_data.json'

### Register the parsing function and run it

**NOTE**: building/auto-building images does not work on ARM machines, because the resulting images target ARM while AMD64 images are required. The current workaround is to build the images manually and push them to Docker Hub.

In [13]:
# The "classification-parsing" image was built manually with:
#   docker build -t classification-parsing:latest -<<EOF
#   FROM mlrun/mlrun:1.2.1
#   RUN pip install tqdm==4.61.1
#   RUN pip install requests==2.25.1
#   RUN pip install stanza==1.4.2
#   EOF
# Register the parsing step: a "job" function whose handler is parse in
# 02-parsing_handlers.py, running on that pre-built image.
parsing_fn = project.set_function(
    func="02-parsing_handlers.py",
    name="parsing",
    kind="job",
    handler="parse",
    image="ertomaselli/classification-parsing:latest",
)

# Alternative for a VM with auto-build (base image plus requirements):
# parsing_fn = project.set_function(
#     name="parsing",
#     func="02-parsing_handlers.py",
#     handler="parse",
#     kind="job",
#     image="mlrun/mlrun",
#     requirements=["tqdm==4.61.1", "requests==2.25.1", "stanza==1.4.2"] #list or path to a requirements.txt
# )

In [14]:
# Save the project after registering the parsing function.
project.save()

<mlrun.projects.project.MlrunProject at 0x7f8e594c7d90>

In [15]:
# Run the "parsing" function on the pre-processed data; tint_url is passed as None.
parsing_run = project.run_function(
    function="parsing",
    params={"tint_url": None},
    inputs={
        "input_file": preproc_run.outputs["preprocessed_data"],
    },
)

> 2023-03-29 15:16:22,902 [info] starting run parsing-parse uid=3950c35f608e4a3da05da02d357ddbef DB=http://127.0.0.1:30070
> 2023-03-29 15:16:23,138 [info] Job is running in the background, pod: parsing-parse-n7q2m
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 79.5MB/s]                    
Downloading https://huggingface.co/stanfordnlp/stanza-it/resolve/v1.4.1/models/tokenize/combined.pt: 100%|██████████| 649k/649k [00:00<00:00, 20.7MB/s]
Downloading https://huggingface.co/stanfordnlp/stanza-it/resolve/v1.4.1/models/mwt/combined.pt: 100%|██████████| 1.16M/1.16M [00:00<00:00, 30.1MB/s]
Downloading https://huggingface.co/stanfordnlp/stanza-it/resolve/v1.4.1/models/pos/combined.pt: 100%|██████████| 34.9M/34.9M [00:00<00:00,

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...7ddbef,0,Mar 29 13:18:37,completed,parsing-parse,kind=jobowner=ericamlrun/client_version=1.2.1host=parsing-parse-n7q2m,input_file,tint_url=None,,tint_files





> 2023-03-29 15:19:40,928 [info] run executed, status=completed


In [16]:
# Display the outputs of the parsing run (the saved output lists a "tint_files" archive).
parsing_run.outputs

{'tint_files': 's3://mlrun/projects/document-classification/artifacts/parsing-parse/0/tint_files.zip'}

### Register the function that extracts test sets and run it

In [9]:
# Register the test-set extraction step: a "job" function whose handler is
# extract_test_sets in 03-extracting_test_handlers.py.
extracting_fn = project.set_function(
    func="03-extracting_test_handlers.py",
    name="extracting_test",
    kind="job",
    handler="extract_test_sets",
    image="mlrun/mlrun",
)

In [10]:
# Save the project after registering the extraction function.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [17]:
# Run "extracting_test" on the pre-processed data and the parsing output,
# with the test and dev ratios both set to 0.2.
extracting_run = project.run_function(
    function="extracting_test",
    params={"testRatio": 0.2, "devRatio": 0.2},
    inputs={
        "input_file": preproc_run.outputs["preprocessed_data"],
        "tint_files": parsing_run.outputs["tint_files"],
    },
)

> 2023-03-29 15:20:19,740 [info] starting run extracting-test-extract_test_sets uid=7e3bb9222afe4f118c3229d6bdadff11 DB=http://127.0.0.1:30070
> 2023-03-29 15:20:19,989 [info] Job is running in the background, pod: extracting-test-extract-test-sets-v6x87
INFO:root:Loading JSON file
INFO:root:Downloading and extracting tint_files.zip
INFO:root:Shuffling data
INFO:root:Extracting texts
INFO:root:Saving test list
INFO:root:Saving dev list
INFO:root:Train size: 50
INFO:root:Test size: 19
INFO:root:Dev size: 19
> 2023-03-29 13:20:26,284 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 7e3bb9222afe4f118c3229d6bdadff11 -p document-classification', 'logs_cmd': 'mlrun logs 7e3bb9222afe4f118c3229d6bdadff11 -p document-classification'}
> 2023-03-29 13:20:26,285 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...adff11,0,Mar 29 13:20:25,completed,extracting-test-extract_test_sets,kind=jobowner=ericamlrun/client_version=1.2.1host=extracting-test-extract-test-sets-v6x87,input_filetint_files,testRatio=0.2devRatio=0.2,,testlistdevlist





> 2023-03-29 15:20:29,464 [info] run executed, status=completed


In [18]:
# Display the outputs of the extraction run (the saved output lists "testlist" and "devlist").
extracting_run.outputs

{'testlist': 's3://mlrun/projects/document-classification/artifacts/extracting-test-extract_test_sets/0/testlist.txt',
 'devlist': 's3://mlrun/projects/document-classification/artifacts/extracting-test-extract_test_sets/0/devlist.txt'}

### Register the function for saving data and run it

In [11]:
# Register the data-saving step: a "job" function whose handler is save_data in
# 04-saving_data_handlers.py, running on the classification-parsing image.
saving_fn = project.set_function(
    func="04-saving_data_handlers.py",
    name="saving_data",
    kind="job",
    handler="save_data",
    image="ertomaselli/classification-parsing:latest",
)

In [12]:
# Save the project after registering the data-saving function.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [19]:
# Run "saving_data" with the pre-processed data, the test/dev lists from the
# extraction run and the archive produced by the parsing run.
saving_run = project.run_function(
    function="saving_data",
    inputs={
        "input_file": preproc_run.outputs["preprocessed_data"],
        "tint_files": parsing_run.outputs["tint_files"],
        "test_list_file": extracting_run.outputs["testlist"],
        "dev_list_file": extracting_run.outputs["devlist"],
    },
)

> 2023-03-29 15:21:25,138 [info] starting run saving-data-save_data uid=ff53af47ae0e4f06bb1fb0f1639e7b50 DB=http://127.0.0.1:30070
> 2023-03-29 15:21:25,393 [info] Job is running in the background, pod: saving-data-save-data-55lrk
INFO:root:Loading JSON file
INFO:root:Downloading and extracting tint_files.zip
INFO:root:Extracting texts
100%|██████████| 88/88 [00:00<00:00, 1185.00it/s]
INFO:root:Saving file
> 2023-03-29 13:21:32,209 [info] To track results use the CLI: {'info_cmd': 'mlrun get run ff53af47ae0e4f06bb1fb0f1639e7b50 -p document-classification', 'logs_cmd': 'mlrun logs ff53af47ae0e4f06bb1fb0f1639e7b50 -p document-classification'}
> 2023-03-29 13:21:32,209 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...9e7b50,0,Mar 29 13:21:31,completed,saving-data-save_data,kind=jobowner=ericamlrun/client_version=1.2.1host=saving-data-save-data-55lrk,input_filetest_list_filedev_list_filetint_files,,,complete





> 2023-03-29 15:21:34,864 [info] run executed, status=completed


In [20]:
# Display the outputs of the saving run (the saved output lists a "complete" JSON file).
saving_run.outputs

{'complete': 's3://mlrun/projects/document-classification/artifacts/saving-data-save_data/0/complete.json'}

### Register the filtering function and run it

In [13]:
# Register the filtering step: a "job" function whose handler is filter in
# 05-filtering_handlers.py, running on the classification-parsing image.
filtering_fn = project.set_function(
    func="05-filtering_handlers.py",
    name="filtering",
    kind="job",
    handler="filter",
    image="ertomaselli/classification-parsing:latest",
)

In [14]:
# Save the project after registering the filtering function.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [21]:
# Run "filtering" on the complete JSON produced by the saving run, with minFreq=3.
filtering_run = project.run_function(
    function="filtering",
    params={"minFreq": 3},
    inputs={
        "complete_json_file": saving_run.outputs["complete"],
    },
)

> 2023-03-29 15:21:51,686 [info] starting run filtering-filter uid=840f5151dd9348a781de14101a21956d DB=http://127.0.0.1:30070
> 2023-03-29 15:21:51,896 [info] Job is running in the background, pod: filtering-filter-twhzr
INFO:root:Loading JSON file
INFO:root:goodTokens: writing unfiltered files
INFO:root:goodTokens: 50 documents
INFO:root:goodTokens: 3 labels
INFO:root:goodTokens-by_document: extracting frequencies
INFO:root:goodTokens-by_document: extracting stopwords
INFO:root:goodTokens-by_document: stopwords size: 246
INFO:root:goodTokens-by_document: removing stopwords
INFO:root:goodTokens-by_document: calculating TF-IDF
INFO:root:goodTokens-by_document: collecting weights
INFO:root:goodTokens-by_document: ok words size: 51
INFO:root:goodTokens-by_document: writing filtered files
100%|██████████| 88/88 [00:00<00:00, 235544.83it/s]
INFO:root:goodTokens-by_label: extracting frequencies
INFO:root:goodTokens-by_label: extracting stopwords
INFO:root:goodTokens-by_label: stopwords size:

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...21956d,0,Mar 29 13:21:56,completed,filtering-filter,kind=jobowner=ericamlrun/client_version=1.2.1host=filtering-filter-twhzr,complete_json_file,minFreq=3,,filtering_files





> 2023-03-29 15:21:58,133 [info] run executed, status=completed


In [22]:
# Display the outputs of the filtering run (the saved output lists a "filtering_files" archive).
filtering_run.outputs

{'filtering_files': 's3://mlrun/projects/document-classification/artifacts/filtering-filter/0/filtering_files.zip'}

### Register the training function and run it

In [15]:
# The "classification-training" image was built manually with:
#   docker build -t classification-training:latest -<<EOF
#   FROM mlrun/mlrun:1.2.1
#   RUN apt-get update
#   RUN apt-get install build-essential -y
#   RUN pip install fasttext
#   EOF
# Register the training step: a "job" function whose handler is train in
# training_handlers.py, running on that pre-built image.
training_fn = project.set_function(
    func="training_handlers.py",
    name="training",
    kind="job",
    handler="train",
    image="ertomaselli/classification-training:latest",
)

In [16]:
# Save the project after registering the training function.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [27]:
# Run "training" using the archive produced by the filtering run as training files.
training_run = project.run_function(
    function="training",
    inputs={
        "training_files": filtering_run.outputs["filtering_files"],
    },
)

> 2023-03-29 15:38:04,255 [info] starting run training-train uid=4fe9cf5d9cf349cbbeb2b81e6a3c8a8b DB=http://127.0.0.1:30070
> 2023-03-29 15:38:05,315 [info] Job is running in the background, pod: training-train-69qk5
INFO:root:Downloading and extracting filtering_files.zip
INFO:root:Creating goodTokens model
Read 0M words
Number of words:  301
Number of labels: 3
Progress: 100.0% words/sec/thread:   29295 lr:  0.000000 avg.loss:  0.142260 ETA:   0h 0m 0s
INFO:root:Testing goodTokens model
INFO:root:Predicting with goodTokens_unfiltered_model.bin model for the whole test data
INFO:root:Logging goodTokens artifacts
INFO:root:Creating goodTokens_by_document model
Read 0M words
Number of words:  52
Number of labels: 3
Progress: 100.0% words/sec/thread:   15941 lr:  0.000000 avg.loss:      -nan ETA:   0h 0m 0s
INFO:root:Testing goodTokens_by_document model
INFO:root:Predicting with goodTokens_by_document_filtered_model.bin model for the whole test data
INFO:root:Logging goodTokens_by_docume

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...3c8a8b,0,Mar 29 13:38:19,completed,training-train,kind=jobowner=ericamlrun/client_version=1.2.1host=training-train-69qk5,training_files,,,goodTokens_unfiltered_modelgoodTokens_by_document_filtered_modelgoodTokens_by_label_filtered_modelallLemmas_unfiltered_modelallLemmas_by_document_filtered_modelallLemmas_by_label_filtered_modelallTokens_unfiltered_modelallTokens_by_document_filtered_modelallTokens_by_label_filtered_modelresults





> 2023-03-29 15:38:25,376 [info] run executed, status=completed


In [35]:
# Display the outputs of the training run (the saved output lists store:// URIs, one per model).
training_run.outputs

{'goodTokens_unfiltered_model': 'store://artifacts/document-classification/training-train_goodTokens_unfiltered_model:4fe9cf5d9cf349cbbeb2b81e6a3c8a8b',
 'goodTokens_by_document_filtered_model': 'store://artifacts/document-classification/training-train_goodTokens_by_document_filtered_model:4fe9cf5d9cf349cbbeb2b81e6a3c8a8b',
 'goodTokens_by_label_filtered_model': 'store://artifacts/document-classification/training-train_goodTokens_by_label_filtered_model:4fe9cf5d9cf349cbbeb2b81e6a3c8a8b',
 'allLemmas_unfiltered_model': 'store://artifacts/document-classification/training-train_allLemmas_unfiltered_model:4fe9cf5d9cf349cbbeb2b81e6a3c8a8b',
 'allLemmas_by_document_filtered_model': 'store://artifacts/document-classification/training-train_allLemmas_by_document_filtered_model:4fe9cf5d9cf349cbbeb2b81e6a3c8a8b',
 'allLemmas_by_label_filtered_model': 'store://artifacts/document-classification/training-train_allLemmas_by_label_filtered_model:4fe9cf5d9cf349cbbeb2b81e6a3c8a8b',
 'allTokens_unfilter

### Register the evaluation function and run it

In [17]:
# Register the evaluation step: a "job" function whose handler is evaluate in
# 06-evaluation_handlers.py.
evaluation_fn = project.set_function(
    func="06-evaluation_handlers.py",
    name="evaluation",
    kind="job",
    handler="evaluate",
    image="mlrun/mlrun",
)

In [18]:
# Save the project after registering the evaluation function.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [29]:
# Run "evaluation": the training run's "results" are the predictions and the
# filtering archive provides the gold files; show_cm is enabled.
evaluation_run = project.run_function(
    function="evaluation",
    params={"show_cm": True},
    inputs={
        "pred_files": training_run.outputs["results"],
        "gold_files": filtering_run.outputs["filtering_files"],
    },
)

> 2023-03-29 15:38:48,028 [info] starting run evaluation-evaluate uid=151456a65da344008648d1c5d9f76118 DB=http://127.0.0.1:30070
> 2023-03-29 15:38:48,339 [info] Job is running in the background, pod: evaluation-evaluate-8jc8b
INFO:root:Downloading and extracting pred_files
INFO:root:Downloading and extracting gold_files
INFO:root:Pred file: results/goodTokens_by_document_filtered.results.txt
INFO:root:Test file: filtering_files/goodTokens_by_document_filtered.test.txt
INFO:root:Pred file: results/allLemmas_by_label_filtered.results.txt
INFO:root:Test file: filtering_files/allLemmas_by_label_filtered.test.txt
INFO:root:Pred file: results/allLemmas_by_document_filtered.results.txt
INFO:root:Test file: filtering_files/allLemmas_by_document_filtered.test.txt
INFO:root:Pred file: results/allTokens_by_label_filtered.results.txt
INFO:root:Test file: filtering_files/allTokens_by_label_filtered.test.txt
INFO:root:Pred file: results/allTokens_unfiltered.results.txt
INFO:root:Test file: filterin

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
document-classification,...f76118,0,Mar 29 13:38:56,completed,evaluation-evaluate,kind=jobowner=ericamlrun/client_version=1.2.1host=evaluation-evaluate-8jc8b,pred_filesgold_files,show_cm=True,"goodTokens_by_document_filtered_macro=0.9157894736842106goodTokens_by_document_filtered_micro=0.9473684210526315goodTokens_by_document_filtered_weighted=0.9434903047091412goodTokens_by_document_filtered_confusion_matrix={}allLemmas_by_label_filtered_macro=0.9111111111111111allLemmas_by_label_filtered_micro=0.9473684210526315allLemmas_by_label_filtered_weighted=0.9438596491228071allLemmas_by_label_filtered_confusion_matrix={}allLemmas_by_document_filtered_macro=0.9157894736842106allLemmas_by_document_filtered_micro=0.9473684210526315allLemmas_by_document_filtered_weighted=0.9434903047091412allLemmas_by_document_filtered_confusion_matrix={}allTokens_by_label_filtered_macro=0.7916666666666666allTokens_by_label_filtered_micro=0.8947368421052632allTokens_by_label_filtered_weighted=0.875allTokens_by_label_filtered_confusion_matrix={'__label__A0330': {'Total': 3, 'Errors': 2, 'Ratio': 0.6666666666666666, 'Errors_matrix': {'__label__A1810': 
2}}}allTokens_unfiltered_macro=0.9157894736842106allTokens_unfiltered_micro=0.9473684210526315allTokens_unfiltered_weighted=0.9434903047091412allTokens_unfiltered_confusion_matrix={}goodTokens_unfiltered_macro=0.9111111111111111goodTokens_unfiltered_micro=0.9473684210526315goodTokens_unfiltered_weighted=0.9438596491228071goodTokens_unfiltered_confusion_matrix={}allTokens_by_document_filtered_macro=0.7935672514619884allTokens_by_document_filtered_micro=0.8947368421052632allTokens_by_document_filtered_weighted=0.8715604801477377allTokens_by_document_filtered_confusion_matrix={}allLemmas_unfiltered_macro=0.9111111111111111allLemmas_unfiltered_micro=0.9473684210526315allLemmas_unfiltered_weighted=0.9438596491228071allLemmas_unfiltered_confusion_matrix={}goodTokens_by_label_filtered_macro=0.8055555555555557goodTokens_by_label_filtered_micro=0.8421052631578947goodTokens_by_label_filtered_weighted=0.8421052631578947goodTokens_by_label_filtered_confusion_matrix={}",





> 2023-03-29 15:38:57,825 [info] run executed, status=completed


In [30]:
# Display the results of the evaluation run (the saved output lists macro, micro
# and weighted scores plus a confusion matrix for each model).
evaluation_run.outputs

{'goodTokens_by_document_filtered_macro': 0.9157894736842106,
 'goodTokens_by_document_filtered_micro': 0.9473684210526315,
 'goodTokens_by_document_filtered_weighted': 0.9434903047091412,
 'goodTokens_by_document_filtered_confusion_matrix': {},
 'allLemmas_by_label_filtered_macro': 0.9111111111111111,
 'allLemmas_by_label_filtered_micro': 0.9473684210526315,
 'allLemmas_by_label_filtered_weighted': 0.9438596491228071,
 'allLemmas_by_label_filtered_confusion_matrix': {},
 'allLemmas_by_document_filtered_macro': 0.9157894736842106,
 'allLemmas_by_document_filtered_micro': 0.9473684210526315,
 'allLemmas_by_document_filtered_weighted': 0.9434903047091412,
 'allLemmas_by_document_filtered_confusion_matrix': {},
 'allTokens_by_label_filtered_macro': 0.7916666666666666,
 'allTokens_by_label_filtered_micro': 0.8947368421052632,
 'allTokens_by_label_filtered_weighted': 0.875,
 'allTokens_by_label_filtered_confusion_matrix': {'__label__A0330': {'Total': 3,
   'Errors': 2,
   'Ratio': 0.6666666

### Define and run a pipeline

In [19]:
# Register the "classification" workflow on the project: handler
# classification_pipeline in classification_pipeline.py, run with the kfp engine.
project.set_workflow(
    name="classification",
    workflow_path="classification_pipeline.py",
    handler="classification_pipeline",
    engine="kfp",
)

In [20]:
# Save the project after registering the workflow.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fa3b2b95520>

In [31]:
# Submit the "classification" workflow and watch it until it finishes.
# Optional workflow arguments are kept below for reference.
run_id = project.run(
    watch=True,
    name="classification",
    # arguments={
    #     "data_format": "parse_ipzs",
    #     "bucket_name": "ipzs", "idPrefix": "ipzs-", "limit": 10, "max_documents": 100,
    #     "tint_url": None,
    #     "testRatio": 0.2, "devRatio": 0.2
    # },
)

> 2023-03-29 15:39:52,119 [info] submitted pipeline document-classification-classification 2023-03-29 13-39-51 id=2bda333c-0020-44bd-a438-84fc38bbbc43
> 2023-03-29 15:39:52,121 [info] Pipeline run id=2bda333c-0020-44bd-a438-84fc38bbbc43, check UI for progress


> 2023-03-29 15:39:52,224 [info] started run workflow document-classification-classification with run id = '2bda333c-0020-44bd-a438-84fc38bbbc43' by kfp engine
> 2023-03-29 15:39:52,224 [info] waiting for pipeline run completion
> 2023-03-29 15:50:36,650 [error] error cannot get pipeline: HTTPConnectionPool(host='127.0.0.1', port=30070): Max retries exceeded with url: /api/v1/projects/document-classification/pipelines/2bda333c-0020-44bd-a438-84fc38bbbc43?namespace=mlrun&format=summary (Caused by ReadTimeoutError("HTTPConnectionPool(host='127.0.0.1', port=30070): Read timed out. (read timeout=10)"))
> 2023-03-29 15:51:32,761 [error] error cannot get pipeline: HTTPConnectionPool(host='127.0.0.1', port=30070): Max retries exceeded with url: /api/v1/projects/document-classification/pipelines/2bda333c-0020-44bd-a438-84fc38bbbc43?namespace=mlrun&format=summary (Caused by ReadTimeoutError("HTTPConnectionPool(host='127.0.0.1', port=30070): Read timed out. (read timeout=10)"))
> 2023-03-29 15:5

KeyboardInterrupt: 

### Deploy the models

In [36]:
# Create the "model-server" serving function for the document-classification project.
serving_fn = mlrun.new_function(
    "model-server",
    project="document-classification",
    kind="serving",
    image="ertomaselli/classification-tqdm-stanza-fasttext:latest",
)

# Model to serve: a single local model file is used for testing; the trained
# artifact can be used instead by switching to the commented line.
#model_path = training_run.outputs["allTokens_unfiltered_model"]
model_path = "./allTokens_unfiltered_model.bin"

# Use a router topology and attach the model, served by model_serving.ClassifierModel.
graph = serving_fn.set_topology("router")
serving_fn.add_model(
    "allTokens_unfiltered_model",
    class_name="model_serving.ClassifierModel",
    model_path=model_path,
)

# Register the serving function on the project and save the project.
project.set_function(serving_fn)
project.save()

<mlrun.projects.project.MlrunProject at 0x7f8e3c9f4490>

In [37]:
# Test the function locally: build a mock server from the serving function
# (the saved output shows the model being loaded at this point).
server = serving_fn.to_mock_server()



INFO:root:extra_data: {}


> 2023-03-30 11:45:03,672 [info] model allTokens_unfiltered_model was loaded
> 2023-03-30 11:45:03,672 [info] Loaded ['allTokens_unfiltered_model']


In [38]:
# Inputs for a local inference request: a CSV file path and the text to classify.
# NOTE(review): csv_path is an absolute path on the author's machine - adjust before running.
csv_path = "/Users/erica/document-classification/input-folder/atti_materie_SG_nov2021.csv"
text = "Norme in materia tributaria, di previdenza, di assunzioni nella pubblica amministrazione ed altre disposizioni urgenti."

# Send the request to the mock server's infer endpoint for the model.
server.test(
    "/v2/models/allTokens_unfiltered_model/infer",
    body={"inputs": [csv_path, text]},
)

2023-03-30 11:45:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 32.7MB/s]                    
2023-03-30 11:45:35 INFO: Loading these models for language: it (Italian):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| pos       | combined |
| lemma     | combined |

INFO:stanza:Loading these models for language: it (Italian):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| pos       | combined |
| lemma     | combined |

20

{'id': '055290e59c0841e6b9a84fdfe8175bc6',
 'model_name': 'allTokens_unfiltered_model',
 'outputs': ['{"topics": {"A1810 - CONVENZIONI E TRATTATI INTERNAZIONALI": 0.464219868183136, "A0300 - AMBIENTE (TUTELA DELL\')": 0.4267939329147339, "A3840 - ISTRUZIONE PUBBLICA": 0.06263402849435806, "A1688 - CONTABILITA\' E BILANCIO": 0.020770074799656868, "A2520 - ECONOMIA E FINANZA": 0.01584814302623272}, "words": [{"tokens": [{"index": 1, "originalText": "Norme", "word": "Norme", "featuresText": "_", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "isMultiwordToken": false, "isMultiwordFirstToken": false, "pos": "S", "ud_pos": "NOUN", "lemma": "norma"}, {"index": 2, "originalText": "in", "word": "in", "featuresText": "_", "characterOffsetBegin": 6, "characterOffsetEnd": 8, "isMultiwordToken": false, "isMultiwordFirstToken": false, "pos": "E", "ud_pos": "ADP", "lemma": "in"}, {"index": 3, "originalText": "materia", "word": "materia", "featuresText": "_", "characterOffsetBegin": 9, "characte

In [42]:
# Deploy the serving function remotely (alternative call kept commented out).
# NOTE(review): the saved output shows this deploy failing with an
# "UNAUTHORIZED: authentication required" error while checking push permissions
# for the processor image - presumably registry credentials must be configured
# before this cell can succeed; confirm with the cluster setup.
#serving_fn.deploy()
mlrun.deploy_function(serving_fn)

> 2023-03-30 13:40:58,559 [info] Starting remote function deploy
2023-03-30 11:40:58  (info) Deploying function
2023-03-30 11:40:58  (info) Building
2023-03-30 11:40:58  (info) Staging files and preparing base images
2023-03-30 11:40:58  (info) Building processor image
Failed to deploy. Details:

Error - Job failed. Job logs:
error checking push permissions -- make sure you entered the correct tag name, and that you are authenticated correctly, and try again: checking push permission for "index.docker.io/smartcommunitylab/processor-document-classification-model-server:latest": POST https://index.docker.io/v2/smartcommunitylab/processor-document-classification-model-server/blobs/uploads/: UNAUTHORIZED: authentication required; [map[Action:pull Class: Name:smartcommunitylab/processor-document-classification-model-server Type:repository] map[Action:push Class: Name:smartcommunitylab/processor-document-classification-model-server Type:repository]]
    /nuclio/pkg/processor/build/builder.go

RunError: function model-server deployment failed