# **Google Colab File**

This notebook is responsible for executing all of the pipeline stages:


*   Data Ingestion
*   Data Preprocessing
*   Model Training
*   Model Evaluation



In [1]:
# Mount the user's Google Drive into the Colab VM at /content/drive so the
# project folder stored on Drive is reachable through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the project folder on Drive the working directory. Every later cell
# uses paths relative to it (requirements.txt, src/..., dvc.yaml).
import os
os.chdir("/content/drive/MyDrive/Document Tagging")

In [3]:
# Sanity check: list the working directory to confirm the project files are visible.
!ls

app.py		 documents		    LICENSE	 README.md	   src
artifacts	 Document_Tagging.egg-info  logs	 requirements.txt  static
config		 dvc.lock		    main.py	 script.py	   template.py
deployment.yaml  dvc.yaml		    notebook	 services.yaml	   templates
Dockerfile	 google_colab.ipynb	    params.yaml  setup.py


In [14]:
!pip install -r requirements.txt

Installing collected packages: Document-Tagging
  Attempting uninstall: Document-Tagging
    Found existing installation: Document-Tagging 1.0.0
    Uninstalling Document-Tagging-1.0.0:
      Successfully uninstalled Document-Tagging-1.0.0
  Running setup.py develop for Document-Tagging
Successfully installed Document-Tagging-1.0.0


In [6]:
from src.document_tagging.pipeline.data_ingestion import DataIngestionTrainingPipeline
from src.document_tagging.pipeline.data_preprocessing import DataPreprocessingTrainingPipeline
from src.document_tagging.pipeline.model_training import ModelTrainingPipeline
from src.document_tagging.pipeline.model_evaluation import ModelEvaluationPipeline

In [7]:
# Stage 01: Data Ingestion
# Runs the stage script in a separate Python process. Per the log output below,
# it downloads two CSV datasets from S3, concatenates them, and saves the
# result under artifacts/data.
!python "src/document_tagging/pipeline/data_ingestion.py"

INFO:root:2023-11-04	13:04:15	>>>>>>>>>>>>>>> Stage: Data Ingestion started <<<<<<<<<<<<<<<
INFO:root:2023-11-04	13:04:16	configure the s3 details
INFO:root:2023-11-04	13:04:16	download the dataset 1 feom s3 bucket, wiki_data.csv
INFO:root:2023-11-04	13:04:17	download the dataset 2 feom s3 bucket, auto_tagged_data.csv
INFO:root:2023-11-04	13:04:17	concat the two dataframe together
INFO:root:2023-11-04	13:04:17	clean and then create the directory artifacts/data
INFO:root:2023-11-04	13:04:17	save the data to local directory, artifacts/data
INFO:root:2023-11-04	13:04:17	>>>>>>>>>>>>>>> Stage: Data Ingestion completed <<<<<<<<<<<<<<< 




In [8]:
# Stage 02: Data Preprocessing
# Runs the stage script in a separate Python process. Per the log output below,
# it forward-fills and de-duplicates the data, separates X and Y, applies text
# preprocessing, writes artifacts/data/tag2id.json and artifacts/data/labels.json,
# and then applies the word representation to X.
!python "src/document_tagging/pipeline/data_preprocessing.py"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:root:2023-11-04	13:04:39	>>>>>>>>>>>>>>> Stage: Data Preprocessing started <<<<<<<<<<<<<<<
INFO:root:2023-11-04	13:04:39	apply the forward-fill method and remove the duplicates
INFO:root:2023-11-04	13:04:39	separate the X and Y data
INFO:root:2023-11-04	13:05:14	apply the text-preprocessing steps
INFO:root:2023-11-04	13:05:30	apply the text-preprocessing steps
INFO:root:2023-11-04	13:05:30	get unique id for each tag and save into artifacts/data/tag2id.json
INFO:root:2023-11-04	13:05:30	save the number of labels into artifacts/data/labels.json
INFO:root:2023-11-04	13:05:30	apply the word representation on X data
Downloading (…)okenizer_config.json: 100% 28.0/28.0 [00:00<00:00, 117kB/s]
Downloading (…)lve/main/config.json: 100% 570/570 [00:00<00:00, 2.92MB/

In [9]:
# Stage 03: Model Training
# Runs the stage script in a separate Python process. The output below shows it
# loading BertForTokenClassification from the bert-base-uncased checkpoint.
!python "src/document_tagging/pipeline/model_training.py"

2023-11-04 13:05:57.758807: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-04 13:05:57.758873: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-04 13:05:57.758919: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO:root:2023-11-04	13:06:00	>>>>>>>>>>>>>>> Stage: Model Training started <<<<<<<<<<<<<<<
Downloading builder script: 6.33kB [00:00, 11.2MB/s]       
Downloading model.safetensors: 100% 440M/440M [00:02<00:00, 191MB/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['c

In [10]:
import pandas as pd

# Per-epoch evaluation metrics (one dict per epoch, epochs 1-7), tabulated for
# easier comparison.
# NOTE(review): these values are hard-coded here, presumably copied by hand
# from the Stage 03 training output; confirm against the run's logs.
scores = [
  {'eval_loss': 5.139120578765869, 'eval_precision': 0.13027972807310823, 'eval_recall': 0.10058509722939253, 'eval_f1': 0.11352269968438942, 'eval_accuracy': 0.19696452933151432, 'eval_runtime': 107.2048, 'eval_samples_per_second': 55.846, 'eval_steps_per_second': 6.987, 'epoch': 1.0},
  {'eval_loss': 4.835653781890869, 'eval_precision': 0.1608368386096967, 'eval_recall': 0.1250215109275512, 'eval_f1': 0.1406855151045701, 'eval_accuracy': 0.2431787175989086, 'eval_runtime': 47.6644, 'eval_samples_per_second': 125.607, 'eval_steps_per_second': 15.714, 'epoch': 2.0},
  {'eval_loss': 4.775213718414307, 'eval_precision': 0.1857707509881423, 'eval_recall': 0.14154190328686972, 'eval_f1': 0.16066806661132002, 'eval_accuracy': 0.2656889495225102, 'eval_runtime': 46.4061, 'eval_samples_per_second': 129.013, 'eval_steps_per_second': 16.14, 'epoch': 3.0},
  {'eval_loss': 4.753197193145752, 'eval_precision': 0.19628469113697403, 'eval_recall': 0.15092066769919119, 'eval_f1': 0.17063916723416675, 'eval_accuracy': 0.271231241473397, 'eval_runtime': 45.1287, 'eval_samples_per_second': 132.665, 'eval_steps_per_second': 16.597, 'epoch': 4.0},
  {'eval_loss': 4.790319919586182, 'eval_precision': 0.19244218838127466, 'eval_recall': 0.14679056960936154, 'eval_f1': 0.16654463806316197, 'eval_accuracy': 0.2728512960436562, 'eval_runtime': 50.3132, 'eval_samples_per_second': 118.995, 'eval_steps_per_second': 14.887, 'epoch': 5.0},
  {'eval_loss': 4.822341442108154, 'eval_precision': 0.20006801178871003, 'eval_recall': 0.1518671485114438, 'eval_f1': 0.17266679710428487, 'eval_accuracy': 0.2764324693042292, 'eval_runtime': 46.9498, 'eval_samples_per_second': 127.519, 'eval_steps_per_second': 15.953, 'epoch': 6.0},
  {'eval_loss': 4.838274002075195, 'eval_precision': 0.19851284362325372, 'eval_recall': 0.15160901738082946, 'eval_f1': 0.17191921163040297, 'eval_accuracy': 0.27413028649386084, 'eval_runtime': 46.8711, 'eval_samples_per_second': 127.733, 'eval_steps_per_second': 15.98, 'epoch': 7.0}

]

# Last expression of the cell: rendered as a table with one row per epoch.
pd.DataFrame(scores)

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,5.139121,0.13028,0.100585,0.113523,0.196965,107.2048,55.846,6.987,1.0
1,4.835654,0.160837,0.125022,0.140686,0.243179,47.6644,125.607,15.714,2.0
2,4.775214,0.185771,0.141542,0.160668,0.265689,46.4061,129.013,16.14,3.0
3,4.753197,0.196285,0.150921,0.170639,0.271231,45.1287,132.665,16.597,4.0
4,4.79032,0.192442,0.146791,0.166545,0.272851,50.3132,118.995,14.887,5.0
5,4.822341,0.200068,0.151867,0.172667,0.276432,46.9498,127.519,15.953,6.0
6,4.838274,0.198513,0.151609,0.171919,0.27413,46.8711,127.733,15.98,7.0


In [None]:
import os
from getpass import getpass

# Configure MLflow to log to this project's DagsHub tracking server.
# The values are written to os.environ, which subprocesses started from the
# notebook (e.g. `!python ...`) inherit from the kernel process.
#
# The previous `!export VAR=...` lines were removed: each `!` command runs in
# its own short-lived subshell, so those exports never reached the kernel or
# any later command. They also pointed at a different repository
# (Kidney-Disease-Classification) than the URI set below.
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/dibyendubiswas1998/Document-Tagging.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "dibyendubiswas1998"

# Never store the credential in the notebook. Reuse a value that is already
# present in the environment; otherwise prompt for it without echoing.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("MLflow tracking password / DagsHub token: ")

In [18]:
# Stage 04: Model Evaluation
# Runs the stage script in a separate Python process.
# NOTE(review): presumably this stage reads the MLFLOW_TRACKING_* environment
# variables set in the cell above; confirm in model_evaluation.py.
!python "src/document_tagging/pipeline/model_evaluation.py"

2023-11-04 14:09:27.236606: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-04 14:09:27.236677: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-04 14:09:27.236717: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO:root:2023-11-04	14:09:32	>>>>>>>>>>>>>>> Stage: Model Evaluation started <<<<<<<<<<<<<<<
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100% 749/749 [01:39<00:00,  3.87it/s]Train

In [19]:
# Initialise a Git repository in the project folder, then initialise DVC inside it.
!git init
!dvc init


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/Document Tagging/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m   

In [21]:
# Re-run the DVC pipeline. As the output below shows, stages that did not
# change are skipped and the remaining stage scripts are executed.
!dvc repro

Stage 'data_ingestion' didn't change, skipping
Running stage 'data_preprocessing':
> python src/document_tagging/pipeline/data_preprocessing.py
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:root:2023-11-04	14:18:20	>>>>>>>>>>>>>>> Stage: Data Preprocessing started <<<<<<<<<<<<<<<
INFO:root:2023-11-04	14:18:20	apply the forward-fill method and remove the duplicates
INFO:root:2023-11-04	14:18:20	separate the X and Y data
INFO:root:2023-11-04	14:18:53	apply the text-preprocessing steps
INFO:root:2023-11-04	14:19:09	apply the text-preprocessing steps
INFO:root:2023-11-04	14:19:09	get unique id for each tag and save into artifacts/data/tag2id.json
INFO:root:2023-11-04	14:19:09	save the number of labels into artifacts/data/labels.json
INFO:root:2023-11-04	14:19:09	apply the word representation on X data
IN

In [22]:
# Per-epoch evaluation metrics (one dict per epoch, epochs 1-7) for a second run,
# tabulated for easier comparison. Rebinds `scores`, replacing the earlier list.
# Relies on `pd` having been imported by an earlier cell.
# NOTE(review): these values are hard-coded here, presumably copied by hand
# from the training output of the `dvc repro` run; confirm against its logs.
scores = [
    {'eval_loss': 5.1000847816467285, 'eval_precision': 0.12864963503649635, 'eval_recall': 0.09705730511099639, 'eval_f1': 0.11064247179990191, 'eval_accuracy': 0.19687926330150068, 'eval_runtime': 106.5551, 'eval_samples_per_second': 56.187, 'eval_steps_per_second': 7.029, 'epoch': 1.0},
    {'eval_loss': 4.824197769165039, 'eval_precision': 0.16252091466815394, 'eval_recall': 0.12536568576837034, 'eval_f1': 0.1415456355952786, 'eval_accuracy': 0.24241132332878582, 'eval_runtime': 46.2635, 'eval_samples_per_second': 129.411, 'eval_steps_per_second': 16.19, 'epoch': 2.0},
    {'eval_loss': 4.771283149719238, 'eval_precision': 0.186075378018506, 'eval_recall': 0.14188607812768886, 'eval_f1': 0.16100371021284904, 'eval_accuracy': 0.2670532060027285, 'eval_runtime': 45.0428, 'eval_samples_per_second': 132.918, 'eval_steps_per_second': 16.629, 'epoch': 3.0},
    {'eval_loss': 4.740667819976807, 'eval_precision': 0.19451901565995525, 'eval_recall': 0.14963001204611942, 'eval_f1': 0.16914697013909152, 'eval_accuracy': 0.2720839017735334, 'eval_runtime': 46.9612, 'eval_samples_per_second': 127.488, 'eval_steps_per_second': 15.949, 'epoch': 4.0},
    {'eval_loss': 4.784672737121582, 'eval_precision': 0.19415073115860518, 'eval_recall': 0.14851144381345724, 'eval_f1': 0.16829173166926678, 'eval_accuracy': 0.27336289222373805, 'eval_runtime': 47.3055, 'eval_samples_per_second': 126.56, 'eval_steps_per_second': 15.833, 'epoch': 5.0},
    {'eval_loss': 4.802765846252441, 'eval_precision': 0.18886116152450091, 'eval_recall': 0.14326277749096542, 'eval_f1': 0.1629317937175849, 'eval_accuracy': 0.2708901773533424, 'eval_runtime': 47.6297, 'eval_samples_per_second': 125.699, 'eval_steps_per_second': 15.725, 'epoch': 6.0},
    {'eval_loss': 4.823117256164551, 'eval_precision': 0.19398293668612482, 'eval_recall': 0.1486835312338668, 'eval_f1': 0.16833901607403798, 'eval_accuracy': 0.273618690313779, 'eval_runtime': 45.0429, 'eval_samples_per_second': 132.918, 'eval_steps_per_second': 16.629, 'epoch': 7.0}
]

# Last expression of the cell: rendered as a table with one row per epoch.
pd.DataFrame(scores)

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,5.100085,0.12865,0.097057,0.110642,0.196879,106.5551,56.187,7.029,1.0
1,4.824198,0.162521,0.125366,0.141546,0.242411,46.2635,129.411,16.19,2.0
2,4.771283,0.186075,0.141886,0.161004,0.267053,45.0428,132.918,16.629,3.0
3,4.740668,0.194519,0.14963,0.169147,0.272084,46.9612,127.488,15.949,4.0
4,4.784673,0.194151,0.148511,0.168292,0.273363,47.3055,126.56,15.833,5.0
5,4.802766,0.188861,0.143263,0.162932,0.27089,47.6297,125.699,15.725,6.0
6,4.823117,0.193983,0.148684,0.168339,0.273619,45.0429,132.918,16.629,7.0
