# Set-up

In [1]:
# we need boto3 to access AWS s3 buckets for storage
!pip install boto3
from IPython.display import clear_output
clear_output()

In [2]:
# this will install a library to help with reading/saving files from/to s3
!git clone https://github.com/aguille-vert/s3-operator

import sys
sys.path.append('/content/s3-operator')

import s3_operator as oper

Cloning into 's3-operator'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 76 (delta 37), reused 41 (delta 15), pack-reused 0[K
Receiving objects: 100% (76/76), 14.91 KiB | 7.46 MiB/s, done.
Resolving deltas: 100% (37/37), done.


In [3]:
import json
import boto3
import pandas as pd
import numpy as np
from io import BytesIO
from PIL import Image
import requests
from random import choice, choices
import re
from datetime import datetime, timedelta
from pprint import pprint
from time import sleep
from collections import defaultdict
import os

## clients and tokens

In [4]:
from google.colab import userdata


AWS_BRG_ACCESS_KEY = userdata.get('AWS_BRG_ACCESS_KEY')
AWS_BRG_SECRET_ACCESS_KEY = userdata.get('AWS_BRG_SECRET_ACCESS_KEY')


s3_client = boto3.client('s3',
            aws_access_key_id = AWS_BRG_ACCESS_KEY,
            aws_secret_access_key = AWS_BRG_SECRET_ACCESS_KEY)


# Classification task - analysis of inference results

Examples why title of a file on its own is not necessarily a good pointer to its type:

* '6047902 Contract R-1' is an invoice. The file name is misleading in this case. Both 7b and 8b models were correct in classifying this document as an invoice.
* 'A3016-Contract-63220-04.15.24' is an order. 7b and 8b models incorrectly classified it as Invoice and Contract.
* 'DNC-BIDEN FOR PRESIDENT KYW CONTRACT 605352 IN' is an invoice
* Biden for President 240402-240408 Est 068 Order_WBAY is a contract

## load inreference results

In [6]:
bucket_name = 'INSERT YOUR BUCKET NAME HERE'

In [7]:
MODEL = 'gemma-7b-it'

api = 'groq'

bucket = bucket_name
key = f"FCC/completions/file-classification/{api}/{MODEL}/completions_df.parquet"
try:
  df_7b = oper.pd_read_parquet(s3_client, bucket, key)
except Exception as e:
  print("failed to read file", e)
  df_7b = pd.DataFrame(columns=['key',
                                           'completion',
                                           'model',
                                           'time',
                                           'prompt'])
df_7b['file_name']=df_7b['key'].str.split('/').str[2]
df_7b['completion'] = df_7b['completion'].apply(lambda x:
                        list(json.loads(x).values())[0]
                        if isinstance(x,str) else x)
print(f"df_7b: {df_7b.shape}")

df_7b: (6080, 6)


In [8]:
MODEL = 'llama3-8b-8192'

api = 'groq'

bucket = bucket_name
key = f"FCC/completions/file-classification/{api}/{MODEL}/completions_df.parquet"
try:
  df_8b = oper.pd_read_parquet(s3_client, bucket, key)
except Exception as e:
  print("failed to read file", e)
  df_8b = pd.DataFrame(columns=['key',
                                           'completion',
                                           'model',
                                           'time',
                                           'prompt'])
df_8b['file_name']=df_8b['key'].str.split('/').str[2]
df_8b['completion'] = df_8b['completion'].apply(lambda x:
                        list(json.loads(x).values())[0]
                        if isinstance(x,str) else x)
print(f"df_8b: {df_8b.shape}")

df_8b: (2514, 6)


In [9]:
MODEL = 'llama3-70b-8192'

api = 'groq'

bucket = bucket = bucket_name
key = f"FCC/completions/file-classification/{api}/{MODEL}/completions_df.parquet"
try:
  df_70b = oper.pd_read_parquet(s3_client, bucket, key)
  print(df_70b.shape)

except Exception as e:
  print("failed to read file", e)

  df_70b = pd.DataFrame(columns=['key',
                                          'completion',
                                          'model',
                                          'time',
                                          'prompt'])
df_70b['file_name']=df_70b['key'].str.split('/').str[2]
df_70b['completion'] = df_70b['completion'].apply(lambda x:
                        list(json.loads(x).values())[0]
                        if isinstance(x,str) else x)

print(f"df_70b: {df_70b.shape}")

(1117, 5)
df_70b: (1117, 6)


## merge inference results

let's merge inference results of our 3 models and analyze in which instances the models coincided, in which - diverged.

In [10]:
df_7b['file_name'] = df_7b['key'].str.split('/').str[2]

df = df_7b[['key','file_name','completion']].merge(df_8b[['key',
                        'completion']],
                      on='key',
                      suffixes = ('_7b', '_8b'),
                      how='inner').drop_duplicates(ignore_index=True).dropna(how='any')
df.shape

(2142, 4)

In [11]:
df

Unnamed: 0,key,file_name,completion_7b,completion_8b
0,FCC/extracted_texts/POL LUC 2024 wk of April 8...,POL LUC 2024 wk of April 8,Contract,Receipt
1,FCC/extracted_texts/WMSN -BIDEN FOR PRESIDENT ...,WMSN -BIDEN FOR PRESIDENT Est11597 Period 4.1....,Contract,Invoice
2,FCC/extracted_texts/WMSN -BIDEN FOR PRESIDENT ...,WMSN -BIDEN FOR PRESIDENT Est11598 Period 4.1....,Contract,Invoice
3,FCC/extracted_texts/WMYD Biden est 11700 1285...,WMYD Biden est 11700 1285530,Contract,Order
4,FCC/extracted_texts/WMYD Biden est 11710 1286...,WMYD Biden est 11710 1286114,Contract,Order
...,...,...,...,...
2535,FCC/extracted_texts/Pol J Biden D PRE US_NAB-1...,Pol J Biden D PRE US_NAB-1128576,Order,Contract
2536,FCC/extracted_texts/6030864 Invoice/extracted_...,6030864 Invoice,Invoice,Invoice
2537,FCC/extracted_texts/A3016-NAB-62398-032924/ext...,A3016-NAB-62398-032924,Order,Contract
2538,FCC/extracted_texts/Biden for presiden Invoice...,Biden for presiden Invoice 74406-1,Invoice,Invoice


## common_df/divergent_df
we are going to split our df into 2 non-intersecting parts:
* common_df where the labels assigned by the 2 models are the same;
* divergent_df where the labels assigned by the 2 models are not the same

In [12]:
common_df = df.query("completion_7b == completion_8b").\
                    reset_index(drop=True).dropna(how='any')
divergent_df = df.query("completion_7b ! = completion_8b").\
                    reset_index(drop=True).dropna(how='any')

common_df['same'] = True
divergent_df['same'] = False

df = pd.concat([common_df,divergent_df], ignore_index=True)
df.shape
common_df.shape, divergent_df.shape, df.shape

((1482, 5), (660, 5), (2142, 5))

we have 660 files out of a total of 2142 in which labels assigned by one model are not the same as those assigned by the second model. This is an indication of the accuracy of the zero-shot models: it's quite low, even though our classification task seems to be quite a simple one.

## Analysis of inference results

let's compare classification inference results obtained with the following 3 models:
* 'gemma-7b-it'
* 'llama3-8b-8192'
* 'llama3-70b-8192'

### documents containing NAB in the title
NAB stands for National Association of Broadcasters. The Political Broadcast Agreement form, often referred to as the NAB Form PB-18, is used by broadcasters to document agreements with political candidates and committees regarding the purchase of airtime for political advertising. This form helps ensure compliance with Federal Communications Commission (FCC) regulations regarding political advertising.  

In our classification labels such documents shall be classified as 'Other'.

In [None]:
nab_common = common_df.query("file_name.str.contains('nab',case=False)")
nab_common.shape

As we can see from below, both models labeled NAB documents either as 'contract' or as 'invoice'.  
One of the goals of classification model fine-tuning is to traine models to classify such files as 'Other'

In [None]:
nab_common['completion_7b'].value_counts()

In [None]:
nab_divergent = df.query("completion_7b != completion_8b").\
                    reset_index(drop=True).dropna(how='any')
nab_divergent.shape

In [None]:
labels = ['Contract', 'Invoice', 'Order']
erorr_counts = nab_divergent.query('completion_7b==@labels').shape[0]
print(f"erorr_counts: {erorr_counts}")
error_pct = erorr_counts/len(nab_divergent)
print(f"error_pct: {error_pct*100:.2f} %")

In [None]:
labels = ['Contract', 'Invoice', 'Order']
erorr_counts = nab_divergent.query('completion_8b==@labels').shape[0]
print(f"erorr_counts: {erorr_counts}")
error_pct = erorr_counts/len(nab_divergent)
print(f"error_pct: {error_pct*100:.2f} %")

In [None]:
divergent_df.query("completion_70b != completion_7b").shape

In [None]:
divergent_df.query("completion_70b != completion_8b").shape

In [None]:
divergent_df = divergent_df.merge(df_70b[['key','completion']],on=['key'],
                   how='inner',
                   suffixes=('','_70b')).rename(columns={'completion':'completion_70b'})
divergent_df.shape

## Create and upload labeled dataset

Our analysis above has shown that zero-shot approach to PDF file classification with 7B and 8B models produces a lot of errors.  

We need to creat a dataset and fine-tune out models

let's create 4 DFs, each belonging to one of the 4 labels where our 2 models coincided in classification and where file_name contains text pointing to the appropriate class

In [None]:
contract_df = common_df.query("file_name.str.contains('contract',case=False) \
& completion_7b=='Contract' & completion_8b=='Contract'").copy()
contract_df['label'] = 'Contract'
contract_df.shape

(151, 6)

In [None]:
invoice_df = common_df.query("file_name.str.contains('inv',case=False) \
& completion_7b=='Invoice' & completion_8b=='Invoice'").copy()
invoice_df['label'] = 'Invoice'
invoice_df.shape

(309, 6)

In [None]:
order_df = common_df.query("file_name.str.contains('order',case=False) \
& completion_7b=='Order' & completion_8b=='Order'").copy()
order_df['label'] = 'Order'
order_df.shape

(91, 6)

In [None]:
other_df = df.query("file_name.str.contains('nab',case=False)").copy()
other_df['Label'] = 'Other'
other_df.shape

(364, 5)

In [None]:
ft_df = pd.concat([
                    contract_df,
                    invoice_df,
                    order_df,
                    other_df
                   ],
               ignore_index=True)
ft_df.shape

(915, 7)

In [None]:
ft_df.head()

Unnamed: 0,key,file_name,completion_7b,completion_8b,same
0,FCC/extracted_texts/Joe Biden for President co...,Joe Biden for President confirmation contract ...,Contract,Contract,True
1,FCC/extracted_texts/Biden-D-President Contract...,Biden-D-President Contract 5.7-5.13.24,Contract,Contract,True
2,FCC/extracted_texts/CONTRACT BIDEN FOR PRESIDE...,CONTRACT BIDEN FOR PRESIDENT WNGT EST 11713 05...,Contract,Contract,True
3,FCC/extracted_texts/CONTRACT BIDEN FOR PRESIDE...,CONTRACT BIDEN FOR PRESIDENT WRAL EST 11713 05...,Contract,Contract,True
4,FCC/extracted_texts/BIDEN FOR PRESIDENT KDKA C...,BIDEN FOR PRESIDENT KDKA CONTRACT 635489--1,Contract,Contract,True


In the following cell we are using a custom class ConstructedText. We do not include it here and leave it to the user to experiment with text representation of PDF files, which pertains to the Prompt engineering domain.  

This is a critical part of a successful fine-tuning/inference of LLMs.

we are:
* retrieving raw text files with coordinates which we obtained in notebooks/trump_biden_download_preprocess_store.ipynb,
* converting first page of each PDF file them to a text format with a proptietary class ConstructedText
* creating a list of tuples, each having 3 elements:
  - file_name
  - extracted_text: text extracted from the first page,
  - label: single-label class from ['contract','invoice','order','other']
* converting list of tuples into pandas DF
* upload DF as parquet file to s3 bucket

In [None]:
page_num='0'
max_symbols = 200
tolerance = 3
collector = []
for row in ft_df.itertuples():
  try:
    label = row.label
    key = row.key
    file_name = row.file_name
    extracted_words = json.loads(s3_client.get_object(Bucket=bucket,
                                                      Key=key)['Body'].read())
    extracted_page = {page_num: extracted_words[page_num]}
    text_constructor = ConstructedText(
                                        extracted_page,
                                        tolerance,
                                        max_symbols)
    constructed_text = text_constructor.constructed_text
    collector.append((file_name, constructed_text,label))
    if row[0]%10==0:
      print(row[0])
  except:
    print(f"{row[0]} error: {key}")
  # break
len(collector)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
557 error: FCC/extracted_texts/265668 - Biden For President - KTVW - NAB/extracted_text.json
560
570
580
590
600
610
613 error: FCC/extracted_texts/Biden for President-NAB-WI-Est 11503-BFP6024003H/extracted_text.json
620
625 error: FCC/extracted_texts/NAB PA PRIMARY BIDEN 60S/extracted_text.json
630
640
649 error: FCC/extracted_texts/NAB PA PRIMARY BIDEN 11602/extracted_text.json
650 error: FCC/extracted_texts/NAB PA PRIMARY BIDEN 11603/extracted_text.json
651 error: FCC/extracted_texts/263088 Biden For President KTVW NAB REV1/extracted_text.json
660
667 error: FCC/extracted_texts/NAB WMSN - Biden for President Est 11632 - 4.16.24/extracted_text.json
668 error: FCC/extracted_texts/NAB WMSN - Joe Biden for President Est 11631 - 4.16.24/extracted_text.json
670
680
687 error: FCC

894

In [None]:
completions_df = pd.DataFrame(collector,
                              columns=['file_name',
                                       'constructed_text',
                                       'label'])
completions_df.shape

(894, 3)

let's store the dataset in s3 bucket. We will use it for fine-tuning.

In [None]:
bucket = bucket_name
key = 'datasets/FCC/completions_4_class_900K_ds.parquet'
buffer = BytesIO()
completions_df.to_parquet(buffer,
                          index=False)
metadata = {'description': 'notebook inference_analysis_dataset'}
s3_client.put_object(Bucket=bucket,
                     Key=key,
                     Body=buffer.getvalue(),
                     Metadata = metadata)

{'ResponseMetadata': {'RequestId': 'ZGDFXZG828NJ28K5',
  'HostId': 'el7bB/Ftplhy9qncDrOyfqd4Uk2LsdAemMPesu/9FAUB0zIo26pnUM2yu+qWd0mE8vN/qpAbyKfhV169NP0uF+lNe2TeHflENEd3Sm0VuD8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'el7bB/Ftplhy9qncDrOyfqd4Uk2LsdAemMPesu/9FAUB0zIo26pnUM2yu+qWd0mE8vN/qpAbyKfhV169NP0uF+lNe2TeHflENEd3Sm0VuD8=',
   'x-amz-request-id': 'ZGDFXZG828NJ28K5',
   'date': 'Tue, 28 May 2024 10:35:08 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"9b05904bdccee2392823a32873caed4c"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"9b05904bdccee2392823a32873caed4c"',
 'ServerSideEncryption': 'AES256'}

# validation dataset

In [13]:
validation_df = df_70b.merge(df_8b[['file_name',
                    'completion']],on='file_name',
             suffixes=('','_8b')).merge(df_7b[['file_name',
                                               'completion']],
                                        on='file_name',
                                        suffixes=('_70b','_7b'))
validation_df.shape

(1397, 8)

In [23]:
validation_df['prompt'] = validation_df.prompt.str.split("### Document text starts here:").str[1].\
str.split("Document text ends here").str[0]

In [24]:
validation_df

Unnamed: 0,key,completion_70b,model,time,prompt,file_name,completion_8b,completion_7b
0,FCC/extracted_texts/POL LUC 2024 wk of April 8...,Invoice,llama3-70b-8192,7.625960,4/11/24\n ...,POL LUC 2024 wk of April 8,Receipt,Contract
1,FCC/extracted_texts/WMSN -BIDEN FOR PRESIDENT ...,Invoice,llama3-70b-8192,4.937490,...,WMSN -BIDEN FOR PRESIDENT Est11597 Period 4.1....,Invoice,Contract
2,FCC/extracted_texts/WMSN -BIDEN FOR PRESIDENT ...,Invoice,llama3-70b-8192,7.206039,...,WMSN -BIDEN FOR PRESIDENT Est11598 Period 4.1....,Invoice,Contract
3,FCC/extracted_texts/WMYD Biden est 11700 1285...,Order,llama3-70b-8192,13.601193,...,WMYD Biden est 11700 1285530,Order,Contract
4,FCC/extracted_texts/WMYD Biden est 11710 1286...,Order,llama3-70b-8192,9.085162,...,WMYD Biden est 11710 1286114,Order,Contract
...,...,...,...,...,...,...,...,...
1392,FCC/extracted_texts/NAB WMSN - BIDEN FOR PRESI...,,llama3-70b-8192,0.481690,CANDIDATE ADVERTISEME...,NAB WMSN - BIDEN FOR PRESIDENT EST 11597 4.9.2...,Contract,Order
1393,FCC/extracted_texts/Pol J Biden D PRE US_NABpg...,Contract,llama3-70b-8192,1.404025,CANDIDATE ADVERTI...,Pol J Biden D PRE US_NABpg1_1128595,Contract,Order
1394,FCC/extracted_texts/Pol J Biden D PRE US_NAB-1...,,llama3-70b-8192,0.476954,CANDIDATE ADVERTI...,Pol J Biden D PRE US_NAB-1128576,Contract,Order
1395,FCC/extracted_texts/A3016-NAB-62398-032924/ext...,Contract,llama3-70b-8192,2.232844,CANDIDATE ADVERTISEME...,A3016-NAB-62398-032924,Contract,Order


In [25]:
bucket = bucket_name
key = 'datasets/FCC/test_ds_1400K.parquet'
buffer = BytesIO()
validation_df.to_parquet(buffer,
                          index=False)
metadata = {'description': 'notebook inference_analysis_dataset.ipynb'}
s3_client.put_object(Bucket=bucket,
                     Key=key,
                     Body=buffer.getvalue(),
                     Metadata = metadata)

{'ResponseMetadata': {'RequestId': 'TR5SR5B1M8NMQJD3',
  'HostId': 'CPTrRV06I2yLp1h42s4EFciBCg8QLbPUtVio5CcA7WYGOOkaDOF5heI0hJsSHjTYJrSSzcYqOus=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'CPTrRV06I2yLp1h42s4EFciBCg8QLbPUtVio5CcA7WYGOOkaDOF5heI0hJsSHjTYJrSSzcYqOus=',
   'x-amz-request-id': 'TR5SR5B1M8NMQJD3',
   'date': 'Thu, 30 May 2024 02:04:29 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"4c2345fd098df5a648e836be7de4cbd4"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"4c2345fd098df5a648e836be7de4cbd4"',
 'ServerSideEncryption': 'AES256'}