In [9]:
import pandas as pd
import re
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from openai import OpenAI
import json

In [3]:
df = pd.read_csv("labeled_data.csv")
label_filter = ["Restatements", "Errors", "Part III", "Exhibit 101", "Signatures"]
print(label_filter)

['Restatements', 'Errors', 'Part III', 'Exhibit 101', 'Signatures']


In [4]:
def convert_to_list(entry):
    try:
        return ast.literal_eval(entry)
    except:
        pass

df['labels'] = df.labels.apply(convert_to_list)
df['rule_based_label'] = df.rule_based_label.apply(convert_to_list)

In [5]:
def is_in_filtered(labels):
    for label in labels:
        if label not in label_filter:
            return False
    return True

print(df.shape)
df = df[df.labels.apply(lambda x: is_in_filtered(x))]
print(df.shape)

(1192, 5)
(638, 5)


In [6]:
df_sampled = df.sample(20, random_state=42)
df_sampled.head()

Unnamed: 0,file_name,expl_note_gpt,labels,expl_note,rule_based_label
536,q_0001193125-12-359187.txt,The paragraph that talks about the purpose of ...,[Exhibit 101],EXPLANATORY NOTE This Amendment No. 1 to the ...,[Exhibit 101]
501,k_0001214659-15-003301.txt,The purpose of this Form 10-K/A is to provide ...,[Part III],EXPLANATORY NOTE\n\n Wright Investors' Service...,[Part III]
36,k_0001144204-09-022929.txt,The purpose of this Amendment No. 1 on Form 10...,[Part III],EXPLANATORY\nNOTE\n\nThis\nAmendment No. 1 on ...,[Part III]
884,k_0001548123-13-000255.txt,The purpose of this amended Form 10-K is to re...,[Restatements],EXPLANATORY NOTE During the preparation of th...,[Restatements]
902,q_0001193125-09-016547.txt,The Form 10-Q/A is being filed solely to corre...,[Errors],Explanatory Note The Form 10-Q/A is being fil...,"[Restatements, Errors]"


In [7]:
# Step 2: Encode `rule_based_label` and `labels` into arrays of size 11
mlb = MultiLabelBinarizer()
rule_based_label_encoded = mlb.fit_transform(df_sampled['labels'])
labels_encoded = mlb.transform(df_sampled['rule_based_label'])
print(mlb.classes_)

['Errors' 'Exhibit 101' 'Part III' 'Restatements']




In [8]:
print('Accuracy:', accuracy_score(labels_encoded, rule_based_label_encoded))
print('Classification Report:')
print(classification_report(labels_encoded, rule_based_label_encoded, target_names=mlb.classes_))

Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

      Errors       1.00      0.57      0.73         7
 Exhibit 101       0.75      1.00      0.86         3
    Part III       1.00      1.00      1.00         4
Restatements       0.78      0.78      0.78         9

   micro avg       0.86      0.78      0.82        23
   macro avg       0.88      0.84      0.84        23
weighted avg       0.88      0.78      0.81        23
 samples avg       0.85      0.75      0.78        23



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [65]:
client = OpenAI() # set export OPENAI_API_KEY='your-api-key-here' on terminal before opening jupyter lab

def gpt_annotation(row, model="gpt-4o", temperature=0):
    x = row['expl_note_gpt']
    response = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[
         {
          "role": "system",
          "content": [
            {
              "type": "text",
              "text": "You are an expert in SEC filings. Your task is to tag the given SEC note based on its content. Each note should be tagged with one or more appropriate tags from the provided list. Most notes will have a single tag, but there may be cases where two or three tags are necessary. More than three tags will be rare.\n\nTags should be strictly chosen from the following list given in backticks:\n```\nRestatements\nErrors\nPart III\nExhibit 101\nSignatures\n```\n\nHere are the definitions of each tag:\n\nRestatements:\nan amendment used to restate or for restatement. Example: filing an Amendment No. 1 on Form 10-Q/A to amend and restate our unaudited condensed consolidated financial statements\n\nErrors:\nan amendment to correct an error or miscalculation. Example: This amendment is being filed to correct the Company's authorized common stock, a calculation error in Footnote 8\\n \n\nPart III:\nAmendments that are done to just provide information in Part III of the original filing. Here is an example: \"being filed to furnish the information required by Items 10, 11, 12, 13 and 14 of Part III of the Original Filing\"\n\nExhibit 101:\namendment that are done to provide Exhibit 101. Example: The purpose of this Amendment No. 1 to the Old Republic International Corporation (\"Old Republic\") Quarterly Report on Form 10-Q for the quarterly period ended June 30, 2010, filed with the Securities and Exchange Commission on August 9, 2010, is to furnish Exhibit 101 to the Form 10-Q\n\nSignatures:\nAmendments that are filed because the original didn't have the signatures of the executives. Here is an example: RELM Wireless Corporation hereby amends its Quarterly Report on Form 10-Q for the quarterly period ended March 31, 2012, filed on May 9, 2012, solely for the purpose of including the conformed signatures to the Form 10-Q on page 16, which were inadvertently omitted from the original filing.\n\nYou also need to reason why you have picked a particular tag. The reason should justify the tag and mention which part of the note is relevant to the tag. \n\nOutput format is JSON:\n[\n    {\n        \"reason\": \"This is the reason why I choose tag 1\",\n        \"tag\": \"tag 1\"\n    },\n    //similarly other tags\n]\n\nReturn an empty list if none of the given tags are applicable to the note.\n"
            }
          ]
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": x
            }
          ]
        }
      ],
      temperature=temperature,
      max_tokens=4095,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )
    print('------')
    print(x)
    print(response.choices[0].message.content)
    json_resp = json.loads(response.choices[0].message.content)
    labels = []
    for item in json_resp:
        labels.append(item['tag'])
    print(labels)
    if labels != row['labels']:
        print(row['labels'])
        print(row['rule_based_label'])
        # print(row['expl_note'])
    print('------')
    return labels

# gpt_annotation(df_sampled.iloc[0, :])
    

In [76]:
df_sampled['labels_gpt3'] = df_sampled.apply(lambda x: gpt_annotation(x), axis=1)

------
The paragraph that talks about the purpose of the amendment/form in the given SEC note is:

"This Amendment No. 1 to the Quarterly Report on Form 10-Q (the "Form 10-Q") of Wireless Ronin Technologies, Inc. for the period ended June 30, 2012 is being filed solely to furnish Exhibit 101 to the Form 10-Q in accordance with Rule 405 of Regulation S-T. Exhibit 101 to the Form 10-Q provides the consolidated financial statements and related notes from the Form 10-Q formatted in XBRL (eXtensible Business Reporting Language). No other changes have been made to the Form 10-Q."
[
    {
        "reason": "The note specifically mentions that the purpose of the amendment is to furnish Exhibit 101 to the Form 10-Q, which is a clear indication for the Exhibit 101 tag.",
        "tag": "Exhibit 101"
    }
]
['Exhibit 101']
------
------
The purpose of this Form 10-K/A is to provide the information required to be disclosed in Part III, Items 10 through 14 and updates the information contained in 

In [66]:
df_sampled['labels_gpt4o'] = df_sampled.apply(lambda x: gpt_annotation(x, model='gpt-4o'), axis=1)

------
The paragraph that talks about the purpose of the amendment/form in the given SEC note is:

"This Amendment No. 1 to the Quarterly Report on Form 10-Q (the "Form 10-Q") of Wireless Ronin Technologies, Inc. for the period ended June 30, 2012 is being filed solely to furnish Exhibit 101 to the Form 10-Q in accordance with Rule 405 of Regulation S-T. Exhibit 101 to the Form 10-Q provides the consolidated financial statements and related notes from the Form 10-Q formatted in XBRL (eXtensible Business Reporting Language). No other changes have been made to the Form 10-Q."
[
    {
        "reason": "The note specifically mentions that the purpose of the amendment is to furnish Exhibit 101 to the Form 10-Q, which is a clear indication for tagging it as Exhibit 101.",
        "tag": "Exhibit 101"
    }
]
['Exhibit 101']
------
------
The purpose of this Form 10-K/A is to provide the information required to be disclosed in Part III, Items 10 through 14 and updates the information contain

In [78]:
df_sampled['labels_gpt4'] = df_sampled.apply(lambda x: gpt_annotation(x, model='gpt-4'), axis=1)

------
The paragraph that talks about the purpose of the amendment/form in the given SEC note is:

"This Amendment No. 1 to the Quarterly Report on Form 10-Q (the "Form 10-Q") of Wireless Ronin Technologies, Inc. for the period ended June 30, 2012 is being filed solely to furnish Exhibit 101 to the Form 10-Q in accordance with Rule 405 of Regulation S-T. Exhibit 101 to the Form 10-Q provides the consolidated financial statements and related notes from the Form 10-Q formatted in XBRL (eXtensible Business Reporting Language). No other changes have been made to the Form 10-Q."
[
    {
        "reason": "The note specifically mentions that the purpose of the amendment is to furnish Exhibit 101 to the Form 10-Q, which is a clear indication for the Exhibit 101 tag.",
        "tag": "Exhibit 101"
    }
]
['Exhibit 101']
------
------
The purpose of this Form 10-K/A is to provide the information required to be disclosed in Part III, Items 10 through 14 and updates the information contained in 

In [71]:
df_sampled

Unnamed: 0,file_name,expl_note_gpt,labels,expl_note,rule_based_label,labels_gpt,labels_gpt3,labels_gpt4o,labels_gpt4
536,q_0001193125-12-359187.txt,The paragraph that talks about the purpose of ...,[Exhibit 101],EXPLANATORY NOTE This Amendment No. 1 to the ...,[Exhibit 101],[Exhibit 101],[Exhibit 101],[Exhibit 101],[Exhibit 101]
501,k_0001214659-15-003301.txt,The purpose of this Form 10-K/A is to provide ...,[Part III],EXPLANATORY NOTE\n\n Wright Investors' Service...,[Part III],[Part III],[],"[Part III, Exhibit 101]",[]
36,k_0001144204-09-022929.txt,The purpose of this Amendment No. 1 on Form 10...,[Part III],EXPLANATORY\nNOTE\n\nThis\nAmendment No. 1 on ...,[Part III],[Part III],[Part III],[Part III],[Part III]
884,k_0001548123-13-000255.txt,The purpose of this amended Form 10-K is to re...,[Restatements],EXPLANATORY NOTE During the preparation of th...,[Restatements],[Restatements],[Restatements],[Restatements],[Restatements]
902,q_0001193125-09-016547.txt,The Form 10-Q/A is being filed solely to corre...,[Errors],Explanatory Note The Form 10-Q/A is being fil...,"[Restatements, Errors]","[Errors, Restatements]","[Errors, Restatements]","[Errors, Restatements]","[Errors, Restatements]"
35,k_0000070530-10-000086.txt,The purpose of this Amendment No. 1 is to add ...,"[Errors, Part III]",EXPLANATORY NOTE\n\nWe are filing this Amendme...,"[Errors, Part III]",[Part III],"[Part III, Errors]","[Part III, Errors]","[Part III, Errors]"
895,q_0001144204-08-051465.txt,The registrant is filing this Amendment No. 1 ...,[Restatements],Explanatory\n Note\n\nThe\n registra...,[Errors],[],[],[],[]
296,k_0001035704-05-000135.txt,The purpose of this Form 10-K/A is to correct ...,[Errors],EXPLANATORY NOTE\nSIGNATURES\nEXHIBIT INDEX\nC...,[Errors],[Errors],[Errors],[],[Errors]
522,q_0001213900-11-005101.txt,The sole purpose of this Form 10-Q/A is to fur...,[Exhibit 101],EXPLANATORY NOTE\n\n The sole purpose of this ...,[Exhibit 101],[],[],[],[]
822,q_0001144204-10-011777.txt,The purpose of this Amendment is to amend and ...,[Restatements],"EXPLANATORY\nNOTE\n\n EGPI\nFirecreek, Inc....",[Restatements],[],[],[],[]


In [79]:
labels_gpt_3_encoded = mlb.transform(df_sampled['labels_gpt3'])
labels_gpt_4o_encoded = mlb.transform(df_sampled['labels_gpt4o'])
labels_gpt_4_encoded = mlb.transform(df_sampled['labels_gpt4'])
labels_rule_based_encoded = mlb.transform(df_sampled['rule_based_label'])
labels_encoded = mlb.transform(df_sampled['labels'])



In [80]:
types = ['Rule Based', 'GPT-3.5', 'GPT-4o', 'GPT-4']
label4types = [labels_rule_based_encoded, labels_gpt_3_encoded, labels_gpt_4o_encoded, labels_gpt_4_encoded]
for i, type in enumerate(types):
    print(f"-----------{type}------------")
    print('Accuracy:', accuracy_score(labels_encoded, label4types[i]))
    print('Classification Report:')
    print(classification_report(labels_encoded, label4types[i], target_names=mlb.classes_))

-----------Rule Based------------
Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

      Errors       0.57      1.00      0.73         4
 Exhibit 101       1.00      0.75      0.86         4
    Part III       1.00      1.00      1.00         4
Restatements       0.78      0.78      0.78         9

   micro avg       0.78      0.86      0.82        21
   macro avg       0.84      0.88      0.84        21
weighted avg       0.82      0.86      0.83        21
 samples avg       0.75      0.85      0.78        21

-----------GPT-3.5------------
Accuracy: 0.45
Classification Report:
              precision    recall  f1-score   support

      Errors       0.75      0.75      0.75         4
 Exhibit 101       1.00      0.50      0.67         4
    Part III       1.00      0.75      0.86         4
Restatements       0.71      0.56      0.62         9

   micro avg       0.81      0.62      0.70        21
   macro avg       0.87      0.64      0.72 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [81]:
types = ['Rule Based', 'GPT-3.5', 'GPT-4o', 'GPT-4']
label4types = [labels_rule_based_encoded, labels_gpt_3_encoded, labels_gpt_4o_encoded, labels_gpt_4_encoded]
for i, type in enumerate(types):
    print(f"-----------{type}------------")
    print('Accuracy:', accuracy_score(labels_rule_based_encoded, label4types[i]))
    print('Classification Report:')
    print(classification_report(labels_rule_based_encoded, label4types[i], target_names=mlb.classes_))

-----------Rule Based------------
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

      Errors       1.00      1.00      1.00         7
 Exhibit 101       1.00      1.00      1.00         3
    Part III       1.00      1.00      1.00         4
Restatements       1.00      1.00      1.00         9

   micro avg       1.00      1.00      1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23
 samples avg       0.90      0.90      0.90        23

-----------GPT-3.5------------
Accuracy: 0.55
Classification Report:
              precision    recall  f1-score   support

      Errors       1.00      0.57      0.73         7
 Exhibit 101       0.50      0.33      0.40         3
    Part III       1.00      0.75      0.86         4
Restatements       1.00      0.78      0.88         9

   micro avg       0.94      0.65      0.77        23
   macro avg       0.88      0.61      0.71  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
