# Purpose
get the training data for an ML model for predicting the category for a given SEC note

# Import Libs

In [36]:
import pandas as pd
import re
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report


# Import data

In [37]:
df = pd.read_csv("../output/labeled_data.csv")

In [38]:
df1 = pd.read_csv("../output/full_data.csv")

In [39]:
df.head(2)

Unnamed: 0,file_name,expl_note_gpt,labels,expl_note,rule_based_label
0,k_0001193125-17-189495.txt,The principal purpose of this Amendment is to ...,"['Part III', 'Disclosure', 'Restatements']",EXPLANATORY NOTE \nThe registrant is filing th...,"['Restatements', 'Part III', 'Disclosure']"
1,q_0001294250-12-000007.txt,The sole purpose of this Amendment No. 1 to Ra...,['Exhibit 101'],EXPLANATORY NOTE\n\nThe sole purpose of this A...,['Exhibit 101']


In [40]:
def convert_to_list(entry):
    try:
        return ast.literal_eval(entry)
    except:
        pass

In [41]:
df['labels'] = df.labels.apply(convert_to_list)
df['rule_based_label'] = df.rule_based_label.apply(convert_to_list)

In [42]:
df1['labels'] = df1.labels.apply(convert_to_list)
df1['rule_based_label'] = df1.rule_based_label.apply(convert_to_list)

In [43]:
df1 = df1[(df1.labels.isna()) & (df1.rule_based_label.apply(lambda x: x!=['None of the above']))]

In [44]:
df1.rule_based_label.value_counts()

rule_based_label
[Errors]                                                                                7562
[Restatements]                                                                          5924
[Exhibit 101]                                                                           5630
[Part III]                                                                              5245
[Restatements, Errors]                                                                  3181
                                                                                        ... 
[Errors, Part III, Filing the audit report, Report on Internal Controls, Disclosure]       1
[Errors, Exhibit 101, Disclosure, Amend financial statements]                              1
[Signatures, Disclosure, Amend financial statements]                                       1
[Errors, Part III, Amend financial statements]                                             1
[Restatements, Disclosure, Amend financial statements

In [45]:
df1.labels.fillna(df1.rule_based_label,inplace=True)

In [46]:
df1.head(2)

Unnamed: 0,file_name,expl_note_gpt,labels,expl_note,rule_based_label
19,q_0001264931-11-000480.txt,The purpose of this Amendment No. 1 to the Qua...,[Exhibit 101],EXPLANATORY NOTE \nThis Amendment No. 1 to the...,[Exhibit 101]
21,k_0001264931-15-000058.txt,The purpose of this Amendment No. 1 to Annual ...,[Errors],EXPLANATORY NOTE \n\n This Amendment No. 1 to ...,[Errors]


In [47]:
print(df.shape)
df = df[df.labels.apply(lambda x: x!=['None of the above'])]
print(df.shape)

(1192, 5)
(1133, 5)


In [48]:
train_df, test_df = train_test_split(df, test_size=0.5)


In [49]:
df1.shape

(36298, 5)

In [15]:
train_df = pd.concat([train_df,df1.head(3000)])

In [16]:
train_df.shape

(3566, 5)

In [17]:
# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

# Function to get embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings



In [18]:
# Convert `expl_note_gpt` to embeddings
train_embeddings = get_embeddings(train_df['expl_note_gpt'].tolist())

In [19]:
import pickle

# Save embeddings to a pickle file
with open('train_full_embeddings.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)

In [20]:
test_embeddings = get_embeddings(test_df['expl_note_gpt'].tolist())

In [21]:
# Save embeddings to a pickle file
with open('test_full_embeddings.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)

import pickle

# Load embeddings from the pickle file
with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)


In [22]:
# Step 2: Encode `rule_based_label` and `labels` into arrays of size 11
mlb = MultiLabelBinarizer()
train_rule_based_label_encoded = mlb.fit_transform(train_df['rule_based_label'])
train_labels_encoded = mlb.transform(train_df['labels'])

In [23]:
# Step 2: Encode `rule_based_label` and `labels` into arrays of size 11
mlb = MultiLabelBinarizer()
test_rule_based_label_encoded = mlb.fit_transform(test_df['rule_based_label'])
test_labels_encoded = mlb.transform(test_df['labels'])

In [24]:
# Step 3: Append `rule_based_label` encoding to the embeddings
train_features = torch.tensor(train_embeddings)
train_rule_based_label_features = torch.tensor(train_rule_based_label_encoded)
train_combined_features = torch.cat((train_features, train_rule_based_label_features), dim=1).numpy()


In [25]:
# Step 3: Append `rule_based_label` encoding to the embeddings
test_features = torch.tensor(test_embeddings)
test_rule_based_label_features = torch.tensor(test_rule_based_label_encoded)
test_combined_features = torch.cat((test_features, test_rule_based_label_features), dim=1).numpy()

# Step 4: Train a model
X_train = train_combined_features
X_test = test_combined_features
y_train = train_labels_encoded
y_test = test_labels_encoded


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Define the neural network
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc4(x))
        return x

# Parameters
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]
batch_size = 64
num_epochs = 40000

# Create DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model, loss, and optimizer
model = SimpleNN(input_dim, output_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i, (batch_X, batch_y) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    if epoch % 1000 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


In [28]:
# Evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test)
    y_pred = (y_pred > 0.001).float()

In [29]:
# Convert to numpy arrays for evaluation
y_pred = y_pred.numpy()

In [30]:
y_test = y_test.numpy()

In [31]:
# Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


Accuracy: 0.8606701940035273
Classification Report:
                             precision    recall  f1-score   support

 Amend financial statements       1.00      1.00      1.00        33
                 Disclosure       1.00      1.00      1.00        70
                     Errors       0.73      0.93      0.82       121
                Exhibit 101       0.94      0.98      0.96        63
    Filing the audit report       1.00      1.00      1.00        39
          None of the above       0.00      0.00      0.00         0
                   Part III       0.90      0.91      0.91        70
Report on Internal Controls       1.00      1.00      1.00        63
 Responding to SEC comments       1.00      1.00      1.00        46
               Restatements       0.86      0.91      0.88       106
                 Signatures       1.00      0.23      0.38        13

                  micro avg       0.90      0.94      0.92       624
                  macro avg       0.86      0.82 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
y_test[0]

array([0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32)

In [33]:
y_pred[0]

array([0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32)

In [34]:
# Combine results in a dataframe
df_results = test_df.copy()
df_results['rule_based_label'] = test_df['rule_based_label'].apply(lambda x: list(x))
df_results['actual_labels'] = mlb.inverse_transform(y_test)
df_results['predicted_labels'] = mlb.inverse_transform(y_pred)

# Convert tuples to lists for better readability
df_results['actual_labels'] = df_results['actual_labels'].apply(lambda x: list(x))
df_results['predicted_labels'] = df_results['predicted_labels'].apply(lambda x: list(x))



In [35]:
df_results[df_results['labels']!=df_results['predicted_labels']]

Unnamed: 0,file_name,expl_note_gpt,labels,expl_note,rule_based_label,actual_labels,predicted_labels
764,q_0001213900-22-078475.txt,The purpose of this Amendment No. 1 is to amen...,[Restatements],EXPLANATORY NOTE\n\nGesher I Acquisition Corp....,"[Restatements, Errors]",[Restatements],"[Errors, Restatements]"
386,q_0001193125-12-443958.txt,The purpose of this Amendment No. 1 to our Qua...,"[Exhibit 101, Errors]",Explanatory Note The purpose of this Amendmen...,"[Errors, Exhibit 101]","[Errors, Exhibit 101]","[Errors, Exhibit 101]"
579,q_0001193125-21-320268.txt,The purpose of this Amendment is to refile the...,[Signatures],EXPLANATORY NOTE \n\n We are filing this Ame...,[None of the above],[Signatures],[Errors]
885,q_0001019687-15-001871.txt,The purpose of this amendment is to file a res...,[Restatements],EXPLANATORY NOTE \n\n This Amendment No. 1 to ...,"[Restatements, Errors]",[Restatements],"[Errors, Restatements]"
286,q_0001255823-12-000005.txt,The sole purpose of this Amendment No. 1 on Fo...,"[Exhibit 101, Errors]",EXPLANATORY NOTE \nThe sole purpose of this Am...,"[Errors, Exhibit 101]","[Errors, Exhibit 101]","[Errors, Exhibit 101]"
...,...,...,...,...,...,...,...
804,k_0001437749-13-011098.txt,The Company is filing this Amendment No. 1 to ...,[Restatements],EXPLANATORY NOTE\n\nSubsequent to filing its A...,"[Restatements, Errors]",[Restatements],"[Errors, Restatements]"
1156,k_0001193125-17-280836.txt,The purpose of this Amendment is to amend and ...,[Report on Internal Controls],Explanatory Note\n\n2\n\nPART II\n\n ITEM 9A.\...,"[Restatements, Part III, Report on Internal Co...",[Report on Internal Controls],"[Part III, Report on Internal Controls]"
568,k_0001144204-13-035058.txt,The purpose of this Amendment No. 1 is to file...,[Errors],EXPLANATORY NOTE\n\nGBS Enterprises Incorporat...,"[Errors, Exhibit 101]",[Errors],"[Errors, Exhibit 101]"
572,k_0001418819-21-000016.txt,The purpose of this Amendment No. 1 on Form 10...,"[Signatures, Disclosure]",EXPLANATORY NOTE\nThis Amendment No. 1 on Form...,"[Signatures, Disclosure]","[Disclosure, Signatures]","[Disclosure, Signatures]"
