In [1]:
import pandas as pd

In [None]:
import torch

# Pretrained word vectors (`embed`) and the word -> row lookup (`vocab_to_int`)
# that a later cell uses as `embed[vocab_to_int[word]]`.
# map_location='cpu' lets checkpoints that were saved from a GPU session load on
# a CPU-only runtime and keeps the vectors copyable into a NumPy matrix.
# NOTE(review): torch.load unpickles its input -- only point it at trusted files.
embed = torch.load('/content/drive/MyDrive/nlp/10k_word_embeddings.tar',
                   map_location='cpu')
vocab_to_int = torch.load('/content/drive/MyDrive/nlp/vocab_to_int.tar',
                          map_location='cpu')

In [2]:
# Read the training filings, drop rows with any missing value, and exclude
# filings whose form_type is the string "4".
X_train = (
    pd.read_csv("/content/drive/MyDrive/nlp/training_content_all.csv")
    .dropna()
    .loc[lambda frame: frame["form_type"] != "4"]
)
y_train = X_train["label"]
print(len(X_train))
X_train.head(3)

122429


Unnamed: 0,file_number,ticker,form_type,label,content
10,10,UAL,8-K,1,earliest event registrant specified principal ...
12,12,NRG,SC 13G/A,0,true false false false false false option gran...
13,13,MRO,SC 13G/A,0,true false false false false false option gran...


In [3]:
# Display the dtype of every column in the filtered training frame.
X_train.dtypes

file_number     int64
ticker         object
form_type      object
label           int64
content        object
dtype: object

In [None]:
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences

# Fit a Keras tokenizer on the training text; num_words=10000 restricts the
# emitted sequences to the most frequent words.
tokenizer = text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train['content'].tolist())

# Turn each document into a list of word indexes, then pad/truncate every
# list to exactly 500 entries so they stack into one 2-D array.
tokenized_texts = tokenizer.texts_to_sequences(X_train['content'])
X = pad_sequences(tokenized_texts, maxlen=500)

In [None]:
import numpy as np

# One 300-dimensional row per tokenizer index; row 0 is left as zeros because
# tokenizer.word_index starts numbering at 1. The shape matches the Embedding
# layer built in the model cell (len(tokenizer.index_word)+1 rows, 300 columns).
embedding_matrix = np.zeros((len(tokenizer.index_word)+1, 300))
for word, i in tokenizer.word_index.items():
  # Words absent from the pretrained vocabulary keep their all-zero row.
  # Indexing vocab_to_int directly would raise KeyError for them, and the old
  # `is not None` guard could never catch that case.
  if word not in vocab_to_int:
    continue
  embedding_matrix[i] = embed[vocab_to_int[word]]

In [None]:
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding
from keras.initializers import Constant

embedding_size = 300

# Frozen pretrained embedding -> single LSTM -> dense hidden layer -> sigmoid
# output for the binary label.
model = Sequential([
    Embedding(
        len(tokenizer.index_word)+1,
        embedding_size,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=500,
        trainable=False,  # keep the pretrained vectors fixed during training
    ),
    LSTM(32, dropout=0, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 300)          10988100  
                                                                 
 lstm (LSTM)                 (None, 32)                42624     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 11,031,813
Trainable params: 43,713
Non-trainable params: 10,988,100
_________________________________________________________________


In [None]:
# Train for 10 epochs in batches of 32. validation_split=0.1 holds out the
# last 10% of the rows (taken before shuffling) as the validation set.
model.fit(
    x=X,
    y=X_train['label'],
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f38980f9a10>

In [None]:
# Load the test filings with the same cleaning as the training set: drop rows
# with missing values and exclude form_type "4".
# The trailing .copy() makes X_test an independent frame, so adding prediction
# columns to it later does not raise SettingWithCopyWarning.
X_test = pd.read_csv("/content/drive/MyDrive/nlp/test_content_all.csv").dropna()
X_test = X_test[X_test["form_type"] != "4"].copy()

# Reuse the tokenizer fitted on the training text, then pad/truncate to the
# same length of 500 that the model was built for.
tokenized_texts_test = tokenizer.texts_to_sequences(X_test['content'])
X_test_input = sequence.pad_sequences(tokenized_texts_test, maxlen=500)

In [None]:
# Run the trained model over the padded test sequences and keep the raw
# sigmoid outputs alongside the test rows.
y_pred = model.predict(x=X_test_input)
X_test["preds"] = y_pred

In [None]:
import numpy as np

# Take the probabilities straight from y_pred rather than from the "preds"
# column: "preds" is overwritten with hard labels below, so copying from it
# would turn preds_prob into 0/1 values if this cell were run a second time.
X_test["preds_prob"] = np.asarray(y_pred).ravel()
# Hard label: 1 when the predicted probability is at least 0.5, else 0.
X_test["preds"] = np.where(X_test["preds_prob"] >= 0.5, 1, 0)
X_test

Unnamed: 0,file_number,ticker,form_type,label,content,preds,preds_prob
7,274787,ARE,8-K,0.0,shares shares false false false false false pa...,0,0.451487
8,274788,ED,3,0.0,undersigned constitutes appoints signing power...,0,0.248348
9,274789,ED,3,0.0,undersigned constitutes appoints signing power...,0,0.248348
17,274797,CNC,8-K,0.0,false false false false false flag true conten...,0,0.450475
18,274798,CNC,8-K,0.0,false false false false false flag true conten...,0,0.450475
...,...,...,...,...,...,...,...
61546,527639,FRLG,8-K,0.0,corporation organized existing accordance prov...,0,0.450355
61547,527640,FRLG,FWP,0.0,pursuant dated notes bear paid notes stated ma...,0,0.299994
61548,527641,FRLG,FWP,0.0,pursuant dated securities unsecured notes issu...,0,0.416637
61549,527642,FRLG,FWP,0.0,pursuant dated notes bear paid notes stated ma...,0,0.356032


In [None]:
# Persist the test frame, including the preds / preds_prob columns added above.
X_test.to_csv("/content/drive/MyDrive/nlp/rnn_embedding.csv")