In [1]:
import pandas as pd

In [2]:
import torch
# Load the pre-trained word-vector table and the word -> row lookup that goes with it.
# Later cells index them together as embed[vocab_to_int[word]].
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load artifacts that you produced yourself or otherwise trust.
# NOTE(review): presumably `embed` has 300-wide rows (the matrix built from it later
# has 300 columns) -- confirm against the file that produced it.
embed = torch.load('/content/drive/MyDrive/nlp/10k_word_embeddings.tar')
vocab_to_int = torch.load('/content/drive/MyDrive/nlp/vocab_to_int.tar')

In [4]:
# Training filings: discard rows with missing values and rows whose form_type
# is "4", then drop the CSV's own label column so that the label used below
# comes from the separate mapping file.
X_train = (
    pd.read_csv("/content/drive/MyDrive/nlp/training_content_all.csv")
    .dropna()
    .loc[lambda frame: frame["form_type"] != "4"]
    .drop(columns="label")
)

# Attach the labels by joining on the filing number (merge defaults to an inner join).
labels = pd.read_csv("/content/drive/MyDrive/nlp/mapping_with_label_5d.csv")
X_train = X_train.merge(labels, on=["file_number"])

print(len(X_train))
y_train = X_train["label"]
X_train.head(3)

122429


Unnamed: 0,file_number,ticker,form_type,content,label
0,10,UAL,8-K,earliest event registrant specified principal ...,1.0
1,12,NRG,SC 13G/A,true false false false false false option gran...,0.0
2,13,MRO,SC 13G/A,true false false false false false option gran...,1.0


In [5]:
# Sanity check: show the distinct label values present after the merge.
set(X_train["label"].tolist())

{0.0, 1.0}

In [6]:
# The merged labels arrive as floats; store them as integers for training.
X_train = X_train.astype({"label": int})

In [7]:
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences


# Fit a Keras tokenizer on the training text. num_words=10000 restricts the
# integer sequences it emits to the most frequent words.
tokenizer = text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train['content'].tolist())

# Encode every document as a list of word indices, then pad/truncate each one
# to a fixed length of 500 so they stack into a single 2-D array.
tokenized_texts = tokenizer.texts_to_sequences(X_train['content'].tolist())
X = sequence.pad_sequences(sequences=tokenized_texts, maxlen=500)

In [8]:
import numpy as np

# Weight matrix used to seed the Embedding layer: one 300-wide row per tokenizer
# index, plus row 0 (tokenizer indices start at 1). Rows that are never filled
# stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.index_word)+1, 300))

# The tokenizer only emits indices below num_words, so rows at or above that
# cutoff are never looked up by the model and need not be filled. (The previous
# guard, `i > len(word_index)`, could never be true.)
max_index = tokenizer.num_words or len(tokenizer.word_index) + 1

for word, i in tokenizer.word_index.items():
  if i >= max_index:
    continue
  # Use .get so a word without a pre-trained vector is skipped (zero row)
  # instead of raising KeyError; plain indexing never returns None, so the old
  # `is not None` check could not catch a missing word.
  vocab_index = vocab_to_int.get(word)
  if vocab_index is None:
    continue
  embedding_matrix[i] = embed[vocab_index]

In [9]:
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding
from keras.initializers import Constant

embedding_size = 300

# Architecture: frozen embeddings initialised from embedding_matrix -> one LSTM
# layer (no dropout) -> 32-unit ReLU layer -> single sigmoid output unit.
model = Sequential([
    Embedding(input_dim=len(tokenizer.index_word)+1,
              output_dim=embedding_size,
              embeddings_initializer=Constant(embedding_matrix),
              input_length=500,
              trainable=False),
    LSTM(32, dropout=0, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [10]:
# Binary-classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Print the layer-by-layer parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 300)          10988100  
                                                                 
 lstm (LSTM)                 (None, 32)                42624     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 11,031,813
Trainable params: 43,713
Non-trainable params: 10,988,100
_________________________________________________________________


In [11]:
# Train for 10 epochs with mini-batches of 32, holding out 10% of the samples
# for validation.
model.fit(
    x=X,
    y=X_train['label'],
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2e8a88bf10>

In [13]:
# Test filings go through the same steps as the training frame: drop rows with
# missing values and form_type "4", replace the CSV's label column with the one
# from `labels`, and cast that label to int.
X_test = (
    pd.read_csv("/content/drive/MyDrive/nlp/test_content_all.csv")
    .dropna()
    .loc[lambda frame: frame["form_type"] != "4"]
    .drop(columns="label")
    .merge(labels, on=["file_number"])
    .astype({"label": int})
)

# Encode with the tokenizer fitted on the training text and pad to the same
# fixed length of 500.
tokenized_texts_test = tokenizer.texts_to_sequences(X_test['content'].tolist())
X_test_input = sequence.pad_sequences(sequences=tokenized_texts_test, maxlen=500)

In [14]:
# Score the padded test sequences and store the raw model output on the frame.
y_pred = model.predict(x=X_test_input)
X_test["preds"] = y_pred

In [15]:
import numpy as np

# Take the probabilities from y_pred rather than from X_test["preds"]. This cell
# overwrites "preds" with hard 0/1 labels, so copying from that column would
# replace the probabilities with class labels if the cell were run a second time.
X_test["preds_prob"] = np.asarray(y_pred).reshape(-1)

# Hard class label: 1 when the predicted probability is at least 0.5, else 0.
X_test["preds"] = np.where(X_test["preds_prob"] >= 0.5, 1, 0)
X_test

Unnamed: 0,file_number,ticker,form_type,content,label,preds,preds_prob
0,274787,ARE,8-K,shares shares false false false false false pa...,0,0,0.478854
1,274788,ED,3,undersigned constitutes appoints signing power...,0,0,0.440295
2,274789,ED,3,undersigned constitutes appoints signing power...,0,0,0.440295
3,274797,CNC,8-K,false false false false false flag true conten...,1,0,0.484316
4,274798,CNC,8-K,false false false false false flag true conten...,1,0,0.484316
...,...,...,...,...,...,...,...
32564,527639,FRLG,8-K,corporation organized existing accordance prov...,1,0,0.484697
32565,527640,FRLG,FWP,pursuant dated notes bear paid notes stated ma...,1,0,0.470390
32566,527641,FRLG,FWP,pursuant dated securities unsecured notes issu...,1,0,0.402382
32567,527642,FRLG,FWP,pursuant dated notes bear paid notes stated ma...,1,0,0.472429


In [16]:
# Persist the test frame with both prediction columns. The DataFrame index is
# written too (pandas' default), spelled out here so it is visible to readers.
X_test.to_csv(
    path_or_buf="/content/drive/MyDrive/nlp/rnn_embedding_5d.csv",
    index=True,
)