In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
train = pd.read_csv('p3_train.csv')
test = pd.read_csv('p3_test.csv')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
X_train_tagged_texts = []
for text in X_train:
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    X_train_tagged_texts.append(tags)
X_test_tagged_texts = []
for text in X_test:
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    X_test_tagged_texts.append(tags)
# Map POS tags to integers using NLTK's tagset
tagset = {"CC": 1, "CD": 2, "DT": 3, "EX": 4, "FW": 5, "IN": 6, "JJ": 7, "JJR": 8, "JJS": 9, "LS": 10, "MD": 11,
          "NN": 12, "NNS": 13, "NNP": 14, "NNPS": 15, "PDT": 16, "POS": 17, "PRP": 18, "PRP$": 19, "RB": 20,
          "RBR": 21, "RBS": 22, "RP": 23, "SYM": 24, "TO": 25, "UH": 26, "VB": 27, "VBD": 28, "VBG": 29,
          "VBN": 30, "VBP": 31, "VBZ": 32, "WDT": 33, "WP": 34, "WP$": 35, "WRB": 36}
X_train_tagged = []
for tagged_text in X_train_tagged_texts:
    tagged_text_int = []
    for word, tag in tagged_text:
        if tag in tagset:
            tagged_text_int.append(tagset[tag])
    X_train_tagged.append(tagged_text_int)

X_test_tagged = []
for tagged_text in X_test_tagged_texts:
    tagged_text_int = []
    for word, tag in tagged_text:
        if tag in tagset:
            tagged_text_int.append(tagset[tag])
    X_test_tagged.append(tagged_text_int)


max_length = max([len(seq) for seq in X_train_tagged])
max_length = max([len(seq) for seq in X_test_tagged])
# Pad sequences to a fixed length
# max_length = 12270
X_train_padded = pad_sequences(X_train_tagged, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_tagged, maxlen=max_length, padding='post')

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, SimpleRNN, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D

# RNN model
embedding_dim = 300
model = Sequential()
model.add(Embedding(input_dim=len(tagset)+1, output_dim=300, input_length=max_length))
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(SimpleRNN(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(SimpleRNN(32, return_sequences=True))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=3))
model.add(Dense(32, activation='tanh'))
model.add(GlobalMaxPooling1D())
model.add(Dense(11, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


# train the model
model.fit(X_train_padded, y_train, epochs=15, batch_size=32)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5777, 32)          1406304   
                                                                 
 conv1d_2 (Conv1D)           (None, 5777, 64)          6208      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 1925, 64)         0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 1925, 32)          6176      
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 641, 32)          0         
 1D)                                                             
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 641, 64)          

<keras.callbacks.History at 0x7fa04197c550>

In [18]:
# tokenizer.fit_on_texts(X_test)
loss, acc = model.evaluate(X_test_padded, y_test)



In [19]:
preds = model.predict(X_test_padded)



In [20]:
preds[0]

array([2.8149372e-03, 9.9429697e-01, 4.5813277e-11, 1.0549176e-04,
       2.7363331e-03, 4.9322665e-07, 1.5290649e-08, 4.5729081e-05,
       2.6765791e-08, 2.4716111e-18, 2.1701979e-13], dtype=float32)

In [21]:
preds_new = np.argmax(preds, axis = 1)

In [22]:
preds_new = np.array(preds_new)

In [23]:
preds_new[0]

1

In [24]:
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
f1 = f1_score(y_test, preds_new, average = 'macro')
print("F1 score = ",f1)
precision = precision_score(y_test, preds_new, average = 'macro')
recall = recall_score(y_test, preds_new, average = 'macro')
print("Precision = ", precision)
print("Recall = ", recall)
print("Accuracy = ", accuracy_score(y_test, preds_new))

F1 score =  0.6420385229243185
Precision =  0.6467518319811617
Recall =  0.6486292336253565
Accuracy =  0.6485774499473129


In [25]:
from sklearn.metrics import classification_report, roc_curve, auc, f1_score, accuracy_score, confusion_matrix
matrix = confusion_matrix(y_test, preds_new, labels = [0, 1, 2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, preds_new, labels = [0, 1, 2],digits=4))
print('confusion matrix: ', mat)

              precision    recall  f1-score   support

           0     0.8160    0.8012    0.8085       166
           1     0.8883    0.8883    0.8883       179
           2     0.3750    0.3642    0.3695       173

   micro avg     0.6961    0.6853    0.6907       518
   macro avg     0.6931    0.6845    0.6888       518
weighted avg     0.6937    0.6853    0.6895       518

confusion matrix:  [0.89261745 0.95209581 0.96923077]
