In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
os.environ["MKL_NUM_THREADS"] = "4"
os.environ["OPENBLAS_NUM_THREADS"] = "4"

In [6]:
import numpy as np
import pandas as pd

In [23]:
df = pd.read_csv("/content/all.csv")
train = pd.read_csv("/content/train.csv")
val = pd.read_csv("/content/val.csv")
test = pd.read_csv("/content/test.csv")

In [24]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Download the stopwords resource
import nltk
nltk.download('stopwords')

# Your code
temp = CountVectorizer(min_df=2, max_df=0.7, stop_words=stopwords.words('english'))
t1 = temp.fit_transform(df['Sentence'].values.astype('U')).toarray()
vocabulary = temp.get_feature_names_out()
vectorizer = CountVectorizer(min_df=2, max_df=0.7, stop_words=stopwords.words('english'), vocabulary=vocabulary)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
X_train = vectorizer.fit_transform(train['Sentence'].values.astype('U')).toarray()
X_val = vectorizer.fit_transform(val['Sentence'].values.astype('U')).toarray()
X_test = vectorizer.fit_transform(test['Sentence'].values.astype('U')).toarray()

In [28]:
y_train = train['Label'].values
y_val = val['Label'].values
y_test = test['Label'].values

In [29]:
print('X_train: ', X_train.shape)
print('X_val: ', X_val.shape)
print('X_test: ', X_test.shape)

X_train:  (26979, 10319)
X_val:  (3372, 10319)
X_test:  (3374, 10319)


In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support


def performence(y_test, y_pred):
    a = accuracy_score(y_test, y_pred)
    precision, recall, fscore, _ = \
        precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("Accuracy : {:.4f}".format(a))
    print("Precision : {:.4f}".format(precision))
    print("Recall : {:.4f}".format(recall))
    print("F-Score : {:.4f}".format(fscore))

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [32]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(108)

X_train = torch.from_numpy(X_train)
X_val = torch.from_numpy(X_val)
X_test = torch.from_numpy(X_test)

y_train = torch.from_numpy(y_train)
y_val = torch.from_numpy(y_val)

In [None]:
class LSTM(nn.Module):
    def __init__(self, in_dim=10319, dropout=0.0):
        super(LSTM, self).__init__()
        self.dropout = dropout

        self.lstm = nn.LSTM(in_dim, 512, batch_first=True)

        self.classifier = nn.Sequential(
            nn.Linear(in_features=512, out_features=64),
            nn.LeakyReLU(inplace=True),
            nn.Dropout(self.dropout),
            nn.Linear(in_features=64, out_features=2),
        )

    def forward(self, x):
        x = torch.unsqueeze(x, 1)
        ou, _ = self.lstm(x)
        x = torch.squeeze(ou, 1)
        return self.classifier(x)


in_dim = X_train.shape[1]
dropout = 0.01
lr = 2e-6

lstm = LSTM(in_dim, dropout)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(lstm.parameters(), lr=lr,
                      momentum=0.9, weight_decay=5e-4)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

epoch = 50
batch = 100

max_loss = 1e5

for i in range(0, epoch):
    lstm.train()
    for j in range(0, batch):
        X_in = X_train[j:j + batch]
        y_true = y_train[j:j + batch]
        pred = lstm(X_in.float())
        loss = criterion(pred, y_true)
        loss.backward()
        optimizer.step()
    lstm.eval()
    pred = lstm(X_val.float())
    loss = criterion(pred, y_val)
    lo = loss.item()
    if lo < max_loss:
        max_loss = lo
        dic = lstm.state_dict()
    print('Iter:{:d}, Loss:{:.2f}'.format(i, lo))

lstm.load_state_dict(dic)
pred = lstm(X_test.float())
prediction = torch.max(F.softmax(pred, dim=1), 1)[1].detach().cpu().numpy()
performence(y_test, prediction)
lstm.cpu()

Iter:0, Loss:0.67
Iter:1, Loss:0.66
Iter:2, Loss:0.65
Iter:3, Loss:0.64
Iter:4, Loss:0.64
Iter:5, Loss:0.65
Iter:6, Loss:0.67
Iter:7, Loss:0.68
Iter:8, Loss:0.69
Iter:9, Loss:0.69
Iter:10, Loss:0.68
Iter:11, Loss:0.66
Iter:12, Loss:0.64
Iter:13, Loss:0.62
Iter:14, Loss:0.61
Iter:15, Loss:0.61
Iter:16, Loss:0.61
Iter:17, Loss:0.62
Iter:18, Loss:0.60
Iter:19, Loss:0.56
Iter:20, Loss:0.52
Iter:21, Loss:0.50
Iter:22, Loss:0.50
Iter:23, Loss:0.51
Iter:24, Loss:0.50
Iter:25, Loss:0.47
Iter:26, Loss:0.41
Iter:27, Loss:0.34
Iter:28, Loss:0.30
Iter:29, Loss:0.32
Iter:30, Loss:0.37
Iter:31, Loss:0.29
Iter:32, Loss:0.27
Iter:33, Loss:0.33
Iter:34, Loss:0.39
Iter:35, Loss:0.42
Iter:36, Loss:0.40
Iter:37, Loss:0.33
Iter:38, Loss:0.26
Iter:39, Loss:0.22
Iter:40, Loss:0.32
Iter:41, Loss:0.22
Iter:42, Loss:0.27
Iter:43, Loss:0.40
Iter:44, Loss:0.53
Iter:45, Loss:0.63
Iter:46, Loss:0.67
Iter:47, Loss:0.67
Iter:48, Loss:0.60
Iter:49, Loss:0.47
Accuracy : 0.9215
Precision : 0.9294
Recall : 0.9215
F-Score

LSTM(
  (lstm): LSTM(10319, 512, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=512, out_features=64, bias=True)
    (1): LeakyReLU(negative_slope=0.01, inplace=True)
    (2): Dropout(p=0.01, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torchvision.models as models
lstm1=models.alexnet(pretrained=True)
torch.save(lstm1,'/content/drive/MyDrive/Colab Notebooks/SQL project/lstm1.pt')