In [1]:
# Pinned to the version this notebook was run with so a fresh kernel reproduces the run;
# %pip installs into the environment of the running kernel.
%pip install nlpaug==1.1.3

Collecting nlpaug
  Downloading nlpaug-1.1.3-py3-none-any.whl (394 kB)
[K     |████████████████████████████████| 394 kB 1.2 MB/s 
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.3


In [2]:
# Pinned to the version this notebook was run with so a fresh kernel reproduces the run;
# %pip installs into the environment of the running kernel.
%pip install BeautifulSoup4==4.9.3

Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 1.2 MB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.2.1


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup as bs
import seaborn as sns
import re
import torch
import torch.nn as nn
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [4]:
# Load the tab-separated train and test files from the Kaggle input directory.
# Later cells read a 'review' column from both frames and a 'sentiment' column from train.
train = pd.read_csv('/kaggle/input/cse4302/cse4302_train.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/cse4302/cse4302_test.tsv', sep='\t')

In [5]:
# Stopword sets already loaded from NLTK, keyed by language, so the corpus is read once
# per kernel instead of once per review.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it at most once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocessing(review, remove_stopwords=False):
    """Strip markup and non-letter characters from a review and normalise it.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK English stopwords from the result.

    Returns:
        A single string of lowercase alphabetic words separated by single spaces.
    """
    review_text = bs(review, 'html5lib').get_text()
    # Everything that is not an ASCII letter (digits, punctuation, ...) becomes a space.
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        stops = _stopword_set()
        words = [w for w in words if w not in stops]

    return ' '.join(words)

In [6]:
# Clean every review of both splits, dropping English stopwords (second argument True).
clean_train_reviews = [preprocessing(raw_review, True) for raw_review in train['review']]

clean_test_reviews = [preprocessing(raw_review, True) for raw_review in test['review']]

## Word Augmentation

In [7]:
# Build a WordNet-backed synonym augmenter and create one augmented copy of the cleaned
# training reviews; the cell output is the number of augmented items.
# NOTE(review): no random seed is set before augmenting, so presumably the generated text
# differs from run to run — confirm.
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(clean_train_reviews)
len(augmented_text)

15000

In [8]:
# Second augmentation round: the input is the already-augmented text (augmented_text),
# not the original cleaned reviews.
augmented_text1 = aug.augment(augmented_text)
len(augmented_text1)

15000

In [9]:
# Third and fourth rounds, each one augmenting the output of the round before it.
augmented_text2 = aug.augment(augmented_text1)
augmented_text3 = aug.augment(augmented_text2)

In [10]:
# Two independent lists of the training labels: `l` is reused when the labels are repeated
# for the augmented copies; `label` becomes the 'sentiment' column of the training frame.
l = train['sentiment'].tolist()
label = train['sentiment'].tolist()

In [11]:
# Final training corpus = original cleaned reviews followed by the four augmented copies,
# in that order (the labels are repeated in the same block order).
# Rebuilt from the first len(train) entries instead of extended in place, so re-running this
# cell cannot append the augmented copies a second time and misalign reviews with labels.
clean_train_reviews = [
    *clean_train_reviews[:len(train)],
    *augmented_text,
    *augmented_text1,
    *augmented_text2,
    *augmented_text3,
]

In [12]:
# One copy of the labels for the original reviews plus one per augmentation round (5 total).
# Built from `l` in a single expression instead of four in-place extends, so re-running the
# cell yields the same list rather than an ever longer one.
label = l * 5

In [13]:
# Wrap the cleaned text in DataFrames; only the training frame carries labels.
# Both lists passed for the training frame must have the same length.
clean_train_df = pd.DataFrame({'review':clean_train_reviews, 'sentiment':label})
clean_test_df = pd.DataFrame({'review':clean_test_reviews})

In [14]:
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,Conv1D,LSTM,GRU,BatchNormalization,Flatten,Dense

In [15]:
# Encode the label as an integer: 1 where sentiment == 'positive', 0 for anything else.
# Skipped when the column is already integer-typed: comparing encoded labels with the string
# 'positive' on a re-run of this cell would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(clean_train_df['sentiment']):
    clean_train_df['sentiment'] = (clean_train_df['sentiment'] == 'positive').astype(int)

In [16]:
# Inputs are the review text of each split; the target is the training 'sentiment' column.
x_train = clean_train_df['review']
y_train = clean_train_df['sentiment']
x_test = clean_test_df['review']

In [17]:
# num_words is passed to the tokenizer and is also the embedding layer's input dimension;
# embeddings is the embedding layer's output dimension.
num_words=80000
embeddings=512

In [18]:
# Fit the tokenizer on the training reviews (x_train); the test reviews are not used for
# fitting. '<OOV>' is the placeholder token configured for out-of-vocabulary words.
tokenizer=Tokenizer(num_words=num_words,oov_token='<OOV>')
tokenizer.fit_on_texts(x_train)
word_index=tokenizer.word_index
# Number of entries in the fitted word index.
total_vocab=len(word_index)

In [19]:
# Report the number of entries in the tokenizer's word index.
print("Vocabulary of the dataset is : ",total_vocab)

Vocabulary of the dataset is :  69699


In [20]:
# Convert the reviews of both splits into sequences with the fitted tokenizer.
sequences_train=tokenizer.texts_to_sequences(x_train)
sequences_test=tokenizer.texts_to_sequences(x_test)

# Disabled alternative: size max_len to the longest sequence found in either split.
# max_len=max(max([len(x) for x in sequences_train]),max([len(x) for x in sequences_test]))
# A fixed length is used instead; it is also the model's input_length.
max_len=1000

# Bring every sequence to max_len using pad_sequences' default padding/truncating settings.
train_padded=pad_sequences(sequences_train,maxlen=max_len)
test_padded=pad_sequences(sequences_test,maxlen=max_len)

In [21]:
# Hold out 5% of the data for validation, split by ORIGINAL review rather than by row.
# train_padded stacks the original reviews followed by their synonym-augmented copies in
# equal blocks of len(train) rows, so row i and rows i + k*len(train) are variants of the
# same review. A plain row-wise split puts variants of one review on both sides, which leaks
# training data into validation and inflates val_accuracy (the metric driving early stopping).
n_original = len(train)
assert len(train_padded) % n_original == 0, "expected whole copies of the training set"
n_copies = len(train_padded) // n_original

orig_train_idx, orig_val_idx = train_test_split(np.arange(n_original),
                                                test_size=0.05, random_state=10)

# Expand every original index to the rows of all of its copies (one row per block).
block_offsets = np.arange(n_copies)[:, None] * n_original
train_rows = (block_offsets + orig_train_idx).ravel()
val_rows = (block_offsets + orig_val_idx).ravel()

X_train, X_val = train_padded[train_rows], train_padded[val_rows]
# y_train is a pandas Series, so select rows by position with .iloc.
Y_train, Y_val = y_train.iloc[train_rows], y_train.iloc[val_rows]

In [22]:
# Embedding -> 1D convolution -> bidirectional LSTM -> LSTM -> 2-unit softmax output,
# with a dropout layer after each recurrent layer.
model = keras.Sequential([
    Embedding(num_words, embeddings, input_length=max_len),
    Conv1D(256, 10, activation='selu'),
    keras.layers.Bidirectional(LSTM(128, return_sequences=True)),
    keras.layers.Dropout(0.3),
    LSTM(64),
    keras.layers.Dropout(0.3),
    Dense(2, activation='softmax'),
])

In [23]:
# The targets are integer class ids and the output layer has 2 softmax units, hence the
# sparse categorical cross-entropy loss; accuracy is tracked as the metric.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
             )

In [24]:
# Stop training once val_accuracy has not improved for 2 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch when stopping triggers;
# without it the in-memory model (the one used for prediction afterwards) keeps the weights
# of the last, non-improving epoch.
es = EarlyStopping(monitor='val_accuracy',
                   patience=2,
                   restore_best_weights=True
                  )

# Additionally save the model to `filepath` whenever val_accuracy reaches a new best.
checkpoints = ModelCheckpoint(filepath='./',
                              monitor="val_accuracy",
                              verbose=0,
                              save_best_only=True
                             )

callbacks = [es, checkpoints]

In [25]:
# Train for at most 10 epochs, evaluating on the held-out validation split after each epoch;
# the early-stopping callback can end training before epoch 10.
history=model.fit(X_train,Y_train,validation_data=(X_val,Y_val),epochs=10,callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [26]:
# Run the trained model on the padded test sequences; the output layer is a 2-unit softmax.
pred = model.predict(test_padded)

In [27]:
# Start from the provided sample submission file so the output keeps its layout.
submission = pd.read_csv('/kaggle/input/cse4302/sampleSubmission.csv')

In [28]:
# Predicted class = index of the largest score in each prediction row.
# NOTE(review): assumes the rows of `pred` are in the same order as the rows of the sample
# submission — confirm.
submission['category'] = np.argmax(pred, axis=1)
submission

Unnamed: 0,id,category
0,1,1
1,2,1
2,3,1
3,4,1
4,5,0
...,...,...
34995,34996,1
34996,34997,1
34997,34998,1
34998,34999,0


In [29]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)