In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert-base-multilingual-uncased/vocab.txt
/kaggle/input/bert-base-multilingual-uncased/config.json
/kaggle/input/bert-base-multilingual-uncased/pytorch_model.bin
/kaggle/input/positive-neutral-new/__results__.html
/kaggle/input/positive-neutral-new/torchvision-nightly+20200325-cp36-cp36m-linux_x86_64.whl
/kaggle/input/positive-neutral-new/__notebook__.ipynb
/kaggle/input/positive-neutral-new/torch-nightly+20200325-cp36-cp36m-linux_x86_64.whl
/kaggle/input/positive-neutral-new/pytorch-xla-env-setup.py
/kaggle/input/positive-neutral-new/custom.css
/kaggle/input/positive-neutral-new/torch_xla-nightly+20200325-cp36-cp36m-linux_x86_64.whl
/kaggle/input/positive-neutral-new/model.bin
/kaggle/input/positive-neutral-new/__output__.json
/kaggle/input/fork-of-negative-neutral/__results__.html
/kaggle/input/fork-of-negative-neutral/torchvision-nightly+20200325-cp36-cp36m-linux_x86_64.whl
/kaggle/input/fork-of-negative-neutral/__notebook__.ipynb
/kaggle/input/fork-of-negative-neutral/

In [2]:
df=pd.read_csv("/kaggle/input/avishek-hotel/sentiments.csv",encoding='ISO-8859–1')

In [3]:
t, valid, test = np.split(df.sample(frac=1), [int(.7*len(df)), int(.9*len(df))])

In [4]:
test=test.rename(columns={"text": "comment_text", "review": "rating"})

In [5]:
test

Unnamed: 0,comment_text,rating
26005,I was traveling with my brother visiting Banff...,-1
15467,Stayed at the Holiday Inn for 2 nights upon ar...,1
13852,"On my recent trip to Italy, we stayed at this ...",1
3902,My husband &amp; I and several family members ...,0
47497,"Been here many times, about the same this time...",1
...,...,...
57421,"The hotel was quiet, exterior condition was po...",0
54982,the pool was clean the was just what i wanted,1
24061,We booked to stay at the Wilshire Crest in Apr...,-1
30910,My out of town friend and I just returned from...,1


In [6]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
import sys

In [7]:
# a class to convert the dataset into lowercase
class BERTBaseUncased(nn.Module):
    #initialization function taking the path as a parameter
        def __init__(self, path):
            super(BERTBaseUncased, self).__init__()
            self.bert_path = path
            #Instantiate a pretrained pytorch model from a pre-trained model configuration.
            self.bert = transformers.BertModel.from_pretrained(self.bert_path)
            #randomly sets elements to zero to prevent overfitting.
            self.bert_drop = nn.Dropout(0.3) 
            #linear/outer layer as bert base model has 768*2 output features (bert base and multilingual) and 1 for binary classification
            self.out = nn.Linear(768 * 2, 1)
        
        
        def forward(self,ids,mask,token_type_ids):
            #o1 ,is the last hidden and we neglect pooler outputs of the bert 
            o1,_ = self.bert(ids,attention_mask=mask,token_type_ids=token_type_ids)
            
            apool = torch.mean(o1, 1)
            mpool, _ = torch.max(o1, 1)
            cat = torch.cat((apool, mpool), 1)

            bo = self.bert_drop(cat)
            output = self.out(bo)
            return output

In [8]:
class BERTDatasetTest:
    def __init__(self, comment_text, tokenizer, max_length):
        self.comment_text = comment_text
        self.tokenizer = tokenizer
        self.max_length = max_length

    #getting the total length
    def __len__(self):
        return len(self.comment_text)
    
    #returns the token type ids from the dataset of the comment_text
    def __getitem__(self, item):
        #checking for the digits 
        comment_text = str(self.comment_text[item])
        #removing all the unwanted spaces
        comment_text = " ".join(comment_text.split())

        #encode 2 strings at a time hence 2nd string is none and add the CLS token
        inputs = self.tokenizer.encode_plus(comment_text,None,add_special_tokens=True,max_length=self.max_length,)
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        return {'ids': torch.tensor(ids, dtype=torch.long),'mask': torch.tensor(mask, dtype=torch.long),'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)}

In [9]:
tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-multilingual-uncased/",do_lower_case=True)

In [10]:
device = "cuda"
model = BERTBaseUncased(path="../input/bert-base-multilingual-uncased/").to(device)
model.load_state_dict(torch.load("../input/positive-neutral/model.bin"))
model.eval()


BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [11]:
model1 = BERTBaseUncased(path="../input/bert-base-multilingual-uncased/").to(device)
model1.load_state_dict(torch.load("../input/fork-of-negative-neutral/model.bin"))
model1.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [12]:
model2 = BERTBaseUncased(path="../input/bert-base-multilingual-uncased/").to(device)
model2.load_state_dict(torch.load("../input/positive-negative/model.bin"))
model2.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [13]:
modelnew = BERTBaseUncased(path="../input/bert-base-multilingual-uncased/").to(device)
modelnew.load_state_dict(torch.load("../input/negative-neutral-neutral-1/model.bin"))
modelnew.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [14]:
modelnew1 = BERTBaseUncased(path="../input/bert-base-multilingual-uncased/").to(device)
modelnew1.load_state_dict(torch.load("../input/positive-neutral-new/model.bin"))
modelnew1.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

## For positive and neutral

In [15]:
df_test=test
test_dataset = BERTDatasetTest(comment_text=df_test.comment_text.values,tokenizer=tokenizer,max_length=182)

data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=64,drop_last=False,num_workers=4,shuffle=False)

#code to give the outputs
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model(ids=ids,mask=mask,token_type_ids=token_type_ids)

        outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

92it [00:32,  2.79it/s]


In [16]:
fin_outputs_posneu = [item for sublist in fin_outputs for item in sublist]

In [17]:
df_test=test
test_dataset = BERTDatasetTest(comment_text=df_test.comment_text.values,tokenizer=tokenizer,max_length=182)

data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=64,drop_last=False,num_workers=4,shuffle=False)

#code to give the outputs
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = modelnew1(ids=ids,mask=mask,token_type_ids=token_type_ids)

        outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

92it [00:31,  2.88it/s]


In [18]:
fin_outputs_posneunew = [item for sublist in fin_outputs for item in sublist]

## For negative and neutral

In [19]:
df_test=test
test_dataset = BERTDatasetTest(comment_text=df_test.comment_text.values,tokenizer=tokenizer,max_length=182)

data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=64,drop_last=False,num_workers=4,shuffle=False)

#code to give the outputs
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model1(ids=ids,mask=mask,token_type_ids=token_type_ids)

        outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

92it [00:31,  2.90it/s]


In [20]:
fin_outputs_negneu = [item for sublist in fin_outputs for item in sublist]

In [21]:
df_test=test
test_dataset = BERTDatasetTest(comment_text=df_test.comment_text.values,tokenizer=tokenizer,max_length=182)

data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=64,drop_last=False,num_workers=4,shuffle=False)

#code to give the outputs
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = modelnew(ids=ids,mask=mask,token_type_ids=token_type_ids)

        outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

92it [00:32,  2.84it/s]


In [22]:
fin_outputs_negneunew = [item for sublist in fin_outputs for item in sublist]

## for positive negative

In [23]:
df_test=test
test_dataset = BERTDatasetTest(comment_text=df_test.comment_text.values,tokenizer=tokenizer,max_length=182)

data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=64,drop_last=False,num_workers=4,shuffle=False)

#code to give the outputs
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model2(ids=ids,mask=mask,token_type_ids=token_type_ids)

        outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

92it [00:31,  2.91it/s]


In [24]:
fin_outputs_posneg = [item for sublist in fin_outputs for item in sublist]

In [25]:
fin_outputs_posneg

[2.5406436418048873e-25,
 1.0,
 1.0,
 1.0,
 0.9999788999557495,
 0.9999996423721313,
 1.0,
 0.9999754428863525,
 1.0,
 0.9999998807907104,
 1.0,
 0.9999998807907104,
 4.4674932569771e-35,
 1.0,
 1.9156630516012905e-35,
 0.9999998807907104,
 1.0,
 1.0,
 0.9999971389770508,
 1.0,
 1.9348018559865413e-35,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9999837875366211,
 0.9999995231628418,
 1.0,
 1.0,
 0.9999998807907104,
 0.9999769926071167,
 1.0,
 1.0,
 0.9999998807907104,
 1.0,
 0.9999929666519165,
 0.9999998807907104,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9999998807907104,
 1.0,
 0.999993085861206,
 0.9999979734420776,
 1.0,
 0.9999998807907104,
 0.9999998807907104,
 1.0,
 0.9999951124191284,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9999960660934448,
 1.0,
 0.9999997615814209,
 1.0,
 0.9999969005584717,
 1.0,
 1.0,
 5.2851776454432526e-36,
 0.9999817609786987,
 1.0,
 0.9999974966049194,
 0.9999997615814209,
 0.9999998807907104,
 0.999996542930603,
 1.0,
 0.9999852180480957,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9999

## Merging the models

In [26]:
r=test["rating"]
r=list(r)
r

[-1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 1,
 0,
 -1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 -1,
 1,
 1,
 1,
 0,
 1,
 1,
 -1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 -1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 -1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 

In [27]:
res=[]
for i in fin_outputs_posneg:
    res.append(i)
    
    

In [28]:

for i in range(len(fin_outputs_negneu)) :
    if fin_outputs_negneu[i]<0.99 and fin_outputs_posneu[i]<0.99:
        res[i]=0
            

In [29]:
result=[]
for i in res :
    if i>0.99:
        result.append(1)
    elif i==0:
        result.append(0)
    else:
        result.append(-1)

In [30]:
for i in range(len(result)):
    result[i] = int(result[i])

In [31]:
from sklearn.metrics import classification_report
print(classification_report(r,result))

              precision    recall  f1-score   support

          -1       0.76      0.79      0.77       425
           0       0.75      0.53      0.62       771
           1       0.93      0.98      0.95      4666

    accuracy                           0.90      5862
   macro avg       0.81      0.77      0.78      5862
weighted avg       0.90      0.90      0.90      5862



In [32]:
from sklearn.metrics import accuracy_score
print(accuracy_score(result, r))
acc1=accuracy_score(result, r)

0.9036165131354487


In [33]:
r=test["rating"]
r=list(r)

In [34]:
res=[]
for i in fin_outputs_posneg:
    res.append(i)
    

In [35]:

for i in range(len(fin_outputs_negneunew)) :
    if fin_outputs_negneunew[i]>0.8 and fin_outputs_posneunew[i]<0.8:
        res[i]=0
result2=[]
for i in res :
    if i>0.99:
        result2.append(1)
    elif i==0:
        result2.append(0)
    else:
        result2.append(-1)

In [36]:
from sklearn.metrics import classification_report
print(classification_report(r,result2))

              precision    recall  f1-score   support

          -1       0.77      0.80      0.78       425
           0       0.78      0.62      0.69       771
           1       0.95      0.97      0.96      4666

    accuracy                           0.92      5862
   macro avg       0.83      0.80      0.81      5862
weighted avg       0.91      0.92      0.91      5862



In [37]:
from sklearn.metrics import accuracy_score
acc2=accuracy_score(r, result2 )
print(acc2)

0.9152166496076425


In [38]:
accsum=acc1+acc2

In [39]:
def weighted_products(k1,k2):
    wproducts=[]
    for i in range(0,len(k1)):
        wproducts.append((k1[i]*(acc1/accsum))+(k2[i]*(acc2/accsum)))
    return wproducts

In [40]:
we=weighted_products(result,result2)

In [41]:
w=[]
for i in we:
    w.append(round(i))

In [42]:
from sklearn.metrics import accuracy_score
print(accuracy_score(r, w ))

0.9152166496076425


In [43]:
from sklearn.metrics import classification_report
print(classification_report(r,w))

              precision    recall  f1-score   support

          -1       0.77      0.80      0.78       425
           0       0.78      0.62      0.69       771
           1       0.95      0.97      0.96      4666

    accuracy                           0.92      5862
   macro avg       0.83      0.80      0.81      5862
weighted avg       0.91      0.92      0.91      5862

