In [1]:
import os
from transformers import pipeline
import torch
import numpy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ['HF_HOME'] = r'D:\models'
os.environ['HF_HUB_CACHE'] = r'D:\models'

In [3]:
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.current_device())
print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Number of GPUs: 1
Current GPU: 0
GPU Name: NVIDIA GeForce GTX 1650


In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL, cache_dir=r'D:\models')
model = AutoModelForSequenceClassification.from_pretrained(MODEL, cache_dir = r'D:\models')



In [6]:
example = "Your sentence goes here"

In [7]:
encoded_text = tokenizer(example, return_tensors='pt')
encoded_text

{'input_ids': tensor([[    0, 12861,  3645,  1411,   259,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [8]:
model.to('cuda')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [9]:
example = "I bought this charger in Jul 2003 and it worked OK for a while. The design is nice and convenient. However, after about a year, the batteries would not hold a charge. Might as well just get alkaline disposables, or look elsewhere for a charger that comes with batteries that have better staying power."

In [10]:
encoded_text = tokenizer(example, return_tensors='pt').to('cuda:0')
output = model(**encoded_text)
tensor_scores = output[0][0].detach().tolist()
numpy_scores = numpy.array(tensor_scores)
scores = softmax(numpy_scores)
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}


In [11]:
scores_dict

{'roberta_neg': 0.1241212761815136,
 'roberta_neu': 0.3448085860542891,
 'roberta_pos': 0.5310701377641973}

In [12]:
scores

array([0.12412128, 0.34480859, 0.53107014])

In [13]:
def polarity_scores(example):
    encoded_text = tokenizer(example, return_tensors='pt').to('cuda:0')
    output = model(**encoded_text)
    tensor_scores = output[0][0].detach().tolist()
    numpy_scores = numpy.array(tensor_scores)
    scores = softmax(numpy_scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [14]:
polarity_scores('Great CD My lovely Pat has one of the GREAT voiceover')

{'roberta_neg': 0.0019563858431247694,
 'roberta_neu': 0.01577455251309918,
 'roberta_pos': 0.982269061643776}

In [31]:
def get_verdict(example) -> int:
    if polarity_scores(example)['roberta_neg'] >= 0.5:
        return 1
    elif polarity_scores(example)['roberta_pos'] >= 0.5:
        return 2
    else: 
        return 2 #made neutral as positive

In [16]:
import pandas as pd
df = pd.read_csv(r'D:\Dev\DS_ML\amazon_review_polarity_csv\train.csv', names = ['verdict', 'header', 'review'])
df['merged'] = df['header'] + ' ' + df['review']
df.drop(['header', 'review'], axis = 1, inplace = True)
df = df.dropna(subset=['merged'], axis=0)
data = df[:100]

In [17]:
data

Unnamed: 0,verdict,merged
0,2,Stuning even for the non-gamer This sound trac...
1,2,The best soundtrack ever to anything. I'm read...
2,2,Amazing! This soundtrack is my favorite music ...
3,2,Excellent Soundtrack I truly like this soundtr...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
95,1,Very Not Worth Your Time The book was wriiten ...
96,2,"Very fun and educational Trains, shapes and pe..."
97,1,Ludicrous and silly I remember getting this bo...
98,2,Artistry I think that the Deodato concerts are...


In [32]:
data['predicted'] = data['merged'].apply(get_verdict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['predicted'] = data['merged'].apply(get_verdict)


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   verdict    100 non-null    int64 
 1   merged     100 non-null    object
 2   predicted  100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.2+ KB


In [29]:
data['merged'][99]

'Caution! These tracks are not the "original" versions but are re-recorded versions. So, whether the tracks are "remastered" or not is irrelevant.'

In [30]:
polarity_scores(data['merged'][99])

{'roberta_neg': 0.2853700267474118,
 'roberta_neu': 0.656831091849735,
 'roberta_pos': 0.057798881402853254}

In [35]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

Scores for first 100 reviews

In [38]:
print(accuracy_score(data['verdict'], data['predicted']))
print(recall_score(data['verdict'], data['predicted']))
print(precision_score(data['verdict'], data['predicted']))
print(f1_score(data['verdict'], data['predicted']))

0.87
0.7391304347826086
0.9714285714285714
0.8395061728395061


For the whole dataset(first 10000)

In [47]:
data_th = df[:10000]

In [48]:
data_th['predicted'] = data_th['merged'].apply(get_verdict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_th['predicted'] = data_th['merged'].apply(get_verdict)


In [49]:
data_th

Unnamed: 0,verdict,merged,predicted
0,2,Stuning even for the non-gamer This sound trac...,2
1,2,The best soundtrack ever to anything. I'm read...,2
2,2,Amazing! This soundtrack is my favorite music ...,2
3,2,Excellent Soundtrack I truly like this soundtr...,2
4,2,"Remember, Pull Your Jaw Off The Floor After He...",2
...,...,...,...
9995,2,A revelation of life in small town America in ...,2
9996,2,Great biography of a very interesting journali...,2
9997,1,Interesting Subject; Poor Presentation You'd b...,2
9998,1,Don't buy The box looked used and it is obviou...,1


In [50]:
print(accuracy_score(data_th['verdict'], data_th['predicted']))
print(recall_score(data_th['verdict'], data_th['predicted']))
print(precision_score(data_th['verdict'], data_th['predicted']))
print(f1_score(data_th['verdict'], data_th['predicted']))

0.8564
0.7496566607808515
0.9598090931926652
0.8418153778365278


this roberta model has good precision but poor accuracy over this particular datasaet, also we have assigned the neutral verdicts from the model as positive in this dataset

In [56]:
def get_true_verdict(example) -> int:
    if polarity_scores(example)['roberta_neg'] >= 0.5:
        return int(1)
    elif polarity_scores(example)['roberta_pos'] >= 0.5:
        return int(2)
    

In [64]:
data_th['predicted'] = data_th['merged'].apply(get_true_verdict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_th['predicted'] = data_th['merged'].apply(get_true_verdict)


In [71]:
data_th.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   verdict    10000 non-null  int64  
 1   merged     10000 non-null  object 
 2   predicted  8450 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 312.5+ KB


In [66]:
data = data.dropna(subset=['predicted'], axis=0)

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85 entries, 0 to 98
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   verdict    85 non-null     int64  
 1   merged     85 non-null     object 
 2   predicted  85 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.7+ KB


In [68]:
print(accuracy_score(data['verdict'], data['predicted']))
print(recall_score(data['verdict'], data['predicted']))
print(precision_score(data['verdict'], data['predicted']))
print(f1_score(data['verdict'], data['predicted']))

0.9411764705882353
0.8947368421052632
0.9714285714285714
0.9315068493150684


In [72]:
data_th = data_th.dropna(subset=['predicted'], axis=0)

In [73]:
print(accuracy_score(data_th['verdict'], data_th['predicted']))
print(recall_score(data_th['verdict'], data_th['predicted']))
print(precision_score(data_th['verdict'], data_th['predicted']))
print(f1_score(data_th['verdict'], data_th['predicted']))

0.9416568047337278
0.9198363023591719
0.9598090931926652
0.9393976644130301


Now the accuracy and precision scores are much better and acceptable, we can use this Roberta model, as it is
- finetuning this model with the given data will cause it to perform poorer

In [74]:
from sklearn.metrics import confusion_matrix

In [78]:
confusion_matrix(data_th['verdict'], data_th['predicted'])

array([[3821,  333],
       [ 160, 4136]], dtype=int64)