# Custom Trained Model Sentiment Analysis

In [65]:
import os

import sys
print(sys.executable)

c:\users\00mrk\appdata\local\programs\python\python38\python.exe


In [66]:
# uncomment/comment down below line to install/uninstall hugging-face transformers

#!pip install transformers
#!pip install sklearn
#!pip install torch

### Importing necessary Libraries.

In [67]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb # pytorch-transformers by huggingface
import warnings
import time
ppb.logging.set_verbosity_error()
warnings.filterwarnings('ignore')

### read the data from this dataset in two formats : CSV and TSV.

In [102]:
path = 'stanford-sentiment-treebank-v2-sst2/datasets/'

df = pd.read_csv(path + 'tsv-format/train.tsv', delimiter='\t')

#print(df)

# to read via CSV files...
# df = pd.read_csv(path + 'csv-format/train.csv')

# shape of dataset.
df.shape

(6920, 2)

#### For performance reasons, we'll only use 2,000 sentences from the dataset

In [104]:
batch_1 = df[:5000]
pd.set_option('display.max_colwidth', None)
batch_1.head(100000)


Unnamed: 0,Reviews,Ratings
0,"a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films",1
1,apparently reassembled from the cutting room floor of any given daytime soap,0
2,"they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science fiction elements of bug eyed monsters and futuristic women in skimpy clothes",0
3,"this is a visually stunning rumination on love , memory , history and the war between art and commerce",1
4,jonathan parker 's bartleby should have been the be all end all of the modern office anomie films,1
...,...,...
6915,"painful , horrifying and oppressively tragic , this film should not be missed",1
6916,"take care is nicely performed by a quintet of actresses , but nonetheless it drags during its 112 minute length",0
6917,"the script covers huge , heavy topics in a bland , surfacey way that does n't offer any insight into why , for instance , good things happen to bad people",0
6918,a seriously bad film with seriously warped logic by writer director kurt wimmer at the screenplay level,0


In [70]:
batch_1['Ratings'].value_counts()

1    1041
0     959
Name: Ratings, dtype: int64

### Let's now load a pre-trained BERT model.

In [71]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

#### Right now, the variable model holds a pretrained distilBERT model -- a version of BERT that is smaller, but much faster and requiring a lot less memory.

In [72]:
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

## Tokenization
Our first step is to tokenize the sentences -- break them up into word and subwords in the format BERT is comfortable with.

In [73]:

tokenized = batch_1['Reviews'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized.shape

(2000,)

## Padding

In [74]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
np.array(padded).shape

(2000, 59)

## Masking

In [75]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 59)

## DEEP LEARNING

let's train the model

The model() function runs sentences through BERT.

In [76]:
print(time.ctime())

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
    

print(time.ctime())

Sun May  1 00:46:41 2022
Sun May  1 00:48:21 2022


In [77]:
features = last_hidden_states[0][:,0,:].numpy()

In [78]:
labels = batch_1['Ratings']

In [79]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

## train the Logistic Regression model on the training data

In [81]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression()

## test the Logistic Regression model on the test data

In [82]:
lr_clf.score(test_features, test_labels)

0.828

In [100]:
path = 'stanford-sentiment-treebank-v2-sst2/datasets/'

df = pd.read_csv(path + 'tsv-format/test2.tsv', delimiter='\t')

batch_1 = df
pd.set_option('display.max_colwidth', None)
batch_1.head(100000)

Unnamed: 0,Reviews,Ratings,Prediction
0,"no movement , no yuks , not much of anything",0,0
1,"a gob of drivel so sickly sweet , even the eager consumers of moore 's pasteurized ditties will retch it up like rancid cr me br l e",0,0
2,"gangs of new york is an unapologetic mess , whose only saving grace is that it ends by blowing just about everything up",0,1
3,"we never really feel involved with the story , as all of its ideas remain just that abstract ideas",0,0
4,this is one of polanski 's best films,1,1
...,...,...,...
1816,"an often deadly boring , strange reading of a classic whose witty dialogue is treated with a baffling casual approach",0,0
1817,"the problem with concept films is that if the concept is a poor one , there 's no saving the movie",0,0
1818,"safe conduct , however ambitious and well intentioned , fails to hit the entertainment bull 's eye",0,0
1819,"a film made with as little wit , interest , and professionalism as artistically possible for a slummy hollywood caper flick",0,0


In [90]:
#print(test_features[0])
lr_clf.predict(test_features)

array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,

In [84]:
print(batch_1)

                                                       Reviews  Ratings
0                                                 this is shit        0
1                                              this is garbage        0
2                                  this is pretty good garbage        0
3                                      not bad, not bad at all        1
4                             pretty stupid, but in a good way        0
5                                                  really nice        1
6                                                  bilionaires        0
7                                                    oligarchs        0
8                                         it is a simple movie        0
9      script sounds like it was written by a bunch of monkeys        0
10  my eyes were bleeding after I tore them out after watching        0
11                                              waste of a day        0
12                                                   addicting  

In [86]:
print(lr_clf.predict(new_features))
print(" ")
print(lr_clf.predict_proba(new_features))
print(" ")
print(lr_clf.score(new_features, new_labels))
print(" ")

[0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0]
 
[[0.98323219 0.01676781]
 [0.99819389 0.00180611]
 [0.97710125 0.02289875]
 [0.84562289 0.15437711]
 [0.69681505 0.30318495]
 [0.05333131 0.94666869]
 [0.29092542 0.70907458]
 [0.76092639 0.23907361]
 [0.44161449 0.55838551]
 [0.99012889 0.00987111]
 [0.90287425 0.09712575]
 [0.99391506 0.00608494]
 [0.98073788 0.01926212]
 [0.3519826  0.6480174 ]
 [0.56385718 0.43614282]
 [0.07587182 0.92412818]
 [0.7461826  0.2538174 ]]
 
0.7058823529411765
 
