# DistilBERT Trained Sentiment Analysis

In [1]:
import os
import sys

# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# Any results you write to the current directory are saved as output.
# Show which Python interpreter this kernel is running.
print(sys.executable)

/usr/bin/python3


In [2]:
# uncomment/comment down below line to install/uninstall hugging-face transformers

!pip install transformers
!pip install sklearn
!pip install torch

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 15.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 31.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

### Importing necessary Libraries.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb # pytorch-transformers by huggingface
import warnings
import time
# Limit the transformers library's own log output to error-level messages.
ppb.logging.set_verbosity_error()
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by later cells
# (for example from the classifier fit) — consider narrowing it.
warnings.filterwarnings('ignore')

### Read the data. The dataset is available in two formats: CSV and TSV.

In [2]:
# Optional base folder for the alternative CSV copy of the dataset:
# path = 'stanford-sentiment-treebank-v2-sst2/datasets/'

# Load the tab-separated training file.
# (The CSV variant could be read instead with
#  pd.read_csv(path + 'csv-format/train.csv').)
df = pd.read_csv('train.tsv', sep='\t')

# Display the (rows, columns) shape of the loaded frame.
df.shape

(6920, 2)

#### Note: the whole dataset (6,920 sentences) is used below. For faster runs, subsample it first (e.g. `batch_1 = df[:2000]`).

In [3]:
# Let long review texts render in full instead of being cut off in the table.
pd.options.display.max_colwidth = None

# `batch_1` is just another name for the full frame (no subsampling, no copy).
batch_1 = df

# Preview the first ten rows.
batch_1.iloc[:10]


Unnamed: 0,Reviews,Ratings
0,"a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films",1
1,apparently reassembled from the cutting room floor of any given daytime soap,0
2,"they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science fiction elements of bug eyed monsters and futuristic women in skimpy clothes",0
3,"this is a visually stunning rumination on love , memory , history and the war between art and commerce",1
4,jonathan parker 's bartleby should have been the be all end all of the modern office anomie films,1
5,campanella gets the tone just right funny in the middle of sad in the middle of hopeful,1
6,a fan film that for the uninitiated plays better on video with the sound turned down,0
7,"b art and berling are both superb , while huppert is magnificent",1
8,"a little less extreme than in the past , with longer exposition sequences between them , and with fewer gags to break the tedium",0
9,the film is strictly routine,0


In [4]:
# Count how many reviews carry each rating label.
rating_counts = batch_1['Ratings'].value_counts()
rating_counts

1    3610
0    3310
Name: Ratings, dtype: int64

### Let's now load a pre-trained BERT model.

In [5]:
# Model class, tokenizer class and checkpoint name for DistilBERT.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Want BERT instead of distilBERT? Use these three values instead:
# model_class = ppb.BertModel
# tokenizer_class = ppb.BertTokenizer
# pretrained_weights = 'bert-base-uncased'

#### After the next cell runs, the variable `model` holds a pretrained DistilBERT model -- a version of BERT that is smaller, much faster, and requires a lot less memory.

In [6]:
# Load pretrained model/tokenizer
# Load the pretrained tokenizer first, then the pretrained model, both from
# the same checkpoint name.
tokenizer, model = (
    loader.from_pretrained(pretrained_weights)
    for loader in (tokenizer_class, model_class)
)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

## Tokenization
Our first step is to tokenize the sentences -- break them up into words and subwords in the format BERT is comfortable with.

In [7]:

def _encode_review(text):
    """Return the token ids of one review, with the special tokens included."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per review.
tokenized = batch_1['Reviews'].apply(_encode_review)
tokenized.shape

(6920,)

## Padding

In [8]:
# Length of the longest tokenized review (0 when there are none).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every id list with zeros up to max_len and stack them into one array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])
padded.shape

(6920, 67)

## Masking

In [9]:
# 1 where the padded array holds a non-zero id, 0 where it holds zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(6920, 67)

## DEEP LEARNING

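Let's run the sentences through the pretrained model to extract features (no training happens here; gradients are disabled).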

The model() function runs sentences through BERT.

In [None]:
print(time.ctime())

# as_tensor keeps this cell re-runnable: if `attention_mask` is already a
# tensor from a previous run it is passed through instead of re-wrapped.
input_ids = torch.as_tensor(padded)
attention_mask = torch.as_tensor(attention_mask)

# Run the model in mini-batches. Pushing every sentence through in a single
# call makes the intermediate activations scale with the whole dataset, which
# can exhaust memory before the cell finishes.
BATCH_SIZE = 64
hidden_chunks = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # Index 0 of the model output is the last hidden state of the batch.
        hidden_chunks.append(batch_output[0])

# Wrap the concatenated hidden states in a 1-tuple so that the
# `last_hidden_states[0]` access used by later cells keeps working.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)

print(time.ctime())

Sun May  1 04:28:45 2022


In [None]:
# Take the hidden state at sequence position 0 (the first token of every
# sentence) and convert it to a NumPy array of per-sentence feature vectors.
first_token_states = last_hidden_states[0][:, 0, :]
features = first_token_states.numpy()

In [None]:
# The rating column is the target the classifier will learn to predict.
labels = batch_1.loc[:, 'Ratings']

In [None]:
# Fix the seed so the train/test split -- and therefore every score reported
# below -- is the same on each "Restart & Run All".
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

## train the Logistic Regression model on the training data

In [None]:
# Create the classifier with default settings and fit it on the training
# split; `fit` returns the estimator itself, so it can be bound directly.
lr_clf = LogisticRegression().fit(train_features, train_labels)
lr_clf

## test the Logistic Regression model on the test data

In [None]:
# Mean accuracy of the fitted classifier on the held-out split.
test_accuracy = lr_clf.score(test_features, test_labels)
test_accuracy

In [None]:
# `new_features` / `new_labels` are never defined anywhere in this notebook, so
# the original cell raises NameError on a fresh kernel. Inspect the held-out
# split produced by train_test_split instead.
# NOTE(review): if a separate evaluation set was intended, build its features
# and labels explicitly in earlier cells and use them here.
print(test_features)
print(test_labels)

In [None]:
# `new_features` / `new_labels` are never defined anywhere in this notebook, so
# the original cell raises NameError on a fresh kernel. Evaluate on the
# held-out split produced by train_test_split instead.
# NOTE(review): swap in a separately prepared evaluation set if one was intended.

# Predicted class for every held-out sentence.
print(lr_clf.predict(test_features))
print(" ")
# Per-class probabilities behind those predictions.
print(lr_clf.predict_proba(test_features))
print(" ")
# Mean accuracy against the true held-out labels.
print(lr_clf.score(test_features, test_labels))
print(" ")