In [None]:
!pip install transformers

In [1]:
import pandas as pd 
import torch
# Load the review dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("beach_review_binary.csv")

In [2]:
# Show the column names to confirm which fields the CSV provides.
df.columns

Index(['review', 'sentiment'], dtype='object')

In [3]:
# Count how many rows carry each sentiment label (class balance check).
df['sentiment'].value_counts()

positive    137
negative    108
Name: sentiment, dtype: int64

In [4]:
# Pass every review through str(), presumably so the tokenizer always receives text.
# NOTE(review): a missing value would become the literal text "nan" here —
# confirm the CSV contains no empty reviews.
df['review'] = df['review'].map(str)

In [5]:
from transformers import AutoTokenizer, AutoModel
# One checkpoint id feeds both loaders so tokenizer and encoder always match.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Inspect the loaded encoder: its concrete class first, then the full module tree.
for shown in (type(model), model):
    print(shown)

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>
DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Lin

In [6]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dimension 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings (assumed shape: batch x tokens x hidden — TODO confirm).
        attention_mask: Tensor that is non-zero for real tokens and zero for
            positions to ignore; it is broadcast to the embedding shape.

    Returns:
        Tensor with dimension 1 reduced away: the mask-weighted mean embedding
        for each item in the batch.
    """
    embeddings = model_output[0]
    # Stretch the mask over the hidden dimension so it lines up element-wise.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * mask, 1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_count

In [7]:
# Encode each review on its own with the tokenizer loaded earlier in the notebook.
# No padding is requested: every call encodes a single review, so there is
# nothing to pad against. Reviews longer than 512 tokens are truncated, matching
# the 512-entry position-embedding table of the encoder.
inputs = [
    tokenizer(sentence, max_length=512, truncation=True, return_tensors='pt')
    for sentence in df['review']
]

In [9]:
# Load the encoder inside this cell so feature extraction does not depend on
# whatever object the name `model` currently points to.
model = AutoModel.from_pretrained("distilbert-base-uncased")
model.eval()  # inference only: make sure dropout layers are inactive

# One mean-pooled vector per review. no_grad() skips autograd bookkeeping, which
# feature extraction never uses and which would otherwise be built for every
# forward pass. Each encoding holds a single review, so [0] picks its vector.
with torch.no_grad():
    features = [
        mean_pooling(model(**encoded), encoded['attention_mask'])[0].numpy()
        for encoded in inputs
    ]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Sanity check: number of feature vectors produced (one per encoded review).
len(features)

245

In [11]:
# Target labels, in the same row order as the reviews the features were built from.
labels = df['sentiment']

In [12]:
# Missing-value count per column.
# NOTE(review): `review` was already passed through str() earlier in the
# notebook, so a missing review would appear as the text "nan" and would not be
# counted here.
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

In [14]:
# Hold out 30% of the samples for evaluation. The fixed seed makes the split —
# and every score computed from it — repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42
)

In [15]:
# AdaBoost with its default base learner and 50 boosting rounds.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train the AdaBoost classifier. fit() trains the estimator in place, so its
# return value is deliberately not bound to `model`; that name keeps referring
# to the DistilBERT encoder.
abc.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = abc.predict(X_test)

In [16]:
# Fraction of held-out reviews whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.7837837837837838


In [17]:
# Candidate settings addressed to the boosted base estimator (double-underscore
# keys) plus the ensemble size.
# NOTE(review): no later cell in the visible notebook reads `param_grid`.
param_grid = dict(
    base_estimator__criterion=["gini", "entropy"],
    base_estimator__splitter=["best", "random"],
    n_estimators=[1, 2],
)

In [18]:
# Default-configured estimator to tune, plus the value lists the randomized
# search draws its candidate settings from.
adab = AdaBoostClassifier()
ada_p_dist = dict(
    learning_rate=[0.25, 0.5, 0.75, 1.],
    n_estimators=[100, 250, 500, 650],
)

In [19]:
from time import time
def hypertuning_rscv(adab, p_distr, nbr_iter, X, y, cv=9, random_state=None):
    """Run a randomized hyper-parameter search and return the best setting found.

    Parameters
    ----------
    adab : estimator
        The estimator to tune.
    p_distr : dict
        Forwarded as ``param_distributions``.
    nbr_iter : int
        Number of parameter settings to sample (``n_iter``).
    X, y :
        Training data the search is fitted on.
    cv : optional
        Cross-validation spec forwarded to the search. Defaults to 9, the value
        that used to be hard-coded. With an integer and a classifier,
        scikit-learn documents stratified K-fold splitting.
    random_state : optional
        Seed forwarded to the search. The default ``None`` leaves the sampling
        unseeded; pass an int for repeatable results.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted search.
    """
    # Imported locally so the function works no matter which cell ran first.
    from sklearn.model_selection import RandomizedSearchCV

    rdmsearch = RandomizedSearchCV(adab, param_distributions=p_distr,
                                   n_jobs=-1, n_iter=nbr_iter, cv=cv,
                                   random_state=random_state)
    start = time()
    rdmsearch.fit(X, y)
    print('hyper-tuning time : %d seconds' % (time() - start))
    return rdmsearch.best_params_, rdmsearch.best_score_

In [20]:
from sklearn.model_selection import RandomizedSearchCV
# Sample 10 settings from ada_p_dist on the training split, then report the
# winning parameters and the best score scaled by 100.
search_outcome = hypertuning_rscv(adab, ada_p_dist, 10, X_train, y_train)
ada_parameters, ada_ht_score = search_outcome
print(ada_parameters)
print('Hyper-tuned model score :', ada_ht_score*100, sep='\n')

hyper-tuning time : 45 seconds
{'n_estimators': 250, 'learning_rate': 0.25}
Hyper-tuned model score :
84.7953216374269


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [22]:
# Shuffled 10-fold splitter with a fixed seed, used by the grid search below.
crossvalidation = KFold(
    shuffle=True,
    random_state=1,
    n_splits=10,
)

In [23]:
# Exhaustive search over ensemble size and learning rate, scored by accuracy on
# the folds produced by `crossvalidation`.
ada = AdaBoostClassifier()
search_grid = dict(
    n_estimators=[500, 1000, 2000],
    learning_rate=[.001, 0.01, .1],
)
search = GridSearchCV(
    estimator=ada,
    param_grid=search_grid,
    scoring='accuracy',
    n_jobs=1,
    cv=crossvalidation,
)

In [24]:
# Run the grid search on the training split, then display the parameter
# combination that scored best.
search.fit(X_train,y_train)
search.best_params_

In [25]:
import numpy as np
from sklearn.model_selection import cross_val_score
# Final classifier, trained on the training split with hand-picked settings.
# NOTE(review): learning_rate=1 is not among the values explored by the grid
# search above (.001, 0.01, .1) — confirm these settings are intentional.
final_params = {"n_estimators": 1000, "learning_rate": 1}
ada = AdaBoostClassifier(**final_params)
ada.fit(X_train, y_train)


AdaBoostClassifier(learning_rate=1, n_estimators=1000)

In [None]:
import pickle
# Persist the fitted classifier. The context manager guarantees the file handle
# is flushed and closed once the dump finishes (or fails).
with open("model.pkl", "wb") as model_file:
    pickle.dump(ada, model_file)