### Install necessary packages

In [1]:
# Install simpletransformers
# May require a runtime restart

!pip install simpletransformers



In [2]:
!pip install Unidecode



In [3]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Load dataset

In [4]:
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1b8MAiN-xBdk6scM-DnufkuijDZivZJqM")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  2400 non-null   object
 1   Polarity  2400 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 37.6+ KB


In [5]:
df.head()

Unnamed: 0,Sentence,Polarity
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
df['Polarity'].value_counts()

0    1213
1    1187
Name: Polarity, dtype: int64

In [7]:
df.shape

(2400, 2)

### Preprocessing


In [8]:
import unidecode

def preprocess(x):
  x = unidecode.unidecode(x)

  return x

df['Sentence'] = df['Sentence'].apply(preprocess) # Apply preprocessing
df.drop_duplicates('Sentence', inplace=True)  # Drop duplicates

In [9]:
df.shape

(2382, 2)

### Train-test split

In [10]:
from sklearn.model_selection import train_test_split

X = df[['Sentence', 'Polarity']]

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_train['Polarity'].value_counts())

print(X_test.shape)
print(X_test['Polarity'].value_counts())

(1905, 2)
0    968
1    937
Name: Polarity, dtype: int64
(477, 2)
1    239
0    238
Name: Polarity, dtype: int64


### Modelling

In [11]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import sklearn

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [12]:
# Model configuration
model_args = ClassificationArgs(num_train_epochs=30, sliding_window=True, overwrite_output_dir=True, save_model_every_epoch=False, max_seq_length=400)

# Create a ClassificationModel
model = ClassificationModel("roberta", "roberta-base", args=model_args, use_cuda=True)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [13]:
# Train the model
# Note: This is being trained on entire dataset

model.train_model(X, eval_df=X_test, f1 = sklearn.metrics.f1_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/2382 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: 2382 features created from 2382 samples.


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Running Epoch 0 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 1 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 2 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 3 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 4 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 5 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 6 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 7 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 8 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 9 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 10 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 11 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 12 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 13 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 14 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 15 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 16 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 17 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 18 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 19 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 20 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 21 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 22 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 23 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 24 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 25 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 26 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 27 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 28 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

Running Epoch 29 of 30:   0%|          | 0/298 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(8940, 0.051835476116787176)

In [14]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(X_test, f1 = sklearn.metrics.f1_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/477 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: 477 features created from 477 samples.


Running Evaluation:   0%|          | 0/60 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 1.0, 'tp': 239, 'tn': 238, 'fp': 0, 'fn': 0, 'f1': 1.0, 'eval_loss': 3.347141841913981e-06}


### Test the model

In [15]:
# Load test data
df_test = pd.read_csv("https://drive.google.com/uc?export=download&id=1taoTluPBUMt9JkKAnlqDTrU49DJFpJGW")

In [16]:
df_test.head()

Unnamed: 0,Sentence,Polarity
0,A good commentary of today's love and undoubte...,1
1,For people who are first timers in film making...,1
2,"It was very popular when I was in the cinema, ...",1
3,It's a feel-good film and that's how I felt wh...,1
4,It has northern humour and positive about the ...,1


In [17]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  600 non-null    object
 1   Polarity  600 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.5+ KB


In [18]:
pred_test, res = model.predict(df_test['Sentence'].tolist(),multi_label=False)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/600 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: 600 features created from 600 samples.


  0%|          | 0/75 [00:00<?, ?it/s]

In [None]:
my_submission = pd.DataFrame({'predicted': pred_test})
my_submission.head()

# This command will save the file to the local cloud instance; it will be deleted
# as soon as this Notebooks session ends.
#my_submission.to_csv('my_submission.csv', index=False)

In [20]:
my_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   predicted  600 non-null    int64
dtypes: int64(1)
memory usage: 4.8 KB


In [21]:
# Download submission file
from google.colab import files
#files.download('my_submission.csv')

### Inspect wrong predictions
* Will need to use train-test split, not entire dataset, when training to get proper results

In [22]:
wrong_predictions

[]

In [23]:
predictions, raw_outputs = model.predict(['I really hated this movie. NOT!.'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: 1 features created from 1 samples.


  0%|          | 0/1 [00:00<?, ?it/s]

[0]
[array([[ 5.55859375, -6.06640625]])]


In [24]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


print("F1 Score   = {:.3f}".format(f1_score(df_test['Polarity'], pred_test, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(df_test['Polarity'], pred_test)))
print("AUC      = {:.3f}".format(roc_auc_score(df_test['Polarity'], pred_test)))
print("\nConfusion Matrix:")

print(confusion_matrix(y_true = df_test['Polarity'], y_pred = pred_test))

print("\n")

print("\nClassification Report:")

#class_names = [str(x) for x in search.best_estimator_.classes_]
print(classification_report(y_true = df_test['Polarity'], y_pred = pred_test))

F1 Score   = 0.942
Accuracy   = 0.942
AUC      = 0.942

Confusion Matrix:
[[273  14]
 [ 21 292]]



Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       287
           1       0.95      0.93      0.94       313

    accuracy                           0.94       600
   macro avg       0.94      0.94      0.94       600
weighted avg       0.94      0.94      0.94       600

F1 Score   = 0.942
Accuracy   = 0.942
AUC      = 0.942

Confusion Matrix:
[[273  14]
 [ 21 292]]



Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       287
           1       0.95      0.93      0.94       313

    accuracy                           0.94       600
   macro avg       0.94      0.94      0.94       600
weighted avg       0.94      0.94      0.94       600



In [28]:
print(wrong_predictions)

[]


In [29]:
incorrect_predictions = df_test[df_test['Polarity'] != pred_test]
incorrect_predictions.shape

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


(35, 2)

In [30]:
incorrect_predictions.head(15)

Unnamed: 0,Sentence,Polarity
80,The acting by the whole cast could be put on a...,0
94,I really hope the team behind this movie makes...,1
99,Later I found myself lost in the power of the ...,1
124,Full of unconvincing cardboard characters it i...,0
128,Whatever prompted such a documentary is beyond...,0
176,Lewis Black's considerable talent is wasted he...,0
190,I can't believe there's even a sequel to this!,0
214,Omit watching this.,0
219,Don't miss it.,1
223,It's as continuously beautiful to look at as a...,1
