# Text Classification - Simple Transformers
Predict the rating of a review, given it's text.


In [None]:
pip install simpletransformers

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/6d/ee/99e2809fb311841376fe01f3524a912b1907d7b45d445f16ad27b4422c9f/simpletransformers-0.60.9-py3-none-any.whl (206kB)
[K     |████████████████████████████████| 215kB 17.3MB/s 
[?25hCollecting transformers>=4.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/2c/d8/5144b0712f7f82229a8da5983a8fbb8d30cec5fbd5f8d12ffe1854dcea67/transformers-4.4.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 31.2MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 50.8MB/s 
Collecting tensorboardx
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/NLP_course/main/data/imdb.small.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  int64 
 1   score   10000 non-null  int64 
 2   rating  10000 non-null  object
 3   en      10000 non-null  object
dtypes: int64(2), object(2)
memory usage: 312.6+ KB


In [None]:
df.head()

Unnamed: 0,id,score,rating,en
0,5998,1,negative,The basic formula for the original series was...
1,4615,1,negative,I may not have the longest of attention-spans...
2,8429,3,negative,"""Disappointing"" is the best word I could thin..."
3,10453,10,positive,"While traveling by train through Europe, the ..."
4,6941,1,negative,"This movie is not only boring, it is also rea..."


In [None]:
# Transformers needs an integer, not a string
df['label'] = df['rating'].apply(lambda x: 0.0 if 'negative' in x else 1.0)

In [None]:
df.head()

Unnamed: 0,id,score,rating,en,label
0,5998,1,negative,The basic formula for the original series was...,0.0
1,4615,1,negative,I may not have the longest of attention-spans...,0.0
2,8429,3,negative,"""Disappointing"" is the best word I could thin...",0.0
3,10453,10,positive,"While traveling by train through Europe, the ...",1.0
4,6941,1,negative,"This movie is not only boring, it is also rea...",0.0


In [None]:
from sklearn.model_selection import train_test_split

X = df[['en', 'label']]

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_train['label'].value_counts())

print(X_test.shape)
print(X_test['label'].value_counts())

(8000, 2)
1.0    4076
0.0    3924
Name: label, dtype: int64
(2000, 2)
0.0    1016
1.0     984
Name: label, dtype: int64


In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=2)

# Create a ClassificationModel
model = ClassificationModel("roberta", "distilroberta-base", args=model_args, use_cuda=True)

INFO:filelock:Lock 140548408274832 acquired on /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46.lock


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

INFO:filelock:Lock 140548408274832 released on /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46.lock
INFO:filelock:Lock 140548408369104 acquired on /root/.cache/huggingface/transformers/7a0115a4c463f49bc7ab011872fc4a4b81be681a0434075955d29ac3388e225b.a6127d76576e81475313180aceb31a8688f7a649b80e380d26b5d30302dc83c1.lock


Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

INFO:filelock:Lock 140548408369104 released on /root/.cache/huggingface/transformers/7a0115a4c463f49bc7ab011872fc4a4b81be681a0434075955d29ac3388e225b.a6127d76576e81475313180aceb31a8688f7a649b80e380d26b5d30302dc83c1.lock
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

INFO:filelock:Lock 140548408122960 released on /root/.cache/huggingface/transformers/23e0f7484fc8a320856b168861166b48c2976bb4e0861602422e1b0c3fe5bf61.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO:filelock:Lock 140548408122640 acquired on /root/.cache/huggingface/transformers/c7e8020011da613ff5a9175ddad64cd47238a9525db975eb50ecb965e9f7302f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

INFO:filelock:Lock 140548408122640 released on /root/.cache/huggingface/transformers/c7e8020011da613ff5a9175ddad64cd47238a9525db975eb50ecb965e9f7302f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO:filelock:Lock 140548274090512 acquired on /root/.cache/huggingface/transformers/b6a9ca6504e67903474c3fdf82ba249882406e61c2176a9d4dc9c3691c663767.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

INFO:filelock:Lock 140548274090512 released on /root/.cache/huggingface/transformers/b6a9ca6504e67903474c3fdf82ba249882406e61c2176a9d4dc9c3691c663767.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


In [None]:
# Train the model
model.train_model(X_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_2_8000


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/1000 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1000 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(2000, 0.36246093846682925)

In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(X_test)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_2000


Running Evaluation:   0%|          | 0/250 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7209184043215555, 'tp': 843, 'tn': 878, 'fp': 138, 'fn': 141, 'auroc': 0.9375190048652455, 'auprc': 0.9344826114771787, 'eval_loss': 0.5297744903899729}


In [None]:
result

{'auprc': 0.9344826114771787,
 'auroc': 0.9375190048652455,
 'eval_loss': 0.5297744903899729,
 'fn': 141,
 'fp': 138,
 'mcc': 0.7209184043215555,
 'tn': 878,
 'tp': 843}

In [None]:
model_outputs[1:20,]

array([[ 3.2578125 , -3.3125    ],
       [-2.59179688,  2.390625  ],
       [-2.07226562,  2.02929688],
       [-2.3515625 ,  2.203125  ],
       [-2.12304688,  2.14257812],
       [ 3.16015625, -3.26757812],
       [-2.32617188,  2.20117188],
       [ 2.10351562, -2.25195312],
       [ 1.88183594, -1.94921875],
       [-0.27441406,  0.15185547],
       [ 2.93945312, -3.07226562],
       [-2.40429688,  2.25195312],
       [ 0.54492188, -0.63183594],
       [ 3.390625  , -3.4296875 ],
       [ 3.171875  , -3.29882812],
       [-2.2421875 ,  2.12109375],
       [ 2.06640625, -2.22851562],
       [-0.97558594,  0.86669922],
       [ 3.34570312, -3.4140625 ]])

In [None]:
wrong_predictions

[<simpletransformers.classification.classification_utils.InputExample at 0x7fd37e7874d0>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd37e7877d0>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd37e787550>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd37e787a10>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd37e7800d0>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd37e780890>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd37e7808d0>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd3f1fa6ad0>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd3f1f37950>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd3f1f374d0>,
 <simpletransformers.classification.classification_utils.InputExample at 0x7fd3f1f37a90>,
 <simpletr

In [None]:
wrong_predictions

In [None]:
predictions, raw_outputs = model.predict(['I really hated this movie. NOT!.'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_1


  0%|          | 0/1 [00:00<?, ?it/s]

[0]
[[ 2.96679688 -3.19140625]]


In [None]:
predictions, raw_outputs = model.predict(['This movie didn\t make sense'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_1


  0%|          | 0/1 [00:00<?, ?it/s]

[0]
[[ 2.79492188 -2.95703125]]


In [None]:
predictions, raw_outputs = model.predict(['The movie was doped'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_1


  0%|          | 0/1 [00:00<?, ?it/s]

[0]
[[ 3.09375    -3.11132812]]


In [None]:
predictions, raw_outputs = model.predict(['The theater was bad but the movie was good.'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_1


  0%|          | 0/1 [00:00<?, ?it/s]

[1]
[[-1.45800781  1.45800781]]


In [None]:
predictions, raw_outputs = model.predict(['The theater was good but the movie was bad.'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_1


  0%|          | 0/1 [00:00<?, ?it/s]

[0]
[[ 3.23242188 -3.2734375 ]]


In [None]:
predictions, raw_outputs = model.predict(['The movie got 3 of 10.'])
print(predictions)
print(raw_outputs)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_1


  0%|          | 0/1 [00:00<?, ?it/s]

[1]
[[-1.01171875  0.70166016]]
