----------------------------
#### Stanford Sentiment Treebank 2 (SST-2) dataset
---------------------------

In [1]:
from datasets import load_dataset

In [4]:
# Load the SST-2 dataset
dataset = load_dataset("glue", "sst2", cache_dir= r'D:\AI-DATASETS\07-Hugging-Face-Data')

In [5]:
# Access splits
train_data      = dataset['train']
validation_data = dataset['validation']
test_data       = dataset['test']

In [6]:
# Get the number of samples in each split
num_train_samples      = len(dataset['train'])
num_validation_samples = len(dataset['validation'])
num_test_samples       = len(dataset['test'])

# Display the counts
print(f"Number of training samples: {num_train_samples}")
print(f"Number of validation samples: {num_validation_samples}")
print(f"Number of test samples: {num_test_samples}")

Number of training samples: 67349
Number of validation samples: 872
Number of test samples: 1821


In [7]:
dataset['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

- Label **1**: Positive Sentiment
    - This indicates that the sentence expresses a positive or favorable sentiment about the subject.

- Label **0**: Negative Sentiment
    - This indicates that the sentence expresses a negative or unfavorable sentiment about the subject.

In [9]:
import pandas as pd

In [12]:
# Set display options for Pandas
pd.set_option('display.max_colwidth', None)  # No truncation of column content
pd.set_option('display.width', None)  # No truncation of DataFrame display width

In [13]:
# Convert each split into Pandas DataFrame
train_df = pd.DataFrame(dataset['train'])
valid_df = pd.DataFrame(dataset['validation'])
test_df  = pd.DataFrame(dataset['test'])

In [18]:
train_df.sample(5)

Unnamed: 0,sentence,label,idx
39230,"for a guy who has waited three years with breathless anticipation for a new hal hartley movie to pore over , no such thing is a big letdown .",0,39230
36759,"jaw-dropping action sequences , striking villains , a gorgeous color palette",1,36759
50683,` fatal script error,0,50683
6143,just did n't care,0,6143
61578,accomplished actress,1,61578


#### Hands-on Activity
- Goal: Classify sentences using a pre-trained transformer model fine-tuned on SST-2.

In [19]:
from transformers import pipeline

**textattack/bert-base-uncased-SST-2**

The model being used is `textattack/bert-base-uncased-SST-2`, which is a BERT-based model fine-tuned on the SST-2 dataset for sentiment analysis. The model is designed to output either a positive or negative sentiment classification.


In [20]:
# Sentiment analysis pipeline
classifier = pipeline("sentiment-analysis", model="textattack/bert-base-uncased-SST-2")

config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [21]:
# Test sentences
sentences = ["The movie was fantastic!", "I hated every moment of it."]
results   = classifier(sentences)

In [22]:
print(results)

[{'label': 'LABEL_1', 'score': 0.9996718168258667}, {'label': 'LABEL_0', 'score': 0.9973737001419067}]


In [49]:
valid_df.sample(5)

Unnamed: 0,sentence,label,idx
249,huston nails both the glad-handing and the choking sense of hollow despair .,1,249
734,this is n't even madonna 's swept away .,0,734
834,collateral damage finally delivers the goods for schwarzenegger fans .,1,834
376,to call the other side of heaven `` appalling '' would be to underestimate just how dangerous entertainments like it can be .,0,376
571,"passable entertainment , but it 's the kind of motion picture that wo n't make much of a splash when it 's released , and will not be remembered long afterwards .",0,571


**Use valid_df** - to test the model

In [53]:
texts       = valid_df['sentence'].tolist()  # Extract text from 'sentence' column
true_labels = valid_df['label'].tolist()  # Extract true labels

In [54]:
# Predict sentiment for the validation dataset
predictions = classifier(texts)

In [56]:
predictions[:10]

[{'label': 'LABEL_1', 'score': 0.9997661709785461},
 {'label': 'LABEL_0', 'score': 0.9856705069541931},
 {'label': 'LABEL_1', 'score': 0.9995738863945007},
 {'label': 'LABEL_1', 'score': 0.996027946472168},
 {'label': 'LABEL_0', 'score': 0.9979519248008728},
 {'label': 'LABEL_1', 'score': 0.9996365308761597},
 {'label': 'LABEL_0', 'score': 0.9960685968399048},
 {'label': 'LABEL_0', 'score': 0.9940721392631531},
 {'label': 'LABEL_1', 'score': 0.9996395111083984},
 {'label': 'LABEL_0', 'score': 0.9970386028289795}]

In [57]:
# Convert predictions to numerical labels (0 for negative, 1 for positive)
predicted_labels = [1 if pred['label'] == 'LABEL_1' else 0 for pred in predictions]

In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [63]:
# Evaluate the model's performance
accuracy  = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall    = recall_score(true_labels, predicted_labels)
f1        = f1_score(true_labels, predicted_labels)

# Display the evaluation scores
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
confusion_matrix(true_labels, predicted_labels)

Accuracy: 92.43%
Precision: 91.81%
Recall: 93.47%
F1-Score: 92.63%


array([[391,  37],
       [ 29, 415]], dtype=int64)

**some other BERT models**

    "distilbert-base-uncased-finetuned-sst-2-english",
    "roberta-large-mnli",
    "albert-base-v2"

In [110]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'         # works
#model_name = 'textattack/bert-large-uncased-SST-2'
#model_name = 'textattack/albert-base-v2-SST-2'     #- private repo
#model_name = 'roberta-large-SST-2'    #- private repo
#model_name =  'google/electra-base-discriminator'   #- poor results
#model_name =  'xlm-roberta-base-SST-2' #- a private repo

In [111]:
token = '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'

In [112]:
# Load the model pipeline
classifier = pipeline("sentiment-analysis", 
                      model     = model_name,
                      token     = token
                      #cache_dir = r'D:\AI-DATASETS\07-Hugging-Face-Data'
                     )

In [113]:
# Make predictions on the validation data
predictions = classifier(valid_df['sentence'].to_list())

In [114]:
predictions[:5]

[{'label': 'POSITIVE', 'score': 0.9998838901519775},
 {'label': 'NEGATIVE', 'score': 0.998969554901123},
 {'label': 'POSITIVE', 'score': 0.9996399879455566},
 {'label': 'POSITIVE', 'score': 0.9996993541717529},
 {'label': 'NEGATIVE', 'score': 0.9996892213821411}]

In [115]:
# Convert predictions to binary labels (0 for negative, 1 for positive)
predicted_labels = [1 if pred['label'] == 'LABEL_1' or pred['label'] == 'POSITIVE' else 0 for pred in predictions]
true_labels = valid_df['label'].tolist()

In [116]:
# Evaluate the model's performance
accuracy  = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall    = recall_score(true_labels, predicted_labels)
f1        = f1_score(true_labels, predicted_labels)

# Display the evaluation scores
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
confusion_matrix(true_labels, predicted_labels)

Accuracy: 91.06%
Precision: 89.78%
Recall: 93.02%
F1-Score: 91.37%


array([[381,  47],
       [ 31, 413]], dtype=int64)

#### Try a Simple Logistic regression

In [24]:
# Convert splits to Pandas DataFrames
train_data = dataset['train'].to_pandas()
valid_data = dataset['validation'].to_pandas()
test_data  = dataset['test'].to_pandas()

# Rename columns for clarity
train_data.columns = ['sentence', 'label', 'idx']
valid_data.columns = ['sentence', 'label', 'idx']
test_data.columns  = ['sentence', 'label', 'idx']

In [34]:
train_data.sample(5)

Unnamed: 0,sentence,label,idx
34953,be a new mexican cinema a-bornin ',1,34953
37604,elegantly produced and expressively performed,1,37604
7233,misfire that even tunney ca n't save .,0,7233
19990,speaks for itself,1,19990
48222,i have given it a one-star rating,0,48222


In [45]:
train_data.label.value_counts()

1    37569
0    29780
Name: label, dtype: int64

In [35]:
valid_data.sample(5)

Unnamed: 0,sentence,label,idx
43,holm ... embodies the character with an effortlessly regal charisma .,1,43
784,very psychoanalytical -- provocatively so -- and also refreshingly literary .,1,784
200,"the format gets used best ... to capture the dizzying heights achieved by motocross and bmx riders , whose balletic hotdogging occasionally ends in bone-crushing screwups .",1,200
100,"neither parker nor donovan is a typical romantic lead , but they bring a fresh , quirky charm to the formula .",1,100
50,"it feels like an after-school special gussied up with some fancy special effects , and watching its rote plot points connect is about as exciting as gazing at an egg timer for 93 minutes .",0,50


In [46]:
valid_data.label.value_counts()

1    444
0    428
Name: label, dtype: int64

In [43]:
test_data.sample(5)

Unnamed: 0,sentence,label,idx
948,"i wish i could say `` thank god it 's friday '' , but the truth of the matter is i was glad when it was over .",-1,948
1232,"a compelling , moving film that respects its audience and its source material .",-1,1232
1752,its characters are thinner than cardboard -- or even comic-book paper .,-1,1752
1721,"none of these characters resembles anyone you 've ever met in real life , unless you happen to know annoyingly self-involved people who speak in glib sentences that could have only come from the pen of a screenwriter .",-1,1721
1587,"what 's at stake in this film is nothing more than an obsolete , if irritating , notion of class .",-1,1587


In [47]:
test_data.label.value_counts()

-1    1821
Name: label, dtype: int64

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
                             #max_features = 5000, 
                             stop_words   = 'english'
                            )

In [27]:
# Transform the sentences
X_train = vectorizer.fit_transform(train_data['sentence'])
X_valid = vectorizer.transform(valid_data['sentence'])
X_test  = vectorizer.transform(test_data['sentence'])

In [28]:
# Target labels
y_train = train_data['label']
y_valid = valid_data['label']
y_test  = test_data['label']

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [30]:
# Initialize and train the model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [31]:
# Predict on validation and test sets
y_valid_pred = model.predict(X_valid)
y_test_pred  = model.predict(X_test)

In [32]:
# Calculate accuracy
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
test_accuracy  = accuracy_score(y_test, y_test_pred)

print(f"Validation Accuracy: {valid_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Validation Accuracy: 80.05%
Test Accuracy: 0.00%
