# DEPENDENCIES

In [7]:
# if needed, install required packages
#   %pip install -r requirements.txt

In [8]:
import torch
import pandas as pd
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# LOAD DATA

In [9]:
# Read the header-less CSV, name its two columns, and keep only the rows whose
# label is not "neutral". The index is renumbered from 0 after filtering so
# later positional assignments line up with the remaining rows.
df = (
    pd.read_csv("all-data.csv", encoding="latin1", header=None)
    .set_axis(["sentiment", "sentence"], axis=1)   # assign column names
    .loc[lambda frame: frame["sentiment"].ne("neutral")]   # drop neutral rows
    .reset_index(drop=True)
)

# MODEL

In [10]:
# Tokenizer and classifier are loaded from the same checkpoint identifier, so
# it is spelled out once and reused for both.
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# Switch to evaluation mode for inference. Kept as the cell's final expression,
# so the notebook also echoes the module structure below the cell.
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# PREDICT AGAINST SENTIMENT

In [11]:
# Classify every sentence in df["sentence"] with the loaded model and store,
# per row, the predicted label string and the softmax probability vector.
#
# Sentences are sent through the model in padded mini-batches instead of one
# forward pass per sentence; the tokenizer's attention mask makes the model
# ignore the padding, so predictions match the one-at-a-time version up to
# floating-point rounding.

BATCH_SIZE = 32  # sentences per forward pass; lower this if memory is tight

# Send inputs to whichever device the model's parameters live on, so the cell
# keeps working if the model is moved (e.g. to a GPU) in another cell.
device = next(model.parameters()).device

# str() conversion guards against non-string cells (e.g. NaN read from the CSV).
sentences = df["sentence"].astype(str).tolist()

predicted_labels = []
predicted_probs = []

with torch.no_grad(), tqdm(total=len(sentences), desc="Running predictions") as progress:
    for start in range(0, len(sentences), BATCH_SIZE):
        batch = sentences[start:start + BATCH_SIZE]

        encoded = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,       # pad to the longest sentence in this batch
            truncation=True,
            max_length=512
        ).to(device)

        logits = model(**encoded).logits                     # (len(batch), num_labels)
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
        pred_ids = logits.argmax(dim=-1).tolist()            # one class id per sentence

        predicted_labels.extend(model.config.id2label[pred_id] for pred_id in pred_ids)
        predicted_probs.extend(probs)                        # one 1-D array per sentence
        progress.update(len(batch))

# Every sentence must have produced exactly one prediction.
assert len(predicted_labels) == len(predicted_probs) == len(df)

# Add results to DF
df["predicted_label"] = predicted_labels
df["prediction_probs"] = predicted_probs

print("\nPredictions complete.\n")
print(df.head())

Running predictions:   0%|          | 0/1967 [00:00<?, ?it/s]

Running predictions: 100%|██████████| 1967/1967 [00:57<00:00, 34.02it/s]



Predictions complete.

  sentiment                                           sentence  \
0  negative  The international electronic industry company ...   
1  positive  With the new production plant the company woul...   
2  positive  According to the company 's updated strategy f...   
3  positive  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...   
4  positive  For the last quarter of 2010 , Componenta 's n...   

  predicted_label           prediction_probs  
0        NEGATIVE  [0.96732426, 0.032675773]  
1        POSITIVE  [0.0029164762, 0.9970835]  
2        POSITIVE     [0.13829306, 0.861707]  
3        POSITIVE    [0.4506162, 0.54938376]  
4        NEGATIVE  [0.99395835, 0.006041621]  


# MODEL STATISTICS

In [12]:
# Compare the model's predictions with the labelled sentiment and report
# accuracy, precision, recall, F1 (scored for the "positive" class) and a
# confusion matrix.

# The model's labels are uppercase ("POSITIVE"/"NEGATIVE"); lowercase both
# columns so the two compare equal. The columns are overwritten in place;
# re-running the cell is harmless because lowercasing is idempotent.
df["sentiment"] = df["sentiment"].str.lower()
df["predicted_label"] = df["predicted_label"].str.lower()

y_true = df["sentiment"]
y_pred = df["predicted_label"]

# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Precision, Recall, F1
precision = precision_score(y_true, y_pred, pos_label="positive")
recall = recall_score(y_true, y_pred, pos_label="positive")
f1 = f1_score(y_true, y_pred, pos_label="positive")

print("\n=== METRICS ===")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")

# Rows are true labels, columns are predicted labels. crosstab only emits the
# labels it actually observed, so a class that is never predicted (or never
# present) would silently vanish; add the expected labels back as zero counts
# while keeping any other label that did occur.
expected_labels = ["negative", "positive"]
conf_matrix = pd.crosstab(y_true, y_pred)
conf_matrix = conf_matrix.reindex(
    index=conf_matrix.index.union(expected_labels),
    columns=conf_matrix.columns.union(expected_labels),
    fill_value=0,
)
print("\nConfusion Matrix:\n", conf_matrix)


=== METRICS ===
Accuracy : 0.7361
Precision: 0.9839
Recall   : 0.6295
F1-score : 0.7678

Confusion Matrix:
 predicted_label  negative  positive
sentiment                          
negative              590        14
positive              505       858
