In [1]:
!pip install transformers



In [2]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f.
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9598048329353333}]

In [3]:
classifier("i hate to hate and love to love and live with ppeace")

[{'label': 'POSITIVE', 'score': 0.9988020658493042}]

In [5]:
import pandas as pd
column_names = ['com_id', 'com_name', 'cus_target', 'cus_comment']
df = pd.read_csv('/content/twitter_training.csv', names=column_names, header=None)
df.head()

Unnamed: 0,com_id,com_name,cus_target,cus_comment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   com_id       74682 non-null  int64 
 1   com_name     74682 non-null  object
 2   cus_target   74682 non-null  object
 3   cus_comment  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [13]:
column_names = ['com_name', 'cus_target', 'cus_comment']
for i in column_names:
  df[i] = df[i].astype(str)
  df[i] = df[i].str.lower()
df.dropna(inplace=True)

In [None]:
df['cus_comment2'] = df['cus_comment'].apply(classifier)

comparing = df.groupby('cus_name')[['cus_comment2', 'cus_target']].agg(lambda x: x.value_counts().index[0])

In [None]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

def processing(text):
    tokens = word_tokenize(text.lower())
    # Filter stops and non-alphabetic (punctuation)
    return [w for w in tokens if w not in stop_words and w.isalpha()]

# --- CRITICAL STEP: Create a Vocabulary ---
all_tokens = []
for comment in df['cus_comment']:
    all_tokens.extend(processing(comment))

vocab = {word: i+1 for i, word in enumerate(set(all_tokens))} # i+1 to save 0 for padding
vocab['<PAD>'] = 0
VOCAB_SIZE = len(vocab)

# Map target labels to numerical IDs
# First, we need  get all unique target labels
unique_targets = df['cus_target'].unique().tolist()
target_to_id = {label: i for i, label in enumerate(unique_targets)}
id_to_target = {i: label for label, i in target_to_id.items()}

# Transform the 'cus_target' column to numerical IDs
df['cus_target_id'] = df['cus_target'].map(target_to_id)

class NeuralNetwork(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # 1. Embedding layer turns Word IDs into dense vectors (math space)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 2. LSTM processes the sequence
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # 3. Linear layer maps hidden state to your classes
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text shape: [batch_size, sequence_length]
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)

        # We only care about the last hidden state (the summary of the sentence)
        last_hidden = hidden[-1]
        return self.fc(last_hidden)

# Initialize Model
# vocab_size, embed_dim, hidden_dim, output_dim (number of unique target classes)
model = NeuralNetwork(VOCAB_SIZE, 100, 256, len(unique_targets))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    for i in range(len(df)):
        # 1. Convert text to tokens
        tokens = processing(df['cus_comment'][i])

        # 2. Convert tokens to IDs using our vocab dictionary
        ids = [vocab[w] for w in tokens if w in vocab]
        if not ids: continue # Skip empty processed sentences

        # 3. Prepare Tensors
        inputs = torch.tensor([ids], dtype=torch.long) # Shape [1, seq_len]
        # Use the numerical target ID
        label = torch.tensor([df['cus_target_id'][i]], dtype=torch.long)

        # 4. Standard PyTorch Training Step
        optimizer.zero_grad()
        outputs = model(inputs) # Forward pass
        loss = criterion(outputs, label)
        loss.backward()         # Backpropagation
        optimizer.step()        # Update weights

        if i % 100 == 0:
            model.eval() # Set model to evaluation mode (turns off Dropout)
            with torch.no_grad():
                # Get a test sentence
                test_tokens = processing(df['cus_comment'][i])
                test_ids = [vocab[w] for w in test_tokens if w in vocab]

                if test_ids:
                    test_inputs = torch.tensor([test_ids], dtype=torch.long)
                    test_outputs = model(test_inputs)

                    # Convert raw scores (logits) to a prediction (0 or 1)
                    prediction = torch.argmax(test_outputs, dim=1).item()
                    # Get the actual original string label for printing
                    actual_id = df['cus_target_id'][i]
                    actual_label = id_to_target[actual_id]
                    predicted_label = id_to_target[prediction]

                    print(f"--- Check at Batch {i} ---")
                    print(f"Comment: {df['cus_comment'][i][:50]}...")
                    print(f"Predict: {predicted_label} | Actual: {actual_label}")

            model.train() # Set model back to training mode
            print(f"Epoch {epoch} | Batch {i} | Loss: {loss.item():.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


--- Check at Batch 0 ---
Comment: im getting on borderlands and i will murder you al...
Predict: positive | Actual: positive
Epoch 0 | Batch 0 | Loss: 1.4444
--- Check at Batch 100 ---
Comment: deep grounded almost looked pretty cool even despi...
Predict: negative | Actual: negative
Epoch 0 | Batch 100 | Loss: 1.1386
--- Check at Batch 200 ---
Comment: fuck it. pic.wikipedia.org / wav1bacr5j...
Predict: positive | Actual: negative
Epoch 0 | Batch 200 | Loss: 2.2771
--- Check at Batch 300 ---
Comment: i disagree. psycho krieg and the fantastic fusterc...
Predict: positive | Actual: positive
Epoch 0 | Batch 300 | Loss: 0.9696
--- Check at Batch 400 ---
Comment: as extremely amazing that as this is.... 3 it's be...
Predict: positive | Actual: positive
Epoch 0 | Batch 400 | Loss: 0.9133
--- Check at Batch 500 ---
Comment: gb, love, and tentacles is out now, and here are 5...
Predict: neutral | Actual: neutral
Epoch 0 | Batch 500 | Loss: 0.5232
--- Check at Batch 600 ---
Comment: well this