In [None]:
import pandas as pd

import os

# Location of the crypto-news CSV. The CRYPTONEWS_CSV environment variable can
# override it so the notebook is not tied to one machine's directory layout;
# the original absolute path remains the default.
file_path = os.environ.get('CRYPTONEWS_CSV', 'C:/Users/emdfa/Downloads/cryptonews.csv')
df = pd.read_csv(file_path)

# Keep only the columns used by the rest of the notebook
df = df[['title', 'text', 'sentiment']]

# Parse the 'sentiment' column to extract 'class' and 'polarity'
import ast

def parse_sentiment(sentiment_str):
    """Extract the sentiment class and polarity from one raw 'sentiment' cell.

    Parameters
    ----------
    sentiment_str : str or dict
        String holding a Python dict literal, e.g.
        "{'class': 'positive', 'polarity': 0.5}". An already-parsed dict is
        accepted as well.

    Returns
    -------
    tuple
        (sentiment_class, polarity). An element is None when its key is absent.
        Both are None when the cell is missing (NaN/None), is not valid literal
        syntax, or does not evaluate to a dict.
    """
    if isinstance(sentiment_str, dict):
        sentiment_dict = sentiment_str
    else:
        try:
            # literal_eval only evaluates literals; it never runs arbitrary code.
            sentiment_dict = ast.literal_eval(sentiment_str)
        except (ValueError, SyntaxError, TypeError):
            # Missing cell (float NaN / None) or a string that is not a literal.
            return None, None
        if not isinstance(sentiment_dict, dict):
            return None, None

    return sentiment_dict.get('class'), sentiment_dict.get('polarity')

# Parse every 'sentiment' cell once and expand the (class, polarity) tuples into
# two columns. Building a single DataFrame from the list of tuples avoids
# `.apply(pd.Series)`, which constructs a separate Series object for every row.
parsed_sentiment = pd.DataFrame(
    df['sentiment'].apply(parse_sentiment).tolist(),
    index=df.index,
    columns=['sentiment_class', 'polarity'],
)
df[['sentiment_class', 'polarity']] = parsed_sentiment

# Drop the original 'sentiment' column
df = df.drop(columns=['sentiment'])

# Map sentiment classes to numeric values (negative=0, neutral=1, positive=2)
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment_class'] = df['sentiment_class'].map(sentiment_mapping)

# Classes that are missing or not in the mapping come out of .map() as NaN, which
# turns the column into floats instead of integer class ids. Drop those rows
# (reporting how many) and store the labels as integers.
unlabeled = df['sentiment_class'].isna()
if unlabeled.any():
    print(f'Dropping {int(unlabeled.sum())} rows without a recognised sentiment class')
    df = df.loc[~unlabeled].copy()
df['sentiment_class'] = df['sentiment_class'].astype('int64')

# Check the processed dataset
print(df.head())


In [None]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
# It is kept as a module-level object because preprocess() reads it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess(text, max_len=128):
    """Tokenize one string with the module-level BERT tokenizer.

    Parameters
    ----------
    text : str
        Raw text to encode.
    max_len : int, default 128
        Passed as `max_length`; the call also requests truncation and
        padding to that length.

    Returns
    -------
    The object returned by `tokenizer.encode_plus`, requested as PyTorch
    tensors (`return_tensors='pt'`) with the attention mask included.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **encode_options)

# Tokenize the 'text' column of every row (the title is not encoded here);
# each cell of 'inputs' holds the encoding returned by preprocess().
df['inputs'] = df['text'].apply(preprocess)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

print(train_df.head())


In [None]:
import torch
# transformers' own AdamW is deprecated and no longer exported by recent
# releases, so the optimizer comes from PyTorch instead.
from torch.optim import AdamW
from transformers import BertForSequenceClassification

# Load the BERT model for classification (three output classes)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set the device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer. eps and weight_decay are spelled out because PyTorch's
# defaults (eps=1e-8, weight_decay=1e-2) differ from the defaults of the
# transformers class this notebook used before (eps=1e-6, weight_decay=0.0).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

def create_data_loader(df, batch_size=8):
    """Build a DataLoader over the tokenized rows of `df` (train or test split).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'inputs' column whose entries expose 'input_ids' and
        'attention_mask' tensors (they are concatenated along dim 0) and a
        'sentiment_class' column holding the numeric class labels.
    batch_size : int, default 8
        Number of samples per batch.

    Returns
    -------
    torch.utils.data.DataLoader
        Yields (input_ids, attention_mask, labels) batches in row order.

    Raises
    ------
    ValueError
        If any label in 'sentiment_class' is missing.
    """
    label_column = df['sentiment_class']
    if label_column.isna().any():
        raise ValueError(
            "'sentiment_class' contains missing labels; drop or fix those rows first"
        )

    encodings = list(df['inputs'])
    input_ids = torch.cat([enc['input_ids'] for enc in encodings])
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings])
    # Cast to int64 first so the label tensor is always integer-typed, even when
    # the column is stored as floats (e.g. 0.0 / 1.0 / 2.0).
    labels = torch.tensor(label_column.astype('int64').to_numpy())

    dataset = TensorDataset(input_ids, attention_masks, labels)
    return DataLoader(dataset, batch_size=batch_size)

train_loader = create_data_loader(train_df)
test_loader = create_data_loader(test_df)

# The loader built above iterates in fixed row order. Re-wrap the training
# dataset so its batches are reshuffled at the start of every epoch; the seeded
# generator keeps the shuffling repeatable. The test loader keeps its order.
train_loader = DataLoader(
    train_loader.dataset,
    batch_size=train_loader.batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Training loop
epochs = 3
num_train_examples = len(train_df)

for epoch in range(epochs):
    model.train()

    total_loss = 0          # sum of the per-batch losses in this epoch
    correct_predictions = 0  # plain Python int, not a tensor

    for batch in train_loader:
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

        # Zero out gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attention_mask,
            labels=b_labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        # .item() keeps the running count a Python number, so the epoch summary
        # prints a plain float rather than a tensor repr.
        correct_predictions += (preds == b_labels).sum().item()

    # Guard against an empty training set instead of dividing by zero
    accuracy = correct_predictions / num_train_examples if num_train_examples else 0.0
    print(f'Epoch {epoch + 1}, Loss: {total_loss}, Accuracy: {accuracy}')


In [None]:
from sklearn.metrics import classification_report

def evaluate(model, test_loader):
    """Print a per-class classification report for `model` on `test_loader`.

    Parameters
    ----------
    model
        Called with `input_ids` and `attention_mask`; its output must expose
        `.logits`. It is switched to eval mode here.
    test_loader : iterable
        Yields (input_ids, attention_mask, labels) tensor batches.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    class_names = ['negative', 'neutral', 'positive']
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)
            outputs = model(
                input_ids=b_input_ids,
                attention_mask=b_attention_mask
            )
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(b_labels.cpu().tolist())

    # Pin the report to all three class ids. Without `labels`, scikit-learn infers
    # the classes from the data and raises a ValueError when their count differs
    # from len(target_names), e.g. when one class never occurs in the test labels
    # or the predictions.
    report = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
    print(report)

# Print the classification report for the held-out test loader
evaluate(model=model, test_loader=test_loader)


In [None]:
def predict_sentiment(model, text):
    """Classify one piece of text and return its sentiment label.

    Parameters
    ----------
    model
        Called with `input_ids` and `attention_mask`; its output must expose
        `.logits`. It is switched to eval mode here.
    text : str
        Text to classify; it is tokenized by `preprocess` with its default length.

    Returns
    -------
    str
        'negative', 'neutral' or 'positive', chosen by the largest logit.
    """
    class_names = ['negative', 'neutral', 'positive']

    model.eval()
    encoded = preprocess(text)
    with torch.no_grad():
        result = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        best_index = torch.argmax(result.logits, dim=1).cpu().item()

    return class_names[best_index]

# Classify a sample headline with the trained model
new_headline = "Bitcoin surges to new all-time high"
predicted_label = predict_sentiment(model, new_headline)
print(f"Sentiment: {predicted_label}")
