In [1]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob

# Load dataset
file_path = r"C:\Users\HP\Downloads\archive (6)\Sentiment_Stock_data.csv"
data = pd.read_csv(file_path)

# Inspect column names
print("Column Names:", data.columns)

Column Names: Index(['Unnamed: 0', 'Sentiment', 'Sentence'], dtype='object')


In [3]:
# Step 1: Data Preprocessing
# Drop rows with missing values in 'Sentence' and 'Sentiment'
data.dropna(subset=['Sentence', 'Sentiment'], inplace=True)

# Clean the text data
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetic characters
    text = text.lower().strip()  # Convert to lowercase and strip spaces
    return text

data['Cleaned_Text'] = data['Sentence'].apply(clean_text)

# Step 2: Sentiment Analysis (optional, if not already done)
# Map Sentiment column to ensure binary labels (0 or 1)
data['Sentiment'] = data['Sentiment'].astype(int)
# Step 3: Model Development
# Feature extraction
X = data['Cleaned_Text']
y = data['Sentiment']

# Convert text to feature vectors using TF-IDF
vectorizer = CountVectorizer(stop_words='english')
X_counts = vectorizer.fit_transform(X)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.5327356321839081
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.44      0.48     10630
           1       0.54      0.62      0.58     11120

    accuracy                           0.53     21750
   macro avg       0.53      0.53      0.53     21750
weighted avg       0.53      0.53      0.53     21750



In [4]:
# Import necessary libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Initialize the model
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Step 5: Predictions and Evaluation
y_pred = xgb_model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.5376091954022989
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.12      0.20     10630
           1       0.53      0.93      0.67     11120

    accuracy                           0.54     21750
   macro avg       0.58      0.53      0.44     21750
weighted avg       0.58      0.54      0.44     21750



In [17]:
!pip install transformers
!pip install datasets
!pip uninstall pyarrow -y
!pip install pyarrow --force-reinstall
!pip install --upgrade pandas datasets pyarrow


Found existing installation: pyarrow 18.1.0
Uninstalling pyarrow-18.1.0:
  Successfully uninstalled pyarrow-18.1.0
Collecting pyarrow
  Using cached pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Using cached pyarrow-18.1.0-cp312-cp312-win_amd64.whl (25.1 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-18.1.0
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.5 MB 3.2 MB/s eta 0:00:04
   ---- ----------------------------------- 1.2/11.5 MB 15.0 MB/s eta 0:00:01
   ----- ---------------------------------- 1.4/11.5 MB 11.5 MB/s eta 0:00:01
   ------ --------------------------------- 1.9/11.5 MB 10.9 MB/s eta 0:00:01
   ------- -------------------------------- 2.0/11.5 MB 9.9 MB/s eta 0:00:01
   -------- ------------------------------- 

In [20]:
!pip install transformers
!pip install datasets
!pip uninstall pyarrow -y
!pip install pyarrow --force-reinstall
!pip install --upgrade pandas datasets pyarrow
import pyarrow as pa
import pyarrow.parquet as pq
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from datasets import load_metric
import torch

# Load and preprocess dataset
data['label'] = data['Sentiment'].astype(int)
dataset = Dataset.from_pandas(data[['Cleaned_Text', 'label']])

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples['Cleaned_Text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

# Load model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Load accuracy metric
metric = load_metric("accuracy")

# Custom compute_metrics function for accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), axis=1)
    accuracy = metric.compute(predictions=predictions, references=labels)
    return accuracy

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
evaluation_results = trainer.evaluate()

# Print accuracy
print("Evaluation Results:", evaluation_results)
print("Accuracy:", evaluation_results['eval_accuracy'])


Found existing installation: pyarrow 18.1.0
Uninstalling pyarrow-18.1.0:
  Successfully uninstalled pyarrow-18.1.0
Collecting pyarrow
  Using cached pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Using cached pyarrow-18.1.0-cp312-cp312-win_amd64.whl (25.1 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-18.1.0


ImportError: The pyarrow installation is not built with support for the Parquet file format (DLL load failed while importing _parquet: The specified procedure could not be found.)