In [1]:
# Colab-only: mount Google Drive at /content/drive so the CSV files read and
# written by the cells below (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

# Load the train and test splits from Google Drive
train_df = pd.read_csv('/content/drive/MyDrive/train_BERT.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test_BERT.csv')

# The dataset construction further down reads the input text from the
# 'combined_headlines' column and the class label from the 'Label' column;
# adjust those names there if your CSV files are structured differently.

class CustomDataset(Dataset):
    """Map-style dataset pairing raw texts with integer labels.

    Each item is tokenized on access with the supplied tokenizer, padded or
    truncated to ``max_len`` tokens, and returned as a dict of 1-D tensors
    plus the original text.

    Args:
        texts: Indexable collection of input texts (items are passed through ``str``).
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Fixed sequence length requested from the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per text entry.
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized sample at position ``item``."""
        raw_text = str(self.texts[item])
        target = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # The tokenizer is asked for 'pt' tensors; flatten them so each field
        # is 1-D before it is handed to the DataLoader for batching.
        sample = {'text': raw_text}
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fixed token length every example is padded/truncated to (tunable)
max_len = 512

# Wrap each split: headlines are the model input, Label is the target
train_dataset = CustomDataset(
    train_df['combined_headlines'].to_numpy(),
    train_df['Label'].to_numpy(),
    tokenizer,
    max_len,
)

test_dataset = CustomDataset(
    test_df['combined_headlines'].to_numpy(),
    test_df['Label'].to_numpy(),
    tokenizer,
    max_len,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
# Preview the training frame to check its columns and date range
display(train_df)

Unnamed: 0,Date,Label,combined_headlines
0,2000-08-30,0,the best duel scene leader foreign control of ...
1,2000-08-31,0,titus hospital case lse hits back at jingoism ...
2,2000-09-01,1,only two fresh start schools improved their gc...
3,2000-09-05,1,united go top via fortune double suker outshin...
4,2000-09-06,1,cricket scoreboard roundup somerset slump hand...
...,...,...,...
1791,2007-12-24,1,gerrard wanted henry premier league middlesbro...
1792,2007-12-26,1,police officer dies after attack the more mega...
1793,2007-12-27,1,blears wants petitions to trigger response let...
1794,2007-12-28,0,police officer with heart disease died after m...


In [4]:
# Preview the test frame to check its columns and date range
display(test_df)

Unnamed: 0,Date,Label,combined_headlines
0,2008-01-02,0,league tables to show pregcse high achievers g...
1,2008-01-03,0,how tim schafer aims to rock the virtual world...
2,2008-01-04,1,britney spears loses custody of her children d...
3,2008-01-07,0,city heading for toughest times in a generatio...
4,2008-01-08,1,believers flock to see obama in the flesh inqu...
...,...,...,...
2136,2016-06-27,0,barclays and rbs shares suspended from trading...
2137,2016-06-28,1,scientists to australia if you want to save t...
2138,2016-06-29,1,explosion at airport in istanbul yemeni former...
2139,2016-06-30,1,jamaica proposes marijuana dispensers for tour...


In [None]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import time  # Import the time module

# Check if a GPU is available and set PyTorch to use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single source of truth for the number of passes over the training set.
# It sizes the learning-rate schedule AND bounds the training loop, so the
# two can no longer drift apart when one of them is edited.
NUM_EPOCHS = 10

# Load the pre-trained BERT model with one output per distinct training label
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_df.Label.unique()))
model.to(device)

# Data loaders: shuffle only the training data so test predictions stay in
# row order and can be written back onto test_df afterwards
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Optimizer and linearly decaying learning-rate schedule (no warm-up)
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW (which has no `correct_bias` argument) — confirm against
# the installed transformers version before re-running.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * NUM_EPOCHS  # one scheduler step per batch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(NUM_EPOCHS):
    # Re-assert training mode every epoch so the loop stays correct even if
    # an evaluation pass (model.eval()) is later added between epochs.
    model.train()
    start_time = time.time()  # Start time of the epoch

    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    end_time = time.time()  # End time of the epoch
    epoch_duration = end_time - start_time  # Calculate the duration of the epoch

    # Report the mean per-batch loss for the epoch
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, Time: {epoch_duration:.2f} seconds')


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.7037282631132338, Time: 156.89 seconds
Epoch 2, Loss: 0.6999405166837904, Time: 159.47 seconds
Epoch 3, Loss: 0.6979216180907355, Time: 159.98 seconds
Epoch 4, Loss: 0.6946628581153022, Time: 159.86 seconds
Epoch 5, Loss: 0.6952204097641839, Time: 159.58 seconds
Epoch 6, Loss: 0.6934781914287144, Time: 159.46 seconds
Epoch 7, Loss: 0.6935699955622355, Time: 159.62 seconds
Epoch 8, Loss: 0.6930968697865804, Time: 159.42 seconds
Epoch 9, Loss: 0.6945347454812791, Time: 159.43 seconds
Epoch 10, Loss: 0.6935936358239916, Time: 159.47 seconds


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from torch.nn.functional import softmax


# Score the test set: collect the softmax probability of class index 1 for
# every row, in loader order.
model.eval()
test_probabilities = []

# Resolve the target device once instead of re-evaluating it for each tensor
eval_device = 'cuda' if torch.cuda.is_available() else 'cpu'

with torch.no_grad():  # inference only, no gradients needed
    for eval_batch in test_loader:
        token_ids = eval_batch['input_ids'].to(eval_device)
        token_mask = eval_batch['attention_mask'].to(eval_device)
        logits = model(token_ids, attention_mask=token_mask).logits
        class_probs = softmax(logits, dim=1)
        # Column 1 is the positive class; move to CPU before converting to numpy
        test_probabilities.extend(class_probs[:, 1].cpu().numpy())




In [None]:
# Attach the per-row positive-class probabilities to test_df (mutates the
# frame in place) and write it to Drive without the index column.
test_df['Probability_Positive'] = test_probabilities
test_df.to_csv('/content/drive/MyDrive/test_with_probabilities.csv', index=False)

# Confirmation message so the cell output shows the export completed
print("Test dataset with probabilities for the positive class has been saved.")

Test dataset with probabilities for the positive class has been saved.


In [None]:
# Reload the exported predictions and express the probability as a percentage
df = pd.read_csv('/content/drive/MyDrive/test_with_probabilities.csv')
df['Probability_Positive'] = df['Probability_Positive'].mul(100)
df.head()

Unnamed: 0,Date,Label,combined_headlines,Probability_Positive
0,2008-01-02,0,league tables to show pregcse high achievers g...,51.78943
1,2008-01-03,0,how tim schafer aims to rock the virtual world...,51.78942
2,2008-01-04,1,britney spears loses custody of her children d...,51.789457
3,2008-01-07,0,city heading for toughest times in a generatio...,51.78946
4,2008-01-08,1,believers flock to see obama in the flesh inqu...,51.78945


In [None]:
# Write the current `df` to Drive as BD.csv, omitting the index column
df.to_csv('/content/drive/MyDrive/BD.csv', index=False)

In [None]:
# Rebind `df` to the gold price dataset and preview its first rows
df = pd.read_csv('/content/drive/MyDrive/gold_LSTM.csv')
df.head()

Unnamed: 0,Date,gold_Open,gold_High,gold_Low,gold_Close,gold_Adj Close,gold_Volume,Probability_Positive
0,2008-01-02,848.700012,857.799988,846.400024,857.0,857.0,130,51.78943
1,2008-01-03,863.0,865.5,856.299988,866.400024,866.400024,181,51.78942
2,2008-01-04,861.400024,865.5,860.200012,863.099976,863.099976,28,51.789457
3,2008-01-07,860.900024,860.900024,857.400024,859.599976,859.599976,4,51.78946
4,2008-01-08,861.599976,879.400024,861.599976,878.0,878.0,21,51.78945


In [None]:
# `features` used to be assigned only by the LSTM cells further down, so on a
# fresh kernel (Restart & Run All) this cell raised NameError. Derive it here
# from the gold frame loaded in the previous cell, using the same column
# selection the gold LSTM cell applies: everything except the target and Date.
features = df.drop(['gold_Close', 'Date'], axis=1)
display(features)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import matplotlib.pyplot as plt

# Load the dataset
file_path = '/content/drive/MyDrive/sp_LSTM.csv'  # Update this to your dataset's actual path
df = pd.read_csv(file_path)

# 'sp_Close' is the prediction target; 'Date' is excluded from the model
# inputs and only used for the x-axis of the plot
features = df.drop(['sp_Close', 'Date'], axis=1)
target = df[['sp_Close']]

# Work out which rows form the chronological training portion BEFORE scaling.
# The same two unshuffled splits are applied to the scaled arrays below, so
# these positions match X_train / y_train exactly.
train_calib_rows = train_test_split(np.arange(len(df)), test_size=0.2, shuffle=False)[0]
train_rows = train_test_split(train_calib_rows, test_size=0.1, shuffle=False)[0]

# Normalize the features and target. The scalers are fitted on the training
# rows only: fitting on the full series would leak the min/max of the
# calibration and test periods into training. Later rows may therefore fall
# outside [0, 1] after scaling.
scaler_features = MinMaxScaler(feature_range=(0, 1))
scaler_features.fit(features.iloc[train_rows])
scaled_features = scaler_features.transform(features)
scaler_target = MinMaxScaler(feature_range=(0, 1))
scaler_target.fit(target.iloc[train_rows])
scaled_target = scaler_target.transform(target)

# Split data into training+calibration and testing sets (first 80% / last 20%, order preserved)
X_train_calib, X_test, y_train_calib, y_test = train_test_split(scaled_features, scaled_target, test_size=0.2, shuffle=False)

# Further split training+calibration into training (first 90%) and calibration (last 10%)
X_train, X_calib, y_train, y_calib = train_test_split(X_train_calib, y_train_calib, test_size=0.1, shuffle=False)

# Generate sequences for training: each sample is `look_back` consecutive rows
# of [features, target] and the label is the target of the row that follows
look_back = 30  # Number of days to look back for prediction
train_generator = TimeseriesGenerator(np.hstack((X_train, y_train)), y_train, length=look_back, batch_size=20)

# Model Definition with Multivariate Input
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(look_back, X_train.shape[1] + 1)),  # +1 for the target variable
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1)
])

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Model Training
model.fit(train_generator, epochs=100, verbose=1)

# Prepare calibration data generator and compute nonconformity scores
# (absolute residuals on data the model was not trained on)
calibration_generator = TimeseriesGenerator(np.hstack((X_calib, y_calib)), y_calib, length=look_back, batch_size=1)
calibration_predictions = model.predict(calibration_generator)
actual_calibration_values = np.array([y for x, y in calibration_generator])
nonconformity_scores = np.abs(calibration_predictions.flatten() - actual_calibration_values.flatten())

# Prepare the test data generator
test_generator = TimeseriesGenerator(np.hstack((X_test, y_test)), y_test, length=look_back, batch_size=1)

# Predict on the test set
test_predictions_scaled = model.predict(test_generator)

# Apply split conformal prediction. `alpha` is the target miscoverage rate, so
# the nominal coverage is 1 - alpha (alpha = 0.5 -> 50% intervals).
alpha = 0.5
coverage_pct = int(round((1 - alpha) * 100))
# Finite-sample correction: take the ceil((n + 1)(1 - alpha)) / n empirical
# quantile of the calibration scores (capped at 1) and round up to an observed
# score rather than interpolating.
# NOTE(review): the conformal coverage guarantee assumes exchangeable data;
# for a time series it should be treated as approximate.
n_calib = len(nonconformity_scores)
quantile_level = min(1.0, np.ceil((n_calib + 1) * (1 - alpha)) / n_calib)
quantile = np.quantile(nonconformity_scores, quantile_level, method='higher')
test_lower_bounds = test_predictions_scaled - quantile
test_upper_bounds = test_predictions_scaled + quantile

# Inverse transform to original scale
test_predictions = scaler_target.inverse_transform(test_predictions_scaled)
test_predicted_intervals_lower = scaler_target.inverse_transform(test_lower_bounds)
test_predicted_intervals_upper = scaler_target.inverse_transform(test_upper_bounds)
# The first `look_back` test rows only serve as input context, so the
# predictions line up with the targets from position `look_back` onward
actual_test_prices = scaler_target.inverse_transform(y_test[look_back:])

# Visualization
plt.figure(figsize=(12, 6))
dates = pd.to_datetime(df['Date']).iloc[-len(actual_test_prices):]  # test rows are the tail of the frame
plt.plot(dates, actual_test_prices, label='Actual Prices', color='blue')
plt.plot(dates, test_predictions, label='Predicted Prices', linestyle='--', color='red')
plt.fill_between(dates, test_predicted_intervals_lower.flatten(), test_predicted_intervals_upper.flatten(), color='grey', alpha=0.2, label=f'Prediction Interval ({coverage_pct}%)')
plt.title(f'Price Prediction with {coverage_pct}% Prediction Intervals')
plt.xlabel('Date')
plt.ylabel('SP_Price')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import matplotlib.pyplot as plt

# Load the dataset
file_path = '/content/drive/MyDrive/gold_LSTM.csv'
df = pd.read_csv(file_path)

# Prepare the data: 'gold_Close' is the prediction target; 'Date' is excluded
# from the model inputs and only used for the x-axis of the plot
features = df.drop(['gold_Close', 'Date'], axis=1)  # Adjust if your dataset has different columns
target = df[['gold_Close']]

# Work out which rows form the chronological training portion BEFORE scaling.
# The same two unshuffled splits are applied to the scaled arrays below, so
# these positions match X_train / y_train exactly.
train_calib_rows = train_test_split(np.arange(len(df)), test_size=0.2, shuffle=False)[0]
train_rows = train_test_split(train_calib_rows, test_size=0.1, shuffle=False)[0]

# Normalize the features and target. The scalers are fitted on the training
# rows only: fitting on the full series would leak the min/max of the
# calibration and test periods into training. Later rows may therefore fall
# outside [0, 1] after scaling.
scaler_features = MinMaxScaler(feature_range=(0, 1))
scaler_features.fit(features.iloc[train_rows])
scaled_features = scaler_features.transform(features)
scaler_target = MinMaxScaler(feature_range=(0, 1))
scaler_target.fit(target.iloc[train_rows])
scaled_target = scaler_target.transform(target)

# Split data into training+calibration and testing sets (first 80% / last 20%, order preserved)
X_train_calib, X_test, y_train_calib, y_test = train_test_split(scaled_features, scaled_target, test_size=0.2, shuffle=False)

# Further split training+calibration into training (first 90%) and calibration (last 10%)
X_train, X_calib, y_train, y_calib = train_test_split(X_train_calib, y_train_calib, test_size=0.1, shuffle=False)

# Generate sequences for training: each sample is `look_back` consecutive rows
# of [features, target] and the label is the target of the row that follows
look_back = 30
train_generator = TimeseriesGenerator(np.hstack((X_train, y_train)), y_train, length=look_back, batch_size=20)

# Model Definition with Multivariate Input
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(look_back, X_train.shape[1] + 1)),  # +1 for the target variable
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1)
])

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(train_generator, epochs=100, verbose=1)

# Prepare calibration data generator and compute nonconformity scores
# (absolute residuals on data the model was not trained on)
calibration_generator = TimeseriesGenerator(np.hstack((X_calib, y_calib)), y_calib, length=look_back, batch_size=1)
calibration_predictions = model.predict(calibration_generator)
actual_calibration_values = np.array([y for x, y in calibration_generator])
nonconformity_scores = np.abs(calibration_predictions.flatten() - actual_calibration_values.flatten())

# Prepare the test data generator
test_generator = TimeseriesGenerator(np.hstack((X_test, y_test)), y_test, length=look_back, batch_size=1)

# Predict on the test set
test_predictions_scaled = model.predict(test_generator)

# Apply split conformal prediction. `alpha` is the target miscoverage rate, so
# the nominal coverage is 1 - alpha (alpha = 0.5 -> 50% intervals).
alpha = 0.5
coverage_pct = int(round((1 - alpha) * 100))
# Finite-sample correction: take the ceil((n + 1)(1 - alpha)) / n empirical
# quantile of the calibration scores (capped at 1) and round up to an observed
# score rather than interpolating.
# NOTE(review): the conformal coverage guarantee assumes exchangeable data;
# for a time series it should be treated as approximate.
n_calib = len(nonconformity_scores)
quantile_level = min(1.0, np.ceil((n_calib + 1) * (1 - alpha)) / n_calib)
quantile = np.quantile(nonconformity_scores, quantile_level, method='higher')
test_lower_bounds = test_predictions_scaled - quantile
test_upper_bounds = test_predictions_scaled + quantile

# Inverse transform to original scale
test_predictions = scaler_target.inverse_transform(test_predictions_scaled)
test_predicted_intervals_lower = scaler_target.inverse_transform(test_lower_bounds)
test_predicted_intervals_upper = scaler_target.inverse_transform(test_upper_bounds)
# The first `look_back` test rows only serve as input context, so the
# predictions line up with the targets from position `look_back` onward
actual_test_prices = scaler_target.inverse_transform(y_test[look_back:])

# Visualization
plt.figure(figsize=(12, 6))
dates = pd.to_datetime(df['Date']).iloc[-len(actual_test_prices):]  # test rows are the tail of the frame
plt.plot(dates, actual_test_prices, label='Actual Prices', color='blue')
plt.plot(dates, test_predictions, label='Predicted Prices', linestyle='--', color='red')
plt.fill_between(dates, test_predicted_intervals_lower.flatten(), test_predicted_intervals_upper.flatten(), color='grey', alpha=0.2, label=f'Prediction Interval ({coverage_pct}%)')
plt.title(f'Price Prediction with {coverage_pct}% Prediction Intervals')
plt.xlabel('Date')
plt.ylabel('Gold_Price')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
