In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


LSTM Version

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

# Rename the columns
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Keep only the 'text' and 'target' columns
df = df[['text', 'target']]

# Convert the target values to integers (0 = negative, 2 = neutral, 4 = positive)
df['target'] = df['target'].replace({0: 0, 2: 1, 4: 2})

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42)

# Split the data into training and validation sets
train_size = int(0.8 * len(df))
train_data = df[:train_size]
val_data = df[train_size:]

# Tokenize the tweets
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data['text'])

# Convert the text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
val_sequences = tokenizer.texts_to_sequences(val_data['text'])

# Pad the sequences to a fixed length
max_seq_length = 100
train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_seq_length, padding='post', truncating='post')
val_sequences = tf.keras.preprocessing.sequence.pad_sequences(val_sequences, maxlen=max_seq_length, padding='post', truncating='post')

# Define the model architecture
input_layer = Input(shape=(max_seq_length,))
embedding_layer = Embedding(input_dim=10000, output_dim=64)(input_layer)
lstm_layer = LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
dropout_layer = Dropout(rate=0.5)(lstm_layer)
output_layer = Dense(units=3, activation='softmax')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
num_epochs = 10
batch_size = 64
history = model.fit(train_sequences, train_data['target'], batch_size=batch_size, epochs=num_epochs, validation_data=(val_sequences, val_data['target']))

# Save the model
model.save(' transformer_model.h5')

# Tokenize the tweets to predict
#tweets = ['This is a positive tweet', 'This is a negative tweet', 'This tweet is neutral']

#tokenized_tweets = tokenizer.texts_to_sequences(tweets)
#padded_tweets = tf.keras.preprocessing.sequence.pad_sequences(tokenized_tweets, maxlen=max_seq_length, padding='post', truncating='post')

# Predict the subjectivity and polarity of the tweets
#predictions = model.predict(padded_tweets)

#subjectivity = ['subjective' if np.argmax(pred) != 1 else 'objective' for pred in predictions]
#polarity = ['positive' if np.argmax(pred) == 2 else 'negative' for pred in predictions]

#print('Subjectivity:', subjectivity)
#print('Polarity:', polarity)


Epoch 1/10

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('bitcoin_tweets.csv')

# Tokenize the tweets
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Tweet'])

# Convert the text to sequences
sequences = tokenizer.texts_to_sequences(df['Tweet'])

# Pad the sequences to a fixed length
max_seq_length = 100
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

# Predict the subjectivity and polarity of the tweets
predictions = model.predict(padded_sequences)

subjectivity = ['subjective' if np.argmax(pred) != 1 else 'objective' for pred in predictions]
polarity = ['positive' if np.argmax(pred) == 2 else 'negative' for pred in predictions]

# Add the subjectivity and polarity columns to the original DataFrame
df['subjectivity'] = subjectivity
df['polarity'] = polarity

# Save the updated DataFrame to a new CSV file
df.to_csv('bitcoin_sentiments.csv', index=False)



Transformer version (Using MultiHeadAttention)

In [25]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

# Rename the columns
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Keep only the 'text' and 'target' columns
df = df[['text', 'target']]

# Convert the target values to integers (0 = negative, 2 = neutral, 4 = positive)
df['target'] = df['target'].replace({0: 0, 2: 1, 4: 2})

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_texts, train_labels = train_df['text'].tolist(), train_df['target'].tolist()
val_texts, val_labels = val_df['text'].tolist(), val_df['target'].tolist()

# Set the parameters for the model
sequence_length = 100
vocab_size = 10000
embedding_dim = 64
num_transformer_blocks = 2
num_heads = 4
num_classes = 3

# Create a TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=sequence_length, pad_to_max_tokens=True
)
vectorize_layer.adapt(train_texts)

In [26]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.add_attention = tf.keras.layers.Add()
        self.norm_attention = tf.keras.layers.LayerNormalization()
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(embedding_dim, activation='relu'),
            tf.keras.layers.Dense(embedding_dim)
        ])
        self.add_ffn = tf.keras.layers.Add()
        self.norm_ffn = tf.keras.layers.LayerNormalization()

    def call(self, inputs, **kwargs):
        attention_output = self.attention(query=inputs, key=inputs, value=inputs)
        attention_output = self.add_attention([inputs, attention_output])
        attention_output = self.norm_attention(attention_output)

        ffn_output = self.ffn(attention_output)
        ffn_output = self.add_ffn([attention_output, ffn_output])
        ffn_output = self.norm_ffn(ffn_output)

        return ffn_output

def build_transformer_model(sequence_length, vocab_size, embedding_dim, num_transformer_blocks, num_heads, num_classes):
    inputs = tf.keras.layers.Input(shape=(sequence_length,))
    embeddings = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    x = embeddings

    for _ in range(num_transformer_blocks):
        x = TransformerBlock(embedding_dim, num_heads)(x)

    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs)


In [None]:
# Build the model
model = build_transformer_model(sequence_length, vocab_size, embedding_dim, num_transformer_blocks, num_heads, num_classes)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
train_ds = tf.data.Dataset.from_tensor_slices((train_texts, train_labels)).batch(32).map(lambda x, y: (vectorize_layer(x), y))
val_ds = tf.data.Dataset.from_tensor_slices((val_texts, val_labels)).batch(32).map(lambda x, y: (vectorize_layer(x), y))

history = model.fit(train_ds, validation_data=val_ds, epochs=20)

# Save the model
model.save('sentiment_transformer_model')

In [28]:
# Save the model in the HDF5 format
model.save('my_transformer_model.h5')


In [31]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('bitcoin_tweets.csv')

# Extract the 'Tweet' column from the DataFrame
tweets = df['Tweet'].tolist()

# Load the pre-trained sentiment analysis model
loaded_model = tf.keras.models.load_model('sentiment_transformer_model')

# Tokenize the tweets
tokenized_tweets = vectorize_layer(tweets)

# Predict the sentiment of each tweet
sentiment_scores = loaded_model.predict(tokenized_tweets)

# Process the sentiment scores to get percentage values, subjectivity, and polarity
sentiment_results = []
for score in sentiment_scores:
    negative_percentage = score[0] * 100
    positive_percentage = score[1] * 100
    neutral_percentage = score[2] * 100
    sentiment = np.argmax(score)
    sentiment_label = 'negative' if sentiment == 0 else 'positive' if sentiment == 1 else 'neutral'
    subjectivity = 'subjective' if max(score) > 0.5 else 'objective'
    sentiment_results.append({'negative': negative_percentage, 'neutral': neutral_percentage, 'positive': positive_percentage, 'polarity': sentiment_label, 'subjectivity': subjectivity})

sentiment_df = pd.DataFrame(sentiment_results)

# Add the subjectivity, polarity, and polarity percentages columns to the original DataFrame
df['subjectivity'] = sentiment_df['subjectivity'].apply(lambda x: 1 if x == 'subjective' else 0)
df['objectivity'] = sentiment_df['subjectivity'].apply(lambda x: 1 if x == 'objective' else 0)
df['polarity'] = sentiment_df['polarity']
df['negative'] = sentiment_df['negative']
df['neutral'] = sentiment_df['neutral']
df['positive'] = sentiment_df['positive']

# Save the updated DataFrame to a new CSV file
df.to_csv('bitcoin_sentiments.csv', index=False)



In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,subjectivity,objectivity,polarity,negative,neutral,positive
0,0,2013-01-08 05:51:56-05:00,sangyoupunk,bitcoinJPY more buying pushing the price up to...,1,0,neutral,28.22884,71.770781,0.000386
1,1,2013-01-07 21:04:30-05:00,jimaylu,Bitcoin is a decentralized digital currency ba...,1,0,neutral,43.07965,56.919914,0.000436
2,2,2013-01-10 16:50:55-05:00,sangyoupunk,bitcoinJPY overnight the price pushed up to ¥1...,1,0,negative,68.091732,31.907734,0.00054
3,3,2013-01-15 02:53:23-05:00,sangyoupunk,bitcoinJPY knocking on the door of ¥1300 again...,1,0,neutral,39.862031,60.137457,0.000516
4,4,2013-01-17 20:56:15-05:00,absinthol,"In less than 24 hours, the exchange price of a...",1,0,negative,78.113067,21.886313,0.000619
