In [1]:
# import the following libraries into the Python environment

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [2]:
# Load the dataset
data = pd.read_csv('/content/judge-1377884607_tweet_product_company.csv', encoding='latin-1')

In [3]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Selecting only the 'tweet_text' and 'is_there_an_emotion_directed_at_a_brand_or_product' columns for sentiment analysis

data=data[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']]

In [5]:
# Rename columns for simplicity

data.columns = ['tweet', 'sentiment']

In [6]:
data.shape

(9093, 2)

In [8]:
# Display full content of DataFrame columns

pd.set_option('display.max_colwidth',None)

In [9]:
 data.head()

Unnamed: 0,tweet,sentiment
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


In [10]:
data['sentiment'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: sentiment, dtype: int64

# Preprocess the data

In [11]:
# Count the number of missing values in the 'tweet' column

data['tweet'].isnull().sum()

1

In [12]:
# handle missing values

data['tweet'].fillna('', inplace=True)

In [13]:
# Extracting input features (x) and target labels (y) from the dataset

x = data['tweet']
y = data['sentiment']

In [14]:
y

0                         Negative emotion
1                         Positive emotion
2                         Positive emotion
3                         Negative emotion
4                         Positive emotion
                       ...                
9088                      Positive emotion
9089    No emotion toward brand or product
9090    No emotion toward brand or product
9091    No emotion toward brand or product
9092    No emotion toward brand or product
Name: sentiment, Length: 9093, dtype: object

In [15]:
# Encode the target variable

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y)


In [16]:
y

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]], dtype=float32)

In [17]:
# Tokenize the text data using Keras Tokenizer

from keras.preprocessing import text
tokenizer=text.Tokenizer()
tokenizer.fit_on_texts(list(data['tweet']))
tokenized_text=tokenizer.texts_to_sequences(data['tweet'])

In [19]:
len(tokenized_text[0])

24

In [20]:
len(tokenized_text[1])

22

In [21]:
# Pad the tokenized_text to make all text sequences the same length (100)

from keras.utils import pad_sequences
x=pad_sequences(tokenized_text,maxlen=100)

In [22]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# SimpleRNN

In [24]:
# Define a sequential model with Embedding, SimpleRNN, Dropout, Dense, and Softmax layers

from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,SimpleRNN,Dropout

In [23]:
len(tokenizer.word_index)

10147

In [36]:
# Create a Sequential model
model = Sequential()

# Add an Embedding layer
model.add(Embedding(input_dim = len(tokenizer.word_index)+1, output_dim=128, input_length=100))

# Add a SimpleRNN layer with 32 units

model.add(SimpleRNN(32))
model.add(Dropout(0.5))

# Add a Dense layer with 50 units
#model.add(Dense(50,activation = 'relu'))
#model.add(Dropout(0.5))

# Add the final Dense layer with 4 units (for 4 classes)
model.add(Dense(4, activation='softmax'))

In [37]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [38]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          1298944   
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                5152      
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 4)                 132       
                                                                 
Total params: 1304228 (4.98 MB)
Trainable params: 1304228 (4.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
model.fit(x_train,y_train,epochs=10,validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7db477dc7e80>

In [40]:
y_pred=model.predict(x_test)



In [58]:
accuracy = model.evaluate(x_test, y_test)[1]
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 60.69%


In [59]:
thresholds = [0.25, 0.5, 0.75]  # Adjust these thresholds

# Initialize an array with zeros for the predicted labels
y_pred_classes = np.zeros_like(y_pred, dtype=int)

# Assign class labels based on thresholds
for i, threshold in enumerate(thresholds):
    y_pred_classes[:, i] = (y_pred[:, i] > threshold).astype(int)

In [60]:
from sklearn.metrics import jaccard_score


jaccard_similarity = jaccard_score(y_test, y_pred_classes, average='weighted')
print(f'Jaccard Similarity: {jaccard_similarity * 100:.2f}%')

Jaccard Similarity: 30.03%


# LSTM

In [61]:
model = Sequential()
model.add(Embedding(input_dim = len(tokenizer.word_index)+1, output_dim=128, input_length=100))

model.add(LSTM(32))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train,y_train,epochs=10,validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7db4779aead0>

In [62]:
y_pred=model.predict(x_test)



In [63]:
accuracy = model.evaluate(x_test, y_test)[1]
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 65.64%


In [64]:
thresholds = [0.25, 0.5, 0.75]  # Adjust these thresholds

# Initialize an array with zeros for the predicted labels
y_pred_classes = np.zeros_like(y_pred, dtype=int)

# Assign class labels based on thresholds
for i, threshold in enumerate(thresholds):
    y_pred_classes[:, i] = (y_pred[:, i] > threshold).astype(int)

In [65]:
from sklearn.metrics import jaccard_score


jaccard_similarity = jaccard_score(y_test, y_pred_classes, average='weighted')
print(f'Jaccard Similarity: {jaccard_similarity * 100:.2f}%')

Jaccard Similarity: 34.45%


## Implement early stopping to prevent overfitting

In [66]:
#early stopping
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Modify your model.fit call to include the early_stopping callback
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [70]:
accuracy = model.evaluate(x_test, y_test)[1]
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 64.27%
