<a href="https://colab.research.google.com/github/cisada/sentimentment-analysis-using-BERT/blob/main/sentiment_analysis_(Twitter_dataset).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TWITTER SENTIMENT ANALYSIS

In [None]:
#import libraries
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.metrics import (auc, average_precision_score,
                             precision_recall_curve, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


### 1. DATA COLLECTION

In [None]:
# Read the header-less tweet CSV into a DataFrame; columns get their names in a later cell.
data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin',
)
data.head()

### 2. DATA LABELLING

In [None]:
# Attach names to the six positional columns, then preview the result.
data.columns = [
    'sentiment',
    'id',
    'date',
    'query',
    'user',
    'text',
]
data.head()

In [None]:
# Keep only the label and the tweet text. Selecting the wanted columns (instead of
# dropping the other four in place) leaves the same two columns but makes the cell
# safe to re-run: an in-place drop raises KeyError once the columns are gone.
data = data.loc[:, ['sentiment', 'text']].copy()
data.head()

In [None]:
lab_to_sentiment = {0:"Negative", 4:"Positive"}

def label_decoder(label):
  """Translate a numeric sentiment code (a key of lab_to_sentiment) into its text label."""
  return lab_to_sentiment[label]

# Replace the numeric codes in the 'sentiment' column with readable labels.
data['sentiment'] = data['sentiment'].apply(label_decoder)
data.head()

In [None]:
# Number of tweets per sentiment label.
val_count = data.sentiment.value_counts()
val_count

In [None]:
# Bar chart of the per-label counts held in val_count.
fig, ax = plt.subplots(figsize=(8,6))
ax.bar(val_count.index, val_count.values)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Frequency')
ax.set_title("Sentiment Data Distribution")
plt.show()

In [None]:
# Preview ten randomly drawn rows.
data.sample(10)

In [None]:
# A set gives constant-time membership checks; preprocess() tests every token against it.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
# Raw string so that `\S` reaches the regex engine without an invalid-escape warning.
# The pattern matches @mentions, http(s) links and any run of non-alphanumeric characters.
# NOTE(review): the `http?:\S` alternative only matches "htt:"/"http:" plus one character
# and looks like a leftover — confirm before removing it.
test_cleaning_re = r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'

In [None]:
def preprocess(text,stem=False):
  """Lower-case `text`, blank out whatever `test_cleaning_re` matches, and drop stop words.

  When `stem` is true every surviving token is passed through `stemmer.stem`.
  Returns the surviving tokens joined by single spaces.
  """
  cleaned = re.sub(test_cleaning_re,' ',str(text).lower()).strip()
  kept = (word for word in cleaned.split() if word not in stop_words)
  if stem:
    kept = (stemmer.stem(word) for word in kept)
  return " ".join(kept)

In [None]:
# Clean every tweet; stemming stays at its default (off).
data['text'] = data['text'].apply(preprocess)
data.head()

In [None]:
from wordcloud import WordCloud

# Word cloud built from the text of every tweet labelled 'Positive'.
positive_text = " ".join(data.loc[data['sentiment']=='Positive', 'text'])
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=1000,width=1600,height=800).generate(positive_text)
plt.imshow(wc,interpolation='bilinear')

In [None]:
# Word cloud built from the text of every tweet labelled 'Negative'.
negative_text = " ".join(data.loc[data['sentiment']=='Negative', 'text'])
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=2000,width=1600,height=800).generate(negative_text)
plt.imshow(wc,interpolation='bilinear')

### 6. MODEL TRAINING

In [None]:
TRAIN_SIZE = 0.8  # fraction of rows kept for training; the split cell passes 1 - TRAIN_SIZE as test_size
MAX_NB_WORDS = 100000  # NOTE(review): not referenced by any cell visible in this notebook — confirm whether the Tokenizer was meant to use it
MAX_SEQUENCE_LENGTH = 30  # maxlen handed to pad_sequences and the input length of the model
EMBEDDING_DIM = 200  # number of columns in embedding_matrix; the GloVe file loaded later is the 200d one

In [None]:
# Hold out the complement of TRAIN_SIZE as the test set, using a fixed random_state.
holdout_fraction = 1 - TRAIN_SIZE
train_data, test_data = train_test_split(
    data,
    test_size=holdout_fraction,
    random_state=7,
)
print("Train Data Size:", len(train_data))
print("Test Data Size", len(test_data))

In [None]:
# First ten rows of the training split.
train_data.head(n=10)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit the tokenizer on the training tweets only, then keep its word -> index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.text)

word_index = tokenizer.word_index
vocab_size = len(word_index)
print("Vocabulary Size :", vocab_size)

In [None]:
# Convert each tweet to a list of word indices, then pad/truncate to MAX_SEQUENCE_LENGTH.
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

x_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# The intermediate lists are not needed once the padded arrays exist.
del train_sequences, test_sequences

print("Training Data Tensor Shape:", x_train.shape)
print("Testing Data Tensor Shape:", x_test.shape)

In [None]:
# Distinct sentiment labels present in the training split, as a plain list.
labels = list(train_data['sentiment'].unique())

### label encoding


In [None]:
# Fit the encoder on the training labels, encode both splits, and shape them as column vectors.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_data.sentiment.tolist()).reshape(-1,1)
y_test = encoder.transform(test_data.sentiment.tolist()).reshape(-1,1)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
#downloading pretrained GloVe word embedded
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
GLOVE_EMB = 'glove.6B.200d.txt'
# Parse the embedding file into {token: float32 vector}. Each line is split on
# whitespace: the first field is the token, the remaining fields are the vector.
embeddings_index = {}
# The explicit encoding avoids depending on the platform default, and `with`
# closes the file even if a line fails to parse.
with open(GLOVE_EMB, encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
# embeddings_index was already filled from GLOVE_EMB by the previous cell; parsing the
# file a second time would only rebuild the identical dictionary, so just report its size.
print('Found %s word vectors.' %len(embeddings_index))

In [None]:
# Row i receives the pretrained vector of the word whose tokenizer index is i;
# rows that are never assigned (no pretrained vector found) stay all-zero.
embedding_matrix = np.zeros((vocab_size+1,EMBEDDING_DIM))
for token, row in word_index.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix[row] = vector

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding

improved `vocab_size+1` ==> `vocab_size`

In [None]:
# Non-trainable embedding layer whose weights are the precomputed embedding_matrix.
embedding_layer = Embedding(
    input_dim=vocab_size+1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
pip install keras-tuner


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, BatchNormalization, Activation, Dropout, MaxPooling1D, Concatenate, Dense, GlobalMaxPooling1D, Softmax, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_tuner.tuners import RandomSearch
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def _conv_branch(hp, prefix, embedded):
  """Build one Conv1D -> BatchNorm -> Dropout -> MaxPool -> Flatten path on `embedded`.

  `prefix` ('conv1' or 'conv2') namespaces the branch's hyperparameters, giving the
  names '<prefix>_filters', '<prefix>_kernal_size' and '<prefix>_dropout'.
  """
  n_filters = hp.Int(prefix + '_filters', min_value=32, max_value=256, step=32)
  # The hyperparameter name keeps its original spelling so existing tuner results still match.
  kernel_width = hp.Int(prefix + '_kernal_size', min_value=2, max_value=5)
  branch = Conv1D(filters=n_filters,
                  kernel_size=kernel_width,
                  padding='valid', activation='relu')(embedded)
  branch = BatchNormalization()(branch)
  drop_rate = hp.Float(prefix + '_dropout', min_value=0.2, max_value=0.5, step=0.1)
  branch = Dropout(drop_rate)(branch)
  branch = MaxPooling1D(pool_size=2)(branch)
  # Flatten per branch: with 'valid' padding the pooled length depends on the kernel
  # size, so the two branches can differ in length and cannot be concatenated while
  # they are still 3-D.
  return Flatten()(branch)


def build_model(hp):
  """Build and compile the two-branch CNN binary classifier for one tuner trial.

  Args:
    hp: keras-tuner hyperparameter container; this function samples the conv
      filters / kernel sizes / dropout rates, the dense width and the learning rate.

  Returns:
    A compiled Model taking (MAX_SEQUENCE_LENGTH,) index sequences and producing a
    single sigmoid output, trained with binary cross-entropy and tracking accuracy.
  """
  inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
  # Embedding layer using the pre-trained embedding matrix
  embedding = embedding_layer(inputs)

  # Two parallel convolutional paths over the same embedded sequence
  conv1 = _conv_branch(hp, 'conv1', embedding)
  conv2 = _conv_branch(hp, 'conv2', embedding)

  # Concatenate the flattened convolutional outputs
  merged = Concatenate(axis=-1)([conv1, conv2])

  #Fully connected layer
  fc_units = hp.Int('fc_units', min_value=128, max_value=512, step=64)
  fc = Dense(units=fc_units, activation='relu')(merged)

  # Output layer: a single sigmoid unit for the binary decision
  num_classes = 1
  output = Dense(units=num_classes, activation='sigmoid')(fc)

  model = Model(inputs=inputs, outputs=output)

  #compile the model
  optimizer = Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log'))
  model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

  return model

# Callback that watches val_loss with a patience of 3 epochs.
early_stopping = EarlyStopping(patience=3, monitor='val_loss')


In [None]:
!pip install keras-tuner wordcloud

In [None]:
# Instantiate the tuner
tuner = RandomSearch(build_model,
                     objective='val_accuracy',
                     max_trials=5,
                     directory='twitter_sentiment_analysis',
                     project_name='twitter_sentiment_analysis')

# Perform the hyperparameter search
tuner.search(x_train, y_train,
             epochs=5,
             validation_split=0.2,
             callbacks=[early_stopping])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the best hyperparameters
best_model = tuner.hypermodel.build(best_hps)

# Train the model. Validation uses a slice of the training data, as in the search
# above: monitoring (x_test, y_test) here would let early stopping choose the stopping
# epoch from the same data the evaluation cells report on.
history = best_model.fit(x_train, y_train,
                         epochs=10,
                         validation_split=0.2,
                         callbacks=[early_stopping])


### Model Evaluation

In [None]:
# Two stacked panels: accuracy on top (at), loss below (al), one curve per data split.
s, (at,al) = plt.subplots(2,1,figsize=(16,10))

for series, colour in (('accuracy','b'), ('val_accuracy','r')):
  at.plot(history.history[series], c=colour)
at.set_title('model accuracy')
at.set_ylabel('accuracy')
at.set_xlabel('epoch')
at.legend(['train','validation'], loc='upper left')

for series, colour in (('loss','m'), ('val_loss','c')):
  al.plot(history.history[series], c=colour)
al.set_title('model loss')
al.set_ylabel('loss')
al.set_xlabel('epoch')
al.legend(['train','validation'], loc='upper left')


In [None]:
def decode_sentiment(score):
   """Return 'Positive' when `score` is above 0.5, otherwise 'Negative'."""
   if score > 0.5:
      return 'Positive'
   return 'Negative'
# Score the test sequences in batches of 8000, then turn every score into a label.
scores = best_model.predict(x_test, batch_size=8000, verbose=1)
y_pred_1d = list(map(decode_sentiment, scores))

In [None]:
#confusion matrix

import itertools
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


def plot_confusion_matrix(cm, classes, title = 'confusion matrix', cmap=plt.cm.Blues):
  """Draw a row-normalised confusion matrix on the current matplotlib figure.

  Args:
    cm: 2-D array of raw counts; every row is divided by its own sum before drawing.
    classes: tick labels, expected in the same order as the rows/columns of `cm`.
    title: title placed above the plot.
    cmap: colormap handed to `plt.imshow`.
  """
  cm = cm.astype('float')/cm.sum(axis=1)[:,np.newaxis]
  plt.imshow(cm, interpolation='nearest', cmap=cmap)
  plt.title(title)
  plt.colorbar()
  ticks_marks = np.arange(len(classes))
  plt.xticks(ticks_marks, classes, fontsize=13)
  plt.yticks(ticks_marks, classes, fontsize=13)

  fmt = '.2f'
  # Cells darker than half of the maximum get white text so the number stays readable.
  thresh = cm.max()/2
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment = 'center',
             color = 'white' if cm[i, j] > thresh else 'black')
  # matplotlib's text keyword is `fontsize`; `font_size` is rejected as an unknown property.
  plt.ylabel('True label', fontsize=20)
  plt.xlabel('Predicted label', fontsize=20)
  plt.tight_layout()

In [None]:
# The same sorted label list drives both the matrix and the tick names. `unique()`
# returns labels in order of first appearance, which need not match the row/column
# order of the matrix and could put the tick names on the wrong rows.
class_names = sorted(test_data.sentiment.unique())
cf_matrix = confusion_matrix(test_data.sentiment.to_list(), y_pred_1d, labels=class_names)
plt.figure(figsize=(8,8))
plot_confusion_matrix(cf_matrix, classes=class_names, title = 'confusion matrix')
plt.show()

In [None]:
# Per-class text report for the test-set predictions.
print("Classification repoert")
true_labels = test_data.sentiment.to_list()
print(classification_report(true_labels, y_pred_1d))

In [None]:
#ROC Curve
# Encode the true labels as 1 for 'Positive' and 0 otherwise.
y_true_binary = np.where(np.array(test_data.sentiment.to_list()) == 'Positive', 1, 0)
# Flatten the prediction array so roc_curve receives a 1-D score vector.
fpr, tpr, _ = roc_curve(y_true_binary, np.ravel(scores))
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8,8))
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0,0) to (1,1) as the chance-level reference.
plt.plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# matplotlib's text keyword is `fontsize`; `font_size` is rejected as an unknown property.
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.title('Receiver Operating Characteristic', fontsize=20)
plt.legend(loc='lower right')
plt.show()


In [None]:
#PR Curve
# Flatten the prediction array so both metrics receive a 1-D score vector.
flat_scores = np.ravel(scores)
precision, recall, _ = precision_recall_curve(y_true_binary, flat_scores)
average_precision = average_precision_score(y_true_binary, flat_scores)

plt.figure(figsize=(8,8))
# where='post' makes the step line follow the same edges as the filled area below.
plt.step(recall, precision, color='darkblue', lw=2, where='post',
         label='PR Curve(area - %0.2f)' % average_precision)
plt.fill_between(recall, precision, step='post', alpha=0.2, color='darkblue')
# matplotlib's text keyword is `fontsize`; `font_size` is rejected as an unknown property.
plt.xlabel('Recall', fontsize=20)
plt.ylabel('Precision', fontsize=20)
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall curve')
plt.legend(loc="lower left")
plt.show()