# Part 1 - Classification of Tweets


In part 1, we first want to read in all of the data from our dataset, which is stored in 'Twitter_Data.csv'. We see that each row of this data has only 2 columns - a cleaned text 'clean_text' and a catgory. Here, the 'clean_text' represents the actual tweet and the category determines whether the tweet is positive, negative, or neutral, with a 1 being positive 0 being neutral and -1 being negative. We'll read in this data to an object named tweets, and then print the counts of the number of positive, neutral, and negative tweets.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Load the data

tweets = pd.read_csv('Twitter_Data.csv')
tweets['clean_text'] = tweets['clean_text'].astype(str).fillna('')
tweets['category'] = tweets['category'].fillna(0)  # Replace NaN with a neutral sentiment or appropriate value

# Shuffle data in order to ensure randomization
tweets = tweets.sample(frac=1)

# Get rows with emojis
tweets_with_emojis = tweets[tweets['clean_text'].str.contains(r'[\u263a-\U0001f645]')]

# Initialize lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

tweets.dropna()

def cleaning_and_preprocessing(tweet):
    tweet = re.sub(r'http\S+|www\S+|@\S+|#\S+|[^A-Za-z0-9\s]', '', tweet)
    # Tokenize and lemmatize
    words = tweet.split()
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the 'clean_text' column
tweets['clean_text'] = tweets['clean_text'].apply(cleaning_and_preprocessing)
tweets = tweets[tweets['clean_text'].apply(lambda x: x.strip() != '')]


In [3]:
print(tweets_with_emojis.shape[0]) # Due to less than .01% of the tweets containing emojis, we will not need to consider emoji classification

1834


In [4]:
## get negative, neutral, and positive tweets from the training dataset
negative = tweets.loc[tweets['category'] == -1]
neutral = tweets.loc[tweets['category'] == 0]
positive = tweets.loc[tweets['category'] == 1]

# get count of each new dataframe
neg_count = negative.shape[0]
neutral_count = neutral.shape[0]
pos_count = positive.shape[0]

In [5]:
print(neg_count)
print(neutral_count)
print(pos_count)
print(negative)
print(neutral)
print(positive)

35509
55157
72245
                                               clean_text  category
129605                      narendra modi killed shashtri      -1.0
128942  anything big india achieves naamdaars sad doub...      -1.0
28644   day chandrababu sir talking kcr sirpm modi sir...      -1.0
77301   beef export country increased modi govt muslim...      -1.0
33506   want freebie also know modi work change countr...      -1.0
...                                                   ...       ...
142994  hit nail head arguing modi bhakt like throwing...      -1.0
95292   date night modiin dinner meat meet one faves l...      -1.0
112317  exit airport gate near ola pickup point big ho...      -1.0
20122   dhokla asaduddin owaisi double beef attack mod...      -1.0
20169   sir winter gone suggest modi tshirt sale incre...      -1.0

[35509 rows x 2 columns]
                                               clean_text  category
150479  mama bigger congress country narendra modi ass...       0.0
9701

# Part 2 - Text vectorisation methods

First we will use the TF-IDF method with tokenisation to get a bigram respresentation of our input, and visualize the results below.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [7]:
## TF-IDF with Logistic Regression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2), min_df=5, max_df=0.7)

# Fit and transform the tweets
X = tfidf_vectorizer.fit_transform(tweets['clean_text']).toarray()
y = tweets['category']

# 20% test, 10% validation, 70% training

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize and train the classifier (Logistic Regression in this case)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

tfidf_with_logistic_regression_accuracy = accuracy_score(y_test, y_pred)
print(tfidf_with_logistic_regression_accuracy)


              precision    recall  f1-score   support

        -1.0       0.78      0.55      0.65      7002
         0.0       0.71      0.93      0.81     11130
         1.0       0.86      0.78      0.82     14451

    accuracy                           0.78     32583
   macro avg       0.79      0.75      0.76     32583
weighted avg       0.79      0.78      0.78     32583

0.7819415032378848


In [8]:
# TF_IDF with Multinomial NB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score


# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2), min_df=5, max_df=0.7)  # Limit to top 2000 features for simplicity

# Fit and transform the tweets
X = tfidf_vectorizer.fit_transform(tweets['clean_text']).toarray()
y = tweets['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Multinomial NB classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = nb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

tfidf_with_mnb_accuracy = accuracy_score(y_test, y_pred)
print(tfidf_with_mnb_accuracy)



              precision    recall  f1-score   support

        -1.0       0.83      0.35      0.49      7002
         0.0       0.69      0.60      0.64     11130
         1.0       0.62      0.86      0.72     14451

    accuracy                           0.66     32583
   macro avg       0.71      0.60      0.62     32583
weighted avg       0.69      0.66      0.64     32583

0.6612957677316392


In [9]:
# Bag of Words with Logistic Regression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

count_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)  # Limit to top 1000 features for simplicity

# Fit and transform the cleaned text data
X = count_vectorizer.fit_transform(tweets['clean_text']).toarray()
y = tweets['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier (Logistic Regression in this case)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

bow_with_logistic_regression_accuracy = accuracy_score(y_test, y_pred)
print(bow_with_logistic_regression_accuracy)


              precision    recall  f1-score   support

        -1.0       0.78      0.56      0.65      7002
         0.0       0.71      0.94      0.81     11130
         1.0       0.87      0.77      0.82     14451

    accuracy                           0.78     32583
   macro avg       0.79      0.76      0.76     32583
weighted avg       0.80      0.78      0.78     32583

0.7826167019611454


In [10]:
# Bag of Words with Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

count_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)  # Limit to top 1000 features for simplicity

# Fit and transform the cleaned text data
X = count_vectorizer.fit_transform(tweets['clean_text']).toarray()
y = tweets['category']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Multinomial NB classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

bow_with_mnb_accuracy = accuracy_score(y_test, y_pred)
print(bow_with_mnb_accuracy)


              precision    recall  f1-score   support

        -1.0       0.78      0.56      0.65      7002
         0.0       0.71      0.94      0.81     11130
         1.0       0.87      0.77      0.82     14451

    accuracy                           0.78     32583
   macro avg       0.79      0.76      0.76     32583
weighted avg       0.80      0.78      0.78     32583

0.7826167019611454


# Comparing Accuracy

In [11]:
print(f"Tf-idf with Logistic Regression: {tfidf_with_logistic_regression_accuracy:.2f}")
print(f"Tf-idf with Multinomial Naive Bayes: {tfidf_with_mnb_accuracy:.2f}")
print(f"Bag of words with Logistic Regression: {bow_with_logistic_regression_accuracy:.2f}")
print(f"Bag of words with Multinomial Naive Bayes: {bow_with_mnb_accuracy:.2f}")

Tf-idf with Logistic Regression: 0.78
Tf-idf with Multinomial Naive Bayes: 0.66
Bag of words with Logistic Regression: 0.78
Bag of words with Multinomial Naive Bayes: 0.78


# Part 2 - Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=2000)
X = vectorizer.fit_transform(tweets['clean_text']).toarray()
y = tweets['category']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=70, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)

# Evaluate the model
print(f'Accuracy: {rf_accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.8380443789706289
              precision    recall  f1-score   support

        -1.0       0.83      0.66      0.73      7002
         0.0       0.80      0.94      0.87     11130
         1.0       0.87      0.85      0.86     14451

    accuracy                           0.84     32583
   macro avg       0.84      0.81      0.82     32583
weighted avg       0.84      0.84      0.83     32583



# CNN

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras import regularizers

# Split the data into training and test sets
X = tweets['clean_text']
y = tweets['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Tokenize, convert to sequences, and pad the sequences 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = max(len(seq) for seq in X_train_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Load pretrained GloVe embeddings
embeddings_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the CNN model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix]),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_padded, y_train,
    validation_data=(X_test_padded, y_test),
    epochs=8,
    batch_size=128,
)

# Predict on the test set
y_pred = model.predict(X_test_padded)
print(classification_report(y_test, y_pred))

# Evaluate the model
loss, cnn_accuracy = model.evaluate(X_test_padded, y_test)
print(f'Accuracy: {cnn_accuracy}')

Epoch 1/8
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 21ms/step - accuracy: 0.4476 - loss: -52.6002 - val_accuracy: 0.5633 - val_loss: -2322.3687
Epoch 2/8
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 21ms/step - accuracy: 0.5625 - loss: -11260.3457 - val_accuracy: 0.5873 - val_loss: -80175.5859
Epoch 3/8
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.5758 - loss: -166387.4375 - val_accuracy: 0.5881 - val_loss: -544302.3125
Epoch 4/8
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.5861 - loss: -870354.3750 - val_accuracy: 0.5889 - val_loss: -2000135.0000
Epoch 5/8
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.5886 - loss: -2900356.0000 - val_accuracy: 0.5972 - val_loss: -5384489.5000
Epoch 6/8
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.5909 - loss: -728

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5917 - loss: -42557760.0000
Accuracy: 0.5940338969230652


# Comparing all accuracies

In [18]:
print(f"TF-IDF with Logistic Regression: {tfidf_with_logistic_regression_accuracy:.2f}")
print(f"Tf-IDF with Multinomial Naive Bayes: {tfidf_with_mnb_accuracy:.2f}")
print(f"Bag of words with Logistic Regression: {bow_with_logistic_regression_accuracy:.2f}")
print(f"Bag of words with Multinomial Naive Bayes: {bow_with_mnb_accuracy:.2f}")
print(f"TF-IDF with Random Forest: {rf_accuracy:.2f}")
print(f"CNN: {cnn_accuracy:.2f}")

TF-IDF with Logistic Regression: 0.78
Tf-IDF with Multinomial Naive Bayes: 0.66
Bag of words with Logistic Regression: 0.78
Bag of words with Multinomial Naive Bayes: 0.78
TF-IDF with Random Forest: 0.84
CNN: 0.60
