In [None]:
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import *
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re

In [None]:
# Load the raw tweet dataset. NOTE(review): r'path' looks like a placeholder
# for the real CSV location - confirm before running.
data = pd.read_csv(r'path')
# head must be *called*; a bare `data.head` only displays the bound-method repr.
data.head()

In [None]:
# Keep only the tweet text and its sentiment label.
data = data[['text','target']]
# head must be *called*; a bare `data.head` only displays the bound-method repr.
data.head()

In [None]:
# Partition the tweets by raw label: target 4 goes to `pos`, target 0 to `neg`.
is_positive = data['target'] == 4
is_negative = data['target'] == 0
pos = data[is_positive]
neg = data[is_negative]

In [None]:
# Draw the same number of tweets from each class so the training set stays
# balanced. The requested size is capped at the smaller class because
# DataFrame.sample raises when n exceeds the number of rows (replace=False).
n_per_class = min(799999, len(pos), len(neg))
random_pos = pos.sample(n=n_per_class, random_state=5)
random_neg = neg.sample(n=n_per_class, random_state=5)

In [None]:
# Stack the two sampled classes row-wise (positives first, then negatives)
# and rebind `data` to the combined frame.
frames = [random_pos, random_neg]
data = pd.concat(objs=frames, axis=0)
data

In [None]:
# Map the numeric labels to readable class names in one vectorised pass.
# The previous `for row in data` loop iterated over column names and simply
# repeated the same assignments; Series.replace also avoids writing strings
# into an integer column element-wise and is safe to re-run.
data['target'] = data['target'].replace({0: 'Negative', 4: 'Positive'})

In [None]:
# Display the frame to check that 'target' now holds the string labels.
data

In [None]:
def _clean_tweet(text):
    """Normalise one tweet: lowercase, keep only [a-z0-9] and whitespace, drop 'rt'.

    The character class is written as a-z (the earlier `A-z` range also
    matched the punctuation between 'Z' and 'a' in ASCII, so those characters
    were never stripped). The retweet marker is removed only when it is a
    standalone token, so words that merely contain "rt" are left intact.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return re.sub(r'\brt\b', ' ', text)


# Apply the cleaning column-wide. The earlier iterrows() loop modified a copy
# of each row, so its 'rt' replacement never reached `data`.
data['text'] = data['text'].apply(_clean_tweet)

data

In [None]:
# Vocabulary size handed to the tokenizer (num_words) and, further down, to
# the Embedding layers.
max_features = 2000

# Fit the tokenizer on the cleaned tweets, then turn every tweet into a
# sequence of word indices padded at the end to 40 entries.
tokenizer = Tokenizer(num_words=max_features, split=' ')
texts = data['text'].values
tokenizer.fit_on_texts(texts)
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=40, padding='post')

X.shape

## **Convolutional Neural Network**

In [None]:
# Setting hyperparameters
# embed_size is passed as the second argument (output dimension) of the
# Embedding layer in the models defined below.

embed_size = 128

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import *

# Stop training once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without
# it the model keeps the weights from the last (already degraded) epoch.
overfitCallback = EarlyStopping(monitor='val_loss', min_delta=0, patience=3,
                                restore_best_weights=True)

In [None]:
# 1-D convolutional classifier over learned word embeddings, assembled layer
# by layer: embedding -> conv -> dropout -> flatten -> 2-way softmax.
cnn = Sequential()
cnn.add(Embedding(max_features, embed_size, input_length=X.shape[1]))
cnn.add(Conv1D(128, 3, padding='same', activation='relu'))
cnn.add(Dropout(0.4))
cnn.add(Flatten())
cnn.add(Dense(2, activation='softmax'))

cnn.summary()

In [None]:
# Compile the CNN with Nadam. The step size is passed as `learning_rate`;
# `lr` is a deprecated alias that newer Keras releases no longer accept.
cnn.compile(
    optimizer=Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# One-hot encode the string labels and hold out 15% of the rows for
# validation (fixed seed, so the split is repeatable).
Y = pd.get_dummies(data['target']).values
split = train_test_split(X, Y, test_size=0.15, random_state=42)
X_train, X_test, Y_train, Y_test = split
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

In [None]:
# Train the CNN for at most 50 epochs; the early-stopping callback may end
# the run sooner based on the held-out split.
fit_options = dict(
    epochs=50,
    batch_size=1024,
    verbose=1,
    validation_data=(X_test, Y_test),
    callbacks=[overfitCallback],
)
history = cnn.fit(X_train, Y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    history: object whose ``.history`` mapping holds per-epoch value lists.
    string:  metric key to plot; the key ``'val_' + string`` must exist too.
    """
    series_names = [string, 'val_' + string]
    for series in series_names:
        plt.plot(history.history[series])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()

In [None]:
# Training vs. validation accuracy per epoch for the most recent fit.
plot_graphs(history, string='accuracy')

In [None]:
# Training vs. validation loss per epoch for the most recent fit.
plot_graphs(history, string='loss')

## **LSTM**

In [None]:
# Bidirectional LSTM classifier, assembled layer by layer: embedding ->
# BiLSTM (full sequence) -> global max pool -> dense -> dropout -> softmax.
lstm = Sequential()
lstm.add(Embedding(max_features, embed_size, input_length=X.shape[1]))
lstm.add(Bidirectional(LSTM(50, dropout=0.5, return_sequences=True)))
lstm.add(GlobalMaxPool1D())
lstm.add(Dense(64, activation='relu'))
lstm.add(Dropout(0.5))
lstm.add(Dense(2, activation='softmax'))

lstm.summary()

In [None]:
# Compile the LSTM with Nadam. The step size is passed as `learning_rate`;
# `lr` is a deprecated alias that newer Keras releases no longer accept.
lstm.compile(
    optimizer=Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# One-hot encode the labels and hold out 15% for validation.
# NOTE(review): same inputs and seed as the split in the CNN section, so this
# cell presumably reproduces that split exactly.
Y = pd.get_dummies(data['target']).values
split = train_test_split(X, Y, test_size=0.15, random_state=42)
X_train, X_test, Y_train, Y_test = split
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

In [None]:
# Train the LSTM for at most 5 epochs with the same early-stopping callback;
# `history` is rebound to this run's training log.
fit_options = dict(
    epochs=5,
    batch_size=1024,
    verbose=1,
    validation_data=(X_test, Y_test),
    callbacks=[overfitCallback],
)
history = lstm.fit(X_train, Y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt


# NOTE(review): this repeats the plot_graphs definition from the CNN section
# with the same behaviour; the redefinition is redundant.
def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    history: object whose ``.history`` mapping holds per-epoch value lists.
    string:  metric key to plot; the key ``'val_' + string`` must exist too.
    """
    series_names = [string, 'val_' + string]
    for series in series_names:
        plt.plot(history.history[series])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()

In [None]:
# Training vs. validation accuracy per epoch for the most recent fit.
plot_graphs(history, string='accuracy')

In [None]:
# Training vs. validation loss per epoch for the most recent fit.
plot_graphs(history, string='loss')

## **Validation**

In [None]:
# Load the separately labelled tweets used as an external validation set.
# NOTE(review): absolute, machine-specific path - will not resolve elsewhere.
validation_csv = r'/home/celestino/Desktop/text_analysis/tweets/covid19_tweets_labeled2.csv'
df_validation = pd.read_csv(validation_csv)
df_validation

In [None]:
# Encode the validation tweets with the tokenizer that was fitted on the
# training text. Fitting a fresh Tokenizer here (as before, with a 12-word
# vocabulary) assigns word indices unrelated to the ones the models'
# Embedding layers were trained on, and it also clobbered the training-time
# `max_features` and `tokenizer` variables.
X_2 = tokenizer.texts_to_sequences(df_validation['text'].values)
# Pad to the same sequence length the models were built with.
X_val = pad_sequences(X_2, maxlen=X.shape[1], padding='post')
# NOTE(review): assumes 'Analysis' holds the same two class labels as the
# training target so the one-hot columns line up - confirm against the CSV.
# NOTE(review): the validation text is not passed through the lowercase /
# character-stripping step applied to the training text - confirm whether
# that is intended.
Y_val = pd.get_dummies(df_validation['Analysis']).values
print(X_val.shape,Y_val.shape)

In [None]:
# Evaluate the CNN on the external validation set; evaluate() yields the
# loss followed by the compiled accuracy metric.
cnn_metrics = cnn.evaluate(X_val, Y_val, batch_size=32, verbose=1)
score, acc = cnn_metrics
print("CNN score: %.2f" % score)
print("CNN acc: %.2f" % acc)

In [None]:
# Evaluate the LSTM on the external validation set; evaluate() yields the
# loss followed by the compiled accuracy metric.
lstm_metrics = lstm.evaluate(X_val, Y_val, batch_size=32, verbose=1)
score, acc = lstm_metrics
print("LSTM score: %.2f" % score)
print("LSTM acc: %.2f" % acc)