In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score, \
                            confusion_matrix, plot_confusion_matrix, precision_recall_curve, plot_roc_curve, \
                            roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from itertools import combinations 
from keras.layers import Dense, Conv1D, GlobalMaxPool1D, Embedding
from keras.optimizers import RMSprop

In [2]:
def classifier_report(y_true, y_pred, y_score=None):
    """Print the standard binary-classification metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels (hard 0/1 decisions).
    y_score : array-like, optional
        Continuous scores for the positive class (e.g. predicted
        probabilities). When given, ROC AUC is computed from these scores.
        When omitted, ROC AUC falls back to ``y_pred``, which matches the
        previous behaviour but only evaluates a single operating point.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("Precision :  ", precision_score(y_true, y_pred))
    print("Recall    :  ", recall_score(y_true, y_pred))
    print("F1-score  :  ", f1_score(y_true, y_pred))
    print("Accuracy  :  ", accuracy_score(y_true, y_pred))
    # ROC AUC is a ranking metric: feeding it hard labels collapses the curve
    # to one threshold, so prefer real scores whenever the caller has them.
    auc_input = y_pred if y_score is None else y_score
    print("ROC AUC   :  ", roc_auc_score(y_true, auc_input))
    print("Confusion Matrix : ")
    print(confusion_matrix(y_true, y_pred))

In [3]:
# Build one NLTK sentiment analyzer instance for the notebook.
# NOTE(review): `sid` is not referenced by any other cell visible here —
# confirm it is still needed; if so, the import belongs in the top import cell.
from nltk.sentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [4]:
# Load the spaCy "en_core_web_sm" pipeline (the model must be installed locally).
# NOTE(review): neither `nlp` nor `re` is used by any other cell visible here —
# confirm they are still needed; if so, the imports belong in the top import cell.
import spacy, re
nlp = spacy.load("en_core_web_sm")

#### Note: `preprocessed_data.csv` should not be used to train the neural network, because its stopwords have been removed.

In [5]:
# Training set: keep only the tweet text and its label.
data = pd.read_csv("new_processed_tweet.csv").loc[:, ["tweet", "label"]]

In [6]:
# Test set: only the tweet text and the row id columns are kept.
test_data = pd.read_csv("new_processed_tweet_test.csv").loc[:, ["tweet", "id"]]
# Ids are held separately from the text.
test_id = test_data.loc[:, "id"]

### Model

In [7]:
# Fixed sequence length fed to the network (in tokens).
max_len = 60
# Keep the label as a one-column DataFrame (2-D) rather than a Series.
label = data.loc[:, ["label"]]

# Learn the vocabulary from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["tweet"])

# Map each tweet to its integer token ids, then pad at the end up to
# `max_len`. Longer tweets are truncated with pad_sequences' default
# truncation side, which is "pre" (tokens are dropped from the start).
sequence = pad_sequences(
    tokenizer.texts_to_sequences(data["tweet"]),
    maxlen=max_len,
    padding="post",
)

In [8]:
# Fit the encoder on the label column, then encode that same column into
# one indicator column per class. With default settings the encoder returns
# a sparse matrix.
one_hot_encoder = OneHotEncoder().fit(label)
label_nn = one_hot_encoder.transform(label)

In [9]:
# Tokenizer word indices start at 1, so one extra slot is needed for index 0
# (the value used for padding).
vocab_size = len(tokenizer.word_index) + 1

# Small 1-D CNN text classifier: embedding -> two convolutions -> global max
# pooling -> two dense layers -> 2-way softmax.
model = Sequential([
    Embedding(vocab_size, 10, input_length=max_len),
    Conv1D(filters=30, kernel_size=3, activation="relu"),
    Conv1D(filters=10, kernel_size=3, activation="relu"),
    GlobalMaxPool1D(),
    Dense(16, activation="sigmoid"),
    Dense(8, activation="sigmoid"),
    Dense(2, activation="softmax"),
])

# The output is a softmax over one-hot targets, so categorical cross-entropy
# is the matching loss. For exactly two classes it gives the same value as
# binary cross-entropy, but it also keeps the loss and the "accuracy" metric
# correct if more classes are ever added.
model.compile(loss="categorical_crossentropy", optimizer=RMSprop(), metrics=["accuracy"])

In [10]:
# Per-class loss weights: class 1 is up-weighted relative to class 0 (60:40).
# The weights are not normalised, so the reported training loss is scaled by
# them and is not directly comparable with val_loss.
weight = {0: 40, 1: 60}

# `toarray()` yields a plain ndarray; `todense()` would return the legacy
# `np.matrix` type, which NumPy discourages.
history = model.fit(
    sequence,
    label_nn.toarray(),
    batch_size=8,
    epochs=8,
    validation_split=0.2,
    verbose=2,
    class_weight=weight,
)

Epoch 1/8
792/792 - 2s - loss: 23.5596 - accuracy: 0.7790 - val_loss: 0.3333 - val_accuracy: 0.8567
Epoch 2/8
792/792 - 2s - loss: 13.6777 - accuracy: 0.8815 - val_loss: 0.2828 - val_accuracy: 0.8706
Epoch 3/8
792/792 - 2s - loss: 11.2914 - accuracy: 0.9045 - val_loss: 0.2731 - val_accuracy: 0.8807
Epoch 4/8
792/792 - 2s - loss: 10.3256 - accuracy: 0.9156 - val_loss: 0.2786 - val_accuracy: 0.8782
Epoch 5/8
792/792 - 2s - loss: 9.8334 - accuracy: 0.9195 - val_loss: 0.2753 - val_accuracy: 0.8819
Epoch 6/8
792/792 - 2s - loss: 9.5229 - accuracy: 0.9195 - val_loss: 0.2776 - val_accuracy: 0.8819
Epoch 7/8
792/792 - 2s - loss: 9.0376 - accuracy: 0.9246 - val_loss: 0.2702 - val_accuracy: 0.8826
Epoch 8/8
792/792 - 2s - loss: 8.7197 - accuracy: 0.9287 - val_loss: 0.2830 - val_accuracy: 0.8819


In [11]:
# Save the trained weights only (not the architecture); the model must be
# rebuilt with the same layers before these weights can be loaded back.
model.save_weights("convolutional_weights.h5")

In [12]:
# Encode the test tweets with the tokenizer fitted on the training data and
# pad them to the same length the network was trained on.
test_sequence = pad_sequences(
    tokenizer.texts_to_sequences(test_data["tweet"]),
    maxlen=max_len,
    padding="post",
)

In [13]:
# `Sequential.predict_classes` is deprecated and removed in later TensorFlow
# releases; for a softmax output the class is the argmax of the probabilities.
prediction = pd.DataFrame({"label": np.argmax(model.predict(test_sequence), axis=-1)})
# The CSV contains the default integer index and the predicted label.
prediction.to_csv("convolutional_prediction.csv")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [14]:
# Share of each predicted class in the test set.
prediction["label"].value_counts(normalize=True)
# NOTE(review): the figures below differ from the displayed output, so they
# are presumably from an earlier run — confirm, or remove if stale.
# 0    0.702509
# 1    0.297491

0    0.698413
1    0.301587
Name: label, dtype: float64

In [15]:
# prediction = model.predict_classes(test_sequence)
# classifier_report(test_data["label"], prediction)

In [16]:
# import random
# random.seed(98)
# incorrect = [pred != true for pred, true in zip(prediction, test_data["label"])]
# index = random.sample(range(0, len(test_data[incorrect])), 10)
# for tweet, label in zip(test_data[incorrect]["tweet"].iloc[index], test_data[incorrect]["label"].iloc[index]):
#     print("Label : ", label)
#     print(tweet, end="\n\n")

In [17]:
# def prediction(tweet):
    
#     sequence = tokenizer.texts_to_sequences(tweet)
#     sequence = pad_sequences(sequence, maxlen=40, padding="post")
#     return model.predict_classes(sequence)