In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import keras
from keras import Input
from keras import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Dropout, Conv1D, MaxPooling1D
from keras.layers import Bidirectional, BatchNormalization, GlobalAveragePooling1D, concatenate
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, auc
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum, strip_short, strip_numeric
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Load the article dataset; later cells read its article_text, article_title and norm_score columns.
df = pd.read_csv(filepath_or_buffer='reduced_df.csv')

In [None]:
def get_auc(y_true,y_pred):
    """Weighted one-vs-rest ROC AUC computed from hard (argmax) predictions.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_classes)
        One-hot encoded ground-truth labels.
    y_pred : array-like of shape (n_samples, n_classes)
        Per-class scores; only the argmax of each row is used.

    Returns
    -------
    float
        ``roc_auc_score(..., average='weighted')`` of the one-hot encoded
        predicted classes against ``y_true``.

    Notes
    -----
    The predicted classes are one-hot encoded against the number of columns
    in ``y_true`` rather than against the set of classes that happen to be
    predicted, so the indicator matrix keeps the same shape as ``y_true``
    even when some class is never predicted.
    """
    y_true = np.asarray(y_true)
    predicted_class = np.asarray(y_pred).argmax(axis=1)

    # Row k of the identity matrix is the one-hot vector for class k.
    n_classes = y_true.shape[1]
    y_pred_onehot = np.eye(n_classes, dtype=int)[predicted_class]
    return roc_auc_score(y_true, y_pred_onehot, average='weighted')

In [None]:
def _clean_text(raw):
    """Lower-case ``raw`` (coerced to str) and run the gensim clean-up chain.

    The chain strips non-alphanumeric characters, then digits, then stop
    words, and finally drops tokens shorter than 3 characters.
    """
    return strip_short(remove_stopwords(strip_numeric(
                strip_non_alphanum(str(raw).lower()))), minsize=3)


# Remove URLs from the article body before normalising it.
article_text = df['article_text'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

text = article_text.apply(_clean_text)

# Titles go through the same str() coercion as the article text, so a
# non-string value (e.g. a missing title read as NaN) no longer raises
# AttributeError on .lower().
title = df.article_title.apply(_clean_text)

# Middle fifth of each cleaned article, sliced by character offset (40%-60%).
# NOTE(review): slicing by characters can cut the first/last word in half — confirm that is acceptable.
mid_5th = text.apply(lambda x: x[round(0.4*len(x)):round(0.6*len(x))])

In [None]:
max_features = 15000
input_type = mid_5th

# Fit a vocabulary capped at max_features on the chosen text and encode it as integer sequences.
tokenize = Tokenizer(num_words=max_features)
tokenize.fit_on_texts(input_type)
sequences = tokenize.texts_to_sequences(input_type)

# Every sequence is padded/truncated to the (rounded) median sequence length.
seq_lengths = [len(seq) for seq in sequences]
median_length = np.median(seq_lengths)
max_len = round(float(median_length))

input_text = pad_sequences(sequences, maxlen=max_len)
labels = to_categorical(df.norm_score, num_classes=3)

for arr in (input_text, labels):
    print(arr.shape)

print("Median squence length: {}".format(median_length))
print("Sequence stdev: {}".format(np.std(seq_lengths)))

## GLOVE

In [None]:
# Map each GloVe token to its vector.
embeddings_index = {}

# The encoding is given explicitly so parsing does not depend on the
# platform's default text encoding.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        line = line.split()
        if not line:
            # Skip blank lines instead of failing on line[0].
            continue
        word = line[0]
        embeddings_index[word] = np.asarray(line[1:], dtype='float')

w_index = tokenize.word_index
embedding_dim = 100
# Rows for words without a GloVe vector (and row 0) stay all-zero.
w_matrix = np.zeros((max_features,embedding_dim))

for word, i in w_index.items():
    # Only indices below max_features have a row in the matrix.
    if i < max_features:
        glove_vector = embeddings_index.get(word)
        if glove_vector is not None:
            w_matrix[i] = glove_vector

In [None]:
# Hold out 10% of the rows for testing.
split_parts = train_test_split(input_text, labels, test_size=0.1, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Oversample the training portion only; the test portion is left untouched.
# NOTE(review): the features here are padded token ids — confirm that ADASYN-generated rows are meaningful for them.
oversampler = ADASYN(n_jobs=4, random_state=42)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)

In [None]:
# Shapes of the resampled training data followed by the held-out test data.
for arr in (X_train_rs, y_train_rs, X_test, y_test):
    print(arr.shape)

In [None]:
# Column sums of the one-hot labels, i.e. per-class counts for the full dataset.
np.sum(labels, axis=0)

In [None]:
# Column sums of the resampled training labels.
np.sum(y_train_rs, axis=0)

In [None]:
# Column sums of the held-out test labels.
np.sum(y_test, axis=0)

In [None]:
# Shuffle the resampled training rows with a fixed seed so the row order
# (and therefore the tail slice used by validation_split during fit) is
# the same on every run.
idx = np.random.RandomState(42).permutation(X_train_rs.shape[0])

# Features and labels are reordered with the same permutation to stay aligned.
X_train = X_train_rs[idx]
y_train = y_train_rs[idx]

In [None]:
# Apply one random permutation to both test arrays so rows stay aligned.
idx = np.random.permutation(len(X_test))
X_test, y_test = X_test[idx], y_test[idx]

## REGULAR LSTM

In [None]:
# Embedding -> LSTM -> 3-way softmax classifier.
model = Sequential()
model.add(Embedding(max_features, 100, input_length=max_len))
model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(3, activation='softmax'))

# Load the pre-built embedding matrix and freeze it (set before compile).
model.layers[0].set_weights([w_matrix])
model.layers[0].trainable = False

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

# Keep the History returned by fit() rather than reading it back off the model.
lstm_history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

plt.plot(lstm_history.history['acc'], color='orange', label="Train acc.")
plt.plot(lstm_history.history['val_acc'], color='lightblue', label="Val acc.")
plt.plot(lstm_history.history['loss'], '--', color='orange', label="Train loss")
plt.plot(lstm_history.history['val_loss'], '--', color="lightblue", label="Val loss")
plt.legend(loc="upper left")
# The curves are train vs. validation (from validation_split), not test.
plt.title('Train/Val Loss and Accuracy (LSTM)')

In [None]:
# evaluate() yields [loss, acc] for the single 'acc' metric compiled above; show the accuracy.
lstm_test_loss, lstm_test_acc = model.evaluate(X_test, y_test, verbose=0)
lstm_test_acc

In [None]:
# Run inference once and reuse the scores for both the AUC and the confusion matrix.
lstm_pred = model.predict(X_test)
print("AUC: {}".format(get_auc(y_test,lstm_pred)))

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test.argmax(axis=1),lstm_pred.argmax(axis=1))

## LSTM + CNN

In [None]:
# Embedding -> Conv1D -> MaxPooling1D -> LSTM -> 3-way softmax classifier.
model_lstm_cnn = Sequential()
model_lstm_cnn.add(Embedding(max_features, 100, input_length=max_len))
model_lstm_cnn.add(Conv1D(filters=64,kernel_size=3,padding='same',activation='relu'))
model_lstm_cnn.add(MaxPooling1D(pool_size=2))
model_lstm_cnn.add(LSTM(100, dropout=0.1, recurrent_dropout=0.1))
model_lstm_cnn.add(Dense(3, activation='softmax'))

# Load the pre-built embedding matrix and freeze it (set before compile).
model_lstm_cnn.layers[0].set_weights([w_matrix])
model_lstm_cnn.layers[0].trainable = False

model_lstm_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model_lstm_cnn.summary()

# Keep the History returned by fit() rather than reading it back off the model.
lstm_cnn_history = model_lstm_cnn.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

plt.plot(lstm_cnn_history.history['acc'], color='orange', label="Train acc.")
plt.plot(lstm_cnn_history.history['val_acc'], color='lightblue', label="Val acc.")
plt.plot(lstm_cnn_history.history['loss'], '--', color='orange', label="Train loss")
plt.plot(lstm_cnn_history.history['val_loss'], '--', color="lightblue", label="Val loss")
plt.legend(loc="upper left")
# Title added so the figure stands alone, like the other model plots.
plt.title('Train/Val Loss and Accuracy (LSTM + CNN)')

In [None]:
# evaluate() yields [loss, acc] for the single 'acc' metric compiled above; show the accuracy.
lstm_cnn_test_loss, lstm_cnn_test_acc = model_lstm_cnn.evaluate(X_test, y_test, verbose=0)
lstm_cnn_test_acc

In [None]:
# Run inference once and reuse the scores for both the AUC and the confusion matrix.
lstm_cnn_pred = model_lstm_cnn.predict(X_test)
print("AUC: {}".format(get_auc(y_test,lstm_cnn_pred)))

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test.argmax(axis=1),lstm_cnn_pred.argmax(axis=1))

## JUST CNN

In [None]:
# Three Conv1D stages (the first two followed by batch-norm and max-pooling),
# global average pooling, then a 3-way softmax.
model_cnn = Sequential([
    Embedding(max_features, 100, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Conv1D(64, 5, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Conv1D(64, 5, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(3, activation='softmax'),
])

# Load the pre-built embedding matrix into the first layer and freeze it.
cnn_embedding = model_cnn.layers[0]
cnn_embedding.set_weights([w_matrix])
cnn_embedding.trainable = False

model_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model_cnn.summary()

model_cnn.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# (history key, optional format string, colour, legend label) for each curve.
cnn_curves = (
    ('acc', (), 'orange', "Train acc."),
    ('val_acc', (), 'lightblue', "Val acc."),
    ('loss', ('--',), 'orange', "Train loss"),
    ('val_loss', ('--',), "lightblue", "Val loss"),
)
cnn_hist = model_cnn.history.history
for key, fmt, colour, caption in cnn_curves:
    plt.plot(cnn_hist[key], *fmt, color=colour, label=caption)
plt.legend(loc="upper left")
plt.title('Train/Val Loss and Accuracy (CNN)')

In [None]:
# evaluate() yields [loss, acc] for the single 'acc' metric compiled above; show the accuracy.
cnn_test_loss, cnn_test_acc = model_cnn.evaluate(X_test, y_test, verbose=0)
cnn_test_acc

In [None]:
# Run inference once and reuse the scores for both the AUC and the confusion matrix.
cnn_pred = model_cnn.predict(X_test)
print("AUC: {}".format(get_auc(y_test,cnn_pred)))
print("Test Accuracy: {}".format(model_cnn.evaluate(X_test, y_test, verbose=0)[1]))

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test.argmax(axis=1),cnn_pred.argmax(axis=1))

## FUNCTIONAL API

In [None]:
# Load the dataset that carries image-derived columns alongside the article text; show its row count.
df_images = pd.read_csv(filepath_or_buffer='text_and_images.csv')
df_images.shape[0]

In [None]:
####### TEXT 
# Note: this rebinds max_features, tokenize, sequences, max_len and input_text
# from the earlier text-only experiment.
max_features = 2000

# Remove URLs, then lower-case and run the same gensim clean-up chain as before.
article_text = df_images['article_text'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
text = article_text.apply(lambda x: strip_short(remove_stopwords(strip_numeric(
                            strip_non_alphanum(x.lower()))), minsize=3))

tokenize = Tokenizer(num_words=max_features)
tokenize.fit_on_texts(text)
sequences = tokenize.texts_to_sequences(text)

# Compute the sequence lengths once; they drive max_len and both summary prints.
seq_lengths = [len(x) for x in sequences]
median_length = np.median(seq_lengths)
max_len = round(float(median_length))
input_text = pad_sequences(sequences, maxlen=max_len)
y_binary = to_categorical(df_images.norm_score, num_classes=3)

####### IMAGES
# Everything except the text, url and label columns is treated as image features.
images_data = df_images.drop(['article_title', 'article_text', 'url', 'norm_score'], axis=1)

print(input_text.shape)
print(y_binary.shape)
# The reported statistic is the median (it was previously labelled "Mean").
print("Median sequence length: {}".format(median_length))
print("Sequence stdev: {}".format(np.std(seq_lengths)))

In [None]:
# Rebuild the embedding matrix for the current tokenizer vocabulary.
embedding_dim = 100
w_index = tokenize.word_index
w_matrix = np.zeros((max_features,embedding_dim))

for word, i in w_index.items():
    # Indices at or beyond max_features have no row in the matrix.
    if i >= max_features:
        continue
    glove_vector = embeddings_index.get(word)
    # Words missing from embeddings_index keep their all-zero row.
    if glove_vector is None:
        continue
    w_matrix[i] = glove_vector

In [None]:
# Put the padded token ids in a frame and append them (index-aligned) to the
# right of the image columns; text_idx records how many text columns there are.
text_df = pd.DataFrame(input_text)
text_idx = len(text_df.columns)
combined = images_data.join(text_df)

In [None]:
# Hold out 10% of the combined (image + text) rows, then oversample the training part only.
X_train, X_test, y_train, y_test = train_test_split(combined, y_binary, test_size=0.1, random_state=42)
X_train_rs, y_train_rs = ADASYN(n_jobs=4, random_state=42).fit_resample(X_train, y_train)

# X_train is a DataFrame here; depending on the imbalanced-learn version
# fit_resample may hand a DataFrame back. Coerce to ndarrays so the
# positional indexing/slicing below works either way.
X_train_rs = np.asarray(X_train_rs)
y_train_rs = np.asarray(y_train_rs)

# Shuffle features and labels with the same permutation.
idx = np.random.permutation(X_train_rs.shape[0])

X_train = X_train_rs[idx]
y_train = y_train_rs[idx]

# The text columns were joined last, so they are the final text_idx columns.
text_features_train = X_train[:,-text_idx:]
image_features_train = X_train[:,:-text_idx]

# X_test is still a DataFrame, so split it by column label instead.
text_columns = list(text_df.columns)
text_features_test = X_test[text_columns]
image_features_test = X_test.drop(text_columns, axis=1)

print(y_train_rs.sum(axis=0))
print(y_test.sum(axis=0))

### Just Text First (reusing from above)

In [None]:
# Text-only baseline on the image dataset: Embedding -> LSTM -> 3-way softmax.
model = Sequential()
model.add(Embedding(max_features, 100, input_length=max_len))
model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(3, activation='softmax'))

# Load the pre-built embedding matrix and freeze it (set before compile).
model.layers[0].set_weights([w_matrix])
model.layers[0].trainable = False

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

# Keep the History returned by fit() rather than reading it back off the model.
text_only_history = model.fit(text_features_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

plt.plot(text_only_history.history['acc'], color='orange', label="Train acc.")
plt.plot(text_only_history.history['val_acc'], color='lightblue', label="Val acc.")
plt.plot(text_only_history.history['loss'], '--', color='orange', label="Train loss")
plt.plot(text_only_history.history['val_loss'], '--', color="lightblue", label="Val loss")
plt.legend(loc="upper left")
# The curves are train vs. validation (from validation_split), not test.
plt.title('Train/Val Loss and Accuracy (LSTM)')

In [None]:
# Run inference once and reuse the scores for both the AUC and the confusion matrix.
lstm_pred = model.predict(text_features_test)
print(model.evaluate(text_features_test, y_test, verbose=0)[1])
print("AUC: {}".format(get_auc(y_test,lstm_pred)))

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test.argmax(axis=1),lstm_pred.argmax(axis=1))

In [None]:
# --- Text branch: token ids -> embedding -> LSTM ---------------------------
text_input = Input(shape=(None,),dtype='int32',name='text')
# Keep a handle on the layer object so its weights can be set directly,
# instead of locating it by position in model.layers.
embedding_layer = Embedding(max_features, 100, input_length=max_len)
embedding = embedding_layer(text_input)
lstm = LSTM(100, dropout=0.1, recurrent_dropout=0.1)(embedding)

# --- Image branch: dense stack over the image feature columns --------------
# The input width comes from image_input, so the Dense layers take no input_shape.
image_input = Input(shape=(images_data.shape[1],), dtype='float32', name='images')
image_network = Dense(128, activation='relu')(image_input)
image_network = Dropout(0.2)(image_network)
image_network = Dense(64, activation='relu')(image_network)

# --- Merge both branches and classify --------------------------------------
concatenated = concatenate([lstm, image_network],axis=-1)
result = Dense(3, activation='softmax')(concatenated)

model_images = Model([text_input,image_input], result)

# Actually load the embedding matrix (set_weights is a method call, not an
# attribute to assign) and freeze the layer before compiling.
embedding_layer.set_weights([w_matrix])
embedding_layer.trainable = False

model_images.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])
model_images.summary()

In [None]:
# Inputs are passed in the order the model was built with: text first, then image features.
train_inputs = [text_features_train, image_features_train]
model_images.fit(train_inputs, y_train, epochs=5, batch_size=32, validation_split=0.1)

In [None]:
# Run inference once and reuse the scores for every report below.
test_inputs = [text_features_test, image_features_test]
combined_pred = model_images.predict(test_inputs)

true_classes = y_test.argmax(axis=1)
predicted_classes = combined_pred.argmax(axis=1)

# Test accuracy (index 1 of evaluate's output; index 0 is the loss).
print(model_images.evaluate(test_inputs, y_test, verbose=0)[1])
print(confusion_matrix(true_classes, predicted_classes))

print(classification_report(true_classes, predicted_classes))

print("AUC: {}".format(get_auc(y_test,combined_pred)))

In [None]:
# (history key, optional format string, colour, legend label) for each curve.
image_curves = (
    ('acc', (), 'orange', "Train acc."),
    ('val_acc', (), 'lightblue', "Val acc."),
    ('loss', ('--',), 'orange', "Train loss"),
    ('val_loss', ('--',), "lightblue", "Val loss"),
)
image_hist = model_images.history.history
for key, fmt, colour, caption in image_curves:
    plt.plot(image_hist[key], *fmt, color=colour, label=caption)
plt.legend(loc="upper left")
plt.title('Train/Val Loss and Accuracy (Text + Image Features)')