In [None]:
# Mount Google Drive into the runtime at /content/drive.
# NOTE(review): google.colab is presumably only importable inside Colab —
# guard or skip this cell when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Common data handling libraries
import os
import string
import pandas as pd
import numpy as np
import glob
import pickle

# Gensim for LDA
import gensim

# NLTK for text processing
import nltk
nltk.download('stopwords')

# spacy for Lemmatization
import spacy

# Visualization
import matplotlib.pyplot as plt

# Deep learning modeling
import keras

# Model evaluation metrics
import sklearn

In [None]:
# Read the tab-separated tweet metadata file into a DataFrame.
import pandas as pd
img_cap_df2 = pd.read_csv('tweetsTRAIN.txt', sep="\t")

In [None]:
# Expose the tweet text under the column name 'caption' used by the later cells.
img_cap_df2 = img_cap_df2.assign(caption=img_cap_df2['tweetText'])


In [None]:
from PIL import Image
import os
from pathlib import Path
path = "TweetsTRAINImages/"

# Parallel lists holding only the rows whose image file could be decoded.
lab = []    # labels
imgg = []   # image file paths
ca = []     # captions
cap = img_cap_df2['caption'].to_list()
im = img_cap_df2['imageId(s)'].to_list()
labels = img_cap_df2['label'].to_list()
for i in range(len(img_cap_df2['tweetId'].to_list())):
  # Build the path before the try block so the messages below always name
  # the current row (str() also covers non-string ids such as NaN).
  filepath = path + str(im[i]) + '.jpg'
  try:
    # Decode the image purely as a readability check; the pixels are
    # discarded and the with-block closes the file handle.
    with Image.open(filepath) as img:
      img.resize((128,128), Image.BICUBIC).convert('RGB')
  except FileNotFoundError:
    print("Image at filepath {0} does not exist".format(filepath))
    continue
  except Exception as err:
    # Still best-effort (skip the row), but no longer swallows
    # KeyboardInterrupt/SystemExit like the bare `except:` did.
    print("Image at filepath {0} could not be loaded: {1}".format(filepath, err))
    continue
  ca.append(cap[i])
  lab.append(labels[i])
  imgg.append(filepath)



In [None]:
# Collect the validated rows into one frame: image path, caption text, label.
img_cap_df = pd.DataFrame({
    'image_id': imgg,
    'caption': ca,
    'label': lab,
})

In [None]:
def clean_text(data, stop_words=None):
    """Tokenise a caption into lower-case, purely alphabetic, non-stop words.

    Args:
        data: Caption text; it is split on whitespace.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is loaded once per call.

    Returns:
        List of cleaned tokens in their original order.
    """
    if stop_words is None:
        # Loaded once here rather than once per token as before.
        stop_words = nltk.corpus.stopwords.words('english')
    # Set membership keeps the filter below cheap.
    stop_words = frozenset(stop_words)

    # Translation table that deletes every ASCII punctuation character.
    table = str.maketrans('', '', string.punctuation)

    # Lower-case and strip punctuation first, so the isalpha()/stop-word
    # checks see the normalised token.
    tokens = (word.lower().translate(table) for word in data.split())

    # Drop tokens containing digits (or left empty) and stop words.
    return [token for token in tokens if token.isalpha() and token not in stop_words]


# Clean every caption; each entry of data_caption is one caption's token list.
data_caption = img_cap_df['caption'].apply(clean_text).tolist()
data_caption[:5]

In [None]:
# Shell command: install spaCy with pip (no version is pinned here).
!pip install spacy

In [None]:
# Shell command: run spaCy's `validate` CLI.
# NOTE(review): presumably checks installed pipelines against the installed
# spaCy version — confirm.
!python -m spacy validate

In [None]:
# Shell command: download the en_core_web_sm pipeline that the lemmatization
# cell loads with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

In [None]:
# lemmatize the words
# Parser and NER are disabled; only lemma_ and pos_ are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Keep only tokens tagged with one of these coarse part-of-speech labels.
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']

# nlp.pipe streams the captions through the pipeline in batches and yields
# one Doc per input text, in input order, instead of one nlp() call per caption.
caption_texts = (' '.join(doc) for doc in data_caption)
data_caption_lemmatized = [[word.lemma_ for word in parsed if word.pos_ in allowed_postags]
                           for parsed in nlp.pipe(caption_texts)]
data_caption_lemmatized[:5]

In [None]:
# Attach the lemmatized token lists as a new column and preview the frame.
img_cap_df = img_cap_df.assign(caption_lemmatized=data_caption_lemmatized)
img_cap_df.head()

In [None]:
# Train / validation / test split: fixed, contiguous row ranges of the dataset.
# NOTE(review): rows from position 8000 onwards are not assigned to any split.
train_df = img_cap_df.iloc[:6000]
valid_df = img_cap_df.iloc[6000:7000]
test_df = img_cap_df.iloc[7000:8000]
for split_df in (train_df, valid_df, test_df):
    print(split_df.shape)

In [None]:
# Build the gensim dictionary from the training captions only.
train_tokens = train_df['caption_lemmatized']
id2word = gensim.corpora.Dictionary(train_tokens)

# Bag-of-words corpus: one doc2bow result per training caption.
corpus = [id2word.doc2bow(tokens) for tokens in train_tokens]

# Show the first document as raw (id, frequency) pairs ...
print(corpus[:1])

# ... and again with each id translated back to its token.
print([[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]])

In [None]:
# Find the optimal number of topics
START = 10
LIMIT = 100
STEP = 5
LDA_SEED = 42  # fixed seed so the topic search can be repeated
topic_range = range(START, LIMIT, STEP)

coherence_values = []
model_list = []
for num_topics in topic_range:
    # NOTE(review): multicore training may still vary slightly between runs
    # even with random_state set — confirm if exact repeatability matters.
    model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics,
                                       random_state=LDA_SEED)
    model_list.append(model)
    coherencemodel = gensim.models.coherencemodel.CoherenceModel(model=model, texts=train_df['caption_lemmatized'],
                                                                 dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

# Start below any real score so a model is selected even when every
# coherence value is <= 0 (NaN scores still never win the comparison).
max_coherence_val = float('-inf')
optimal_model = None

# Print the coherence scores and keep the first model with the best (rounded) score
for i, (m, cv) in enumerate(zip(topic_range, coherence_values)):
    if max_coherence_val < round(cv, 4):
        optimal_model = model_list[i]
        optimal_num_topics = m
        max_coherence_val = round(cv, 4)

    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

if optimal_model is None:
    # Fail here with a clear message instead of an AttributeError in a later cell.
    raise RuntimeError("No LDA model produced a usable coherence score")

# plot coherence results
# x values are the actual topic counts (not list positions) so the axis label is true.
plt.plot(list(topic_range), coherence_values, label="coherence_values")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
# The label is set on the line above; ("coherence_values") was a plain string,
# which legend() would have iterated character by character.
plt.legend(loc='best')
plt.show()

In [None]:
# Print the keyword summary of each topic returned by print_topics()
for topic_summary in optimal_model.print_topics():
    print(topic_summary)

# Apply the selected model to the training corpus
doc_lda = optimal_model[corpus]

In [None]:
 # Take the topic count from the selected model instead of a hard-coded 95,
 # so the reported number and the classifier's output size always match it.
 optimal_num_topics = optimal_model.num_topics

In [None]:
print('Optimal Number of Topics :', optimal_num_topics)

# Report the model's log_perplexity on the training corpus.
# NOTE(review): gensim documents log_perplexity as a per-word likelihood
# bound rather than perplexity itself — confirm how this number should be read.
log_perplexity_value = optimal_model.log_perplexity(corpus)
print('\nPerplexity: ', log_perplexity_value)

# Coherence (c_v) of the selected model.
# NOTE(review): texts here is the full lemmatized caption list, while the
# topic search scored coherence on train_df only — confirm this is intended.
coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(
    model=optimal_model,
    texts=data_caption_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def predictTopics(corpus):
    """Return the dominant topic of every document in a bag-of-words corpus.

    Uses the module-level ``optimal_model`` for both the topic distribution
    (``optimal_model[corpus]``) and the topic keywords (``show_topic``).

    Args:
        corpus: Sequence of bag-of-words documents.

    Returns:
        DataFrame with one row per document, in corpus order, and the columns
        ``Dominant_Topic`` (topic id with the highest weight),
        ``Perc_Contribution`` (that weight, rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated words of that topic).
        A document for which the model reports no topic gets NaN / NaN / ''
        so the result stays row-aligned with the input.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []
    keywords_by_topic = {}  # show_topic() result per topic id, computed once

    for row in optimal_model[corpus]:
        if not row:
            # Keep a placeholder row; skipping it would shift every later
            # document when the caller concatenates this frame column-wise.
            records.append((float('nan'), float('nan'), ''))
            continue

        # max() returns the first pair with the highest weight, the same pair
        # a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in optimal_model.show_topic(topic_num))
        records.append((int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every row.
    return pd.DataFrame(records, columns=columns)

# Columns produced by predictTopics().
TOPIC_COLUMNS = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']


def _with_topics(split_df):
    """Return split_df (index reset) with its dominant-topic columns appended."""
    # Drop topic columns left by an earlier run of this cell, so re-running
    # it replaces them instead of adding duplicate column names.
    base_df = split_df.drop(columns=TOPIC_COLUMNS, errors='ignore').reset_index(drop=True)
    bows = [id2word.doc2bow(text) for text in base_df['caption_lemmatized']]
    return pd.concat([base_df, predictTopics(bows)], axis=1)


train_df = _with_topics(train_df)
valid_df = _with_topics(valid_df)
test_df = _with_topics(test_df)

In [None]:
# save the data object files
# save the data object files
# Create the target folder first; open() fails if ./data does not exist.
os.makedirs('./data', exist_ok=True)
for split_name, split_df in (('train_df', train_df), ('valid_df', valid_df), ('test_df', test_df)):
    # The with-block flushes and closes each file as soon as it is written.
    with open('./data/{0}.pkl'.format(split_name), 'wb') as pkl_file:
        pickle.dump(split_df, pkl_file)

In [None]:
# Load the data object files
# Load the data object files
# NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote.
def _load_pickle(pkl_path):
    """Unpickle one object from pkl_path, closing the file afterwards."""
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


train_df = _load_pickle('./data/train_df.pkl')
valid_df = _load_pickle('./data/valid_df.pkl')
test_df = _load_pickle('./data/test_df.pkl')

# This cell previously contained an unfinished `optimal_num_topics =` (a syntax error).
# Keep the value from the topic-search cells when they ran in this session;
# on a fresh kernel fall back to 95, the largest count the 10..95 (step 5)
# search grid can select, so the classifier always has enough output classes.
if 'optimal_num_topics' not in globals():
    optimal_num_topics = 95

In [None]:
import tensorflow as tf

In [None]:
# Display the topic count; it sizes the classifier's softmax output layer below.
optimal_num_topics

In [None]:
# Create model: ImageNet-pretrained VGG16 with a new classification head
# that predicts one of optimal_num_topics topics.
base_model = tf.keras.applications.VGG16(weights='imagenet', include_top=True, input_shape=(224,224,3))

# freeze every pretrained layer; only the new head below is trained
for layer in base_model.layers:
    layer.trainable = False

# Branch off the penultimate layer (the last fully connected layer before
# VGG16's softmax classifier). In tf.keras `model.layers` returns a fresh
# list, so `layers.pop()` left the model untouched and the new head was being
# stacked on top of the 1000-way softmax output.
features = base_model.layers[-2].output

# Use tf.keras layers throughout so the head and the base model come from
# the same Keras implementation.
output_model = tf.keras.layers.Dense(2056, activation='tanh')(features)
output_model = tf.keras.layers.Dropout(0.5)(output_model)
output_model = tf.keras.layers.Dense(1024, activation='tanh')(output_model)
output_model = tf.keras.layers.Dropout(0.5)(output_model)
output_model = tf.keras.layers.Dense(optimal_num_topics, activation='softmax')(output_model)

vgg16_model = tf.keras.Model(base_model.input, output_model)

vgg16_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])
vgg16_model.summary()

In [None]:
import pathlib

class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding batches of VGG16-preprocessed images and labels."""

    def __init__(self, images_paths, labels, image_dimensions=(224, 224, 3), batch_size=64, shuffle=False, **kwargs):
        """Store the dataset description and build the initial index order.

        Args:
            images_paths: Indexable collection of image file paths.
            labels: Indexable collection of labels, parallel to images_paths.
            image_dimensions: Target image shape; the first two entries are
                used as the resize target.
            batch_size: Number of samples per batch.
            shuffle: When True, the sample order is reshuffled in on_epoch_end.
            **kwargs: Forwarded unchanged to the Sequence base class.
        """
        # Newer Keras versions warn when the base-class initialiser is skipped.
        super().__init__(**kwargs)
        self.labels       = labels              # array of labels
        self.images_paths = images_paths        # array of image paths
        self.image_dim    = image_dimensions    # target image shape
        self.batch_size   = batch_size          # batch size
        self.shuffle      = shuffle             # shuffle bool

        # Build self.indexes (and shuffle once if requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.images_paths) / self.batch_size))

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when self.shuffle is set."""
        self.indexes = np.arange(len(self.images_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        """Return batch number `index` as an (images, labels) pair of numpy arrays."""
        # sample positions that belong to this batch
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]

        # gather the labels and load + preprocess the matching images
        labels = np.array([self.labels[k] for k in indexes])
        images = np.array([self.preprocessImageForVGG16(self.images_paths[k]) for k in indexes])

        return images, labels

    def preprocessImageForVGG16(self, filename):
        """Load one image file and return it as a VGG16-preprocessed array."""
        # load the image resized to the configured height/width
        image = keras.preprocessing.image.load_img(filename, target_size=(self.image_dim[0], self.image_dim[1]))
        # convert the image pixels to a numpy array
        image = keras.preprocessing.image.img_to_array(image)
        # apply the VGG16 input preprocessing
        image = keras.applications.vgg16.preprocess_input(image)

        return image

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# Train the model
# Callbacks come from tf.keras so they match the tf.keras model they are attached to.

# reduces learning rate if no improvement are seen
learning_rate_reduction = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                            patience=2,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.0000001)

# stop training if no improvements are seen
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                           mode="min",
                           patience=10)

# saves model weights to file (best val_loss only)
checkpoint = tf.keras.callbacks.ModelCheckpoint('topic_predictor_model.hdf5',
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             mode='min',
                             save_weights_only=True)


# image paths are the inputs; the dominant topic id is one-hot encoded as the target
X_train = train_df['image_id'].values
Y_train = train_df['Dominant_Topic'].values
Y_train = tf.keras.utils.to_categorical(Y_train, num_classes=optimal_num_topics)

X_valid = valid_df['image_id'].values
Y_valid = valid_df['Dominant_Topic'].values
Y_valid = tf.keras.utils.to_categorical(Y_valid, num_classes=optimal_num_topics)


# prepare data generator
train_data = DataGenerator(X_train, Y_train, batch_size=50, shuffle=True)
valid_data = DataGenerator(X_valid, Y_valid, batch_size=50, shuffle=False)

# train on data
# Model.fit accepts Sequence objects directly; fit_generator is deprecated
# and no longer available in recent Keras releases.
history = vgg16_model.fit(train_data,
                          validation_data=valid_data,
                          epochs=20,
                          steps_per_epoch=len(train_data),
                          validation_steps=len(valid_data),
                          callbacks=[learning_rate_reduction, early_stop, checkpoint],
                          verbose=2,
                          )

# plot training history
# The accuracy metric is logged as 'accuracy' by current Keras and as 'acc'
# by older releases; use whichever key this run produced.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'

fig, ax = plt.subplots(2, 1, figsize=(6, 6))
ax[0].plot(history.history['loss'], label="TrainLoss")
ax[0].plot(history.history['val_loss'], label="ValLoss")
ax[0].legend(loc='best', shadow=True)

ax[1].plot(history.history[acc_key], label="TrainAcc")
ax[1].plot(history.history['val_' + acc_key], label="ValAcc")
ax[1].legend(loc='best', shadow=True)
plt.show()

In [None]:
# Restore the weights written by the training checkpoint
vgg16_model.load_weights('topic_predictor_model.hdf5')

X_test = test_df['image_id'].values
Y_test = test_df['Dominant_Topic'].values
Y_test = tf.keras.utils.to_categorical(Y_test, num_classes=optimal_num_topics)

# prepare data generator (batch_size=1 so no test sample falls into a dropped partial batch)
test_data = DataGenerator(X_test, Y_test, batch_size=1, shuffle=False)

# predict on data
# Model.predict accepts Sequence objects directly; predict_generator is
# deprecated and no longer available in recent Keras releases.
pred_caption_topics_prob = vgg16_model.predict(test_data)
# most probable topic per image
pred_caption_topics = np.argmax(pred_caption_topics_prob, axis=1)

results_df = pd.DataFrame({ 'image_id':X_test, 'pred_topics':pred_caption_topics })
results_df.head()

In [None]:
# Evaluation score
import sklearn.metrics  # `import sklearn` alone does not guarantee the metrics submodule is loaded

# Topic ids as plain integers, so they compare cleanly with the argmax predictions.
y_true = test_df['Dominant_Topic'].values.astype(int)

# One label per probability column. Without `labels`, log_loss infers the
# classes from y_true and raises when the test split does not contain every topic.
all_topic_ids = np.arange(pred_caption_topics_prob.shape[1])

log_score = sklearn.metrics.log_loss(y_true, pred_caption_topics_prob, labels=all_topic_ids)
print('Log-loss score :', log_score)

acc_score = sklearn.metrics.accuracy_score(y_true, pred_caption_topics)
print('Accuracy score :', acc_score)

print('Confusion matrix :')
print(sklearn.metrics.confusion_matrix(y_true, pred_caption_topics))

In [None]:
# Count how often the text topic and the image-predicted topic agree,
# broken down by whether the tweet is labelled 'fake' or not.
# NOTE(review): the counters are computed but never displayed in this cell.
sameTopicFake = 0
sameTopicReal = 0
diffTopicFake = 0
diffTopicReal = 0

textTopics = test_df['Dominant_Topic'].to_list()
imageTopics = results_df['pred_topics'].to_list()
labels = test_df['label'].to_list()

for i in range(len(textTopics)):
  topics_match = textTopics[i] == imageTopics[i]
  is_fake = labels[i] == 'fake'
  if topics_match and is_fake:
    sameTopicFake += 1
  elif topics_match:
    sameTopicReal += 1
  elif is_fake:
    diffTopicFake += 1
  else:
    diffTopicReal += 1

# Totals follow from the four per-label counters.
sameTopic = sameTopicFake + sameTopicReal
diffTopic = diffTopicFake + diffTopicReal


