## Install giphy_client Library

In [1]:
import sys
# Run pip through the interpreter that is executing this kernel
# (`sys.executable`) rather than whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install giphy_client



## Process Input Tweet

In [2]:
from difflib import SequenceMatcher
import preprocessor as p
import string
import spacy
import csv
import sys
import os

In [3]:
def find_similar(word, dictionary):
    """Return the entry of ``dictionary`` that is most similar to ``word``.

    Similarity is ``difflib.SequenceMatcher.ratio()`` between each entry and
    ``word``. When several entries share the best score, the one encountered
    first while iterating ``dictionary`` wins.

    Args:
        word: The string to look up.
        dictionary: An iterable of candidate strings (iterating a dict yields
            its keys).

    Returns:
        The best-matching candidate, or ``''`` when ``dictionary`` is empty or
        no candidate scores above zero.
    """
    # SequenceMatcher caches its analysis of the *second* sequence, so pin
    # `word` there once and only swap the first sequence inside the loop
    # instead of building a fresh matcher for every candidate.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(word)

    best_word = ''
    best_score = 0
    for candidate in dictionary:
        matcher.set_seq1(candidate)
        score = matcher.ratio() * 100
        # Strict comparison keeps the earliest candidate on ties.
        if score > best_score:
            best_score = score
            best_word = candidate
    return best_word

In [4]:
def setup_dictionary(path='data_preprocessing/raw_data/all_words.txt'):
    """Load the word list and map each word to its 0-based line number.

    Args:
        path: Text file with one word per line. Defaults to the project's
            ``all_words.txt``.

    Returns:
        dict: ``{word: line_index}``. Only newline characters are stripped
        from each line. If a word occurs on several lines, the last
        occurrence's index is kept.
    """
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, 'r') as file:
        return {line.strip('\n'): index for index, line in enumerate(file)}

In [5]:
def preprocess_tweet(tweet, filename):
    """Clean and tokenize ``tweet``, appending one CSV row per kept token.

    Each row written to ``filename`` is
    ``[text, lemma, matched_word, word_index, pos, dep, is_stop]`` where
    ``matched_word`` is the lower-cased lemma if it is in the word dictionary,
    otherwise the closest dictionary entry found by ``find_similar``.
    Tokens with no match at all are skipped.

    Args:
        tweet: Raw tweet text.
        filename: CSV file to append to (created if it does not exist).
            Rows accumulate across calls because the file is opened in
            append mode.
    """
    dictionary = setup_dictionary()

    # Have the tweet preprocessor strip URLs, mentions, reserved words,
    # emojis, smileys and numbers.
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    nlp = spacy.load("en_core_web_sm")

    cleaned = p.clean(tweet)
    # Drop a leading ':' (presumably what remains of a removed "RT @user:" prefix).
    if cleaned.startswith(':'):
        cleaned = cleaned[1:]

    skipped_pos = ('SPACE', 'PUNCT', 'NUM', 'X')

    # newline='' lets the csv module control line endings; without it,
    # platforms that translate '\n' end up with blank rows between records.
    with open(filename, "a", newline='') as file:
        writer = csv.writer(file)
        for token in nlp(cleaned):
            # Skip pronoun placeholders, whitespace, punctuation, numbers
            # and tokens tagged as "other".
            if (token.lemma_ == '-PRON-'
                    or token.pos_ in skipped_pos
                    or token.text in string.punctuation
                    or token.text.isdigit()):
                continue

            lower_case = token.lemma_.lower()
            if lower_case not in dictionary:
                # Fall back to the closest known word; '' means nothing matched.
                lower_case = find_similar(lower_case, dictionary)
                if lower_case == '':
                    continue
            word_index = dictionary[lower_case]

            writer.writerow([token.text, token.lemma_, lower_case, word_index, token.pos_, token.dep_, token.is_stop])

## Predict Keywords using Model

In [6]:
import csv
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Activation, Flatten, Input, Concatenate
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa 
import matplotlib.pyplot as plt

# Only surface TensorFlow log messages at ERROR level.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [7]:
# --- Hyperparameters ---
# NOTE(review): `threshold`, `hidden_size`, `epochs` and
# `use_unlabeled_dataset` are not referenced by any code visible in this
# notebook (the LSTM width and the keyword cutoff are literals elsewhere);
# presumably carried over from the training notebook — confirm before removing.
threshold = 0.01
hidden_size = 100
word_embedding_dim = 300  # output dimension passed to the Embedding layer
epochs = 10
use_unlabeled_dataset = True

# --- Dataset sizes ---
# NOTE(review): not referenced by the inference code visible here.
labeled_dataset_size = 1830
train_dataset_size = 900
validation_dataset_size = 100
test_dataset_size = 830
unlabeled_dataset_size = 4000

# Lower-cased label vocabularies for one-hot encoding. The position of a label
# in the list is its column in the encoded vector; a label that is not in the
# list encodes to all zeros.
# NOTE(review): the list contains "CONJ" but not "CCONJ" — confirm this
# matches the tags the tagger emits and the model was trained on.
pos_list = np.char.lower(["ADJ","ADP","ADV","AUX","CONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","X"])
dep_list = np.char.lower(["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "xcomp"])

# Widths of the one-hot POS / dependency feature blocks.
pos_dim = len(pos_list)
dep_dim = len(dep_list)

In [8]:
# Helper function to one-hot encode the labels
def one_hot(vec, dic):
    """One-hot encode string labels against a label vocabulary.

    Args:
        vec: 1-D array-like of string labels; compared case-insensitively by
            lower-casing them first.
        dic: 1-D array of (already lower-cased) vocabulary labels.

    Returns:
        ``int8`` array of shape ``(len(vec), len(dic))`` where entry
        ``[i, j]`` is 1 iff ``vec[i].lower() == dic[j]``. Labels missing from
        ``dic`` give an all-zero row. Empty input gives shape
        ``(0, len(dic))``.
    """
    labels = np.char.lower(vec)
    # Broadcast (n, 1) against (k,) to compare every label with every
    # vocabulary entry at once, instead of looping row by row in Python.
    return (labels[:, np.newaxis] == np.asarray(dic)).astype('i1')

In [9]:
def reshape_input(filename):
    """Read a preprocessed-tweet CSV and build the model input array.

    Args:
        filename: CSV with seven columns per row: text, simplified text,
            best match, word index, POS tag, dependency label, stop flag.

    Returns:
        tuple: ``(data, tweet)`` where ``data`` has shape
        ``(1, n_tokens, 1 + pos_dim + dep_dim)`` — the word index followed by
        the one-hot POS and dependency features — and ``tweet`` is the
        structured array of the parsed rows.
    """
    with open(filename, newline='') as csvfile:
        # Read with the default dialect so quoting matches the default
        # csv.writer that produces this file; a custom quotechar would
        # mis-split any field the writer had to quote. Blank rows are skipped.
        rows = [tuple(row) for row in csv.reader(csvfile) if row]

    # String fields are fixed-width, so values longer than 20 characters are truncated.
    tweet = np.array(rows, dtype=[("text", 'U20'), ("simplified_text", 'U20'), ("best_match", 'U20'), ("index", int), ("pos", 'U20'), ("dep", 'U20'), ("stop", 'U5')])

    # Batch of one sequence: (1, n_tokens, n_features) for each feature group.
    text = tweet["index"].reshape(1, -1, 1)
    pos = one_hot(tweet["pos"], pos_list).reshape(1, -1, pos_dim)
    dep = one_hot(tweet["dep"], dep_list).reshape(1, -1, dep_dim)
    data = np.concatenate((text, pos, dep), axis=-1)
    return data, tweet

In [10]:
# Variable-length sequences; each step carries the word index followed by the
# one-hot POS and dependency features (pos_dim + dep_dim + 1 values).
inputs = Input(shape=(None, pos_dim+dep_dim+1))
# Feature 0 is the word index: embed it (380000 is the Embedding input_dim).
x = Embedding(380000, word_embedding_dim)(inputs[:,:,0])
# Put the remaining (one-hot) features in front of the word embedding.
x = Concatenate(axis=-1)([inputs[:,:,1:], x])
# return_sequences=True keeps one output per token.
x = Bidirectional(LSTM(100, return_sequences=True))(x)
# Two independent sigmoid scores per token.
outputs = Dense(2, activation=tf.nn.sigmoid)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
# model.summary()

opt = tf.keras.optimizers.Adam(
    learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False
)
model.compile(loss=BinaryCrossentropy(), optimizer=opt)
# Restore weights from the checkpoint; the layers built above must match the
# architecture the checkpoint was saved from.
model.load_weights('./checkpoint/checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f70657384c0>

In [11]:
def make_prediction(model, data, tweet, keyword_threshold=0.32):
    """Select the tokens of a tweet that the model scores as keywords.

    Args:
        model: Object with a ``predict(data)`` method whose result is indexed
            as ``[batch][token][score]``.
        data: Model input for a single tweet (batch of one).
        tweet: Per-token records aligned with ``data``; element 0 of each
            record is the token text.
        keyword_threshold: A token is kept when its score at index 1 is
            strictly greater than this value. Defaults to 0.32.

    Returns:
        set: Texts of the tokens selected as keywords.
    """
    prediction = model.predict(data)
    # Only the first (and only) batch element is inspected.
    token_scores = prediction[0]
    return {
        tweet[i][0]
        for i, scores in enumerate(token_scores)
        if scores[1] > keyword_threshold
    }

## Fetch GIFs using GIPHY API

In [12]:
import time
import giphy_client
from giphy_client.rest import ApiException
from pprint import pprint
from IPython.display import HTML

def generate_gif(keywords):
    """Search GIPHY for each keyword and return the results as HTML.

    Args:
        keywords: Iterable of search terms.

    Returns:
        str: Concatenated ``<img>`` tags (up to two per keyword) pointing at
        the "downsized" rendition of each result; ``""`` when ``keywords`` is
        empty.

    The API key is read from the ``GIPHY_API_KEY`` environment variable and
    falls back to the key that was previously hard-coded here.
    """
    # Local imports keep this cell self-contained.
    import html
    import os

    keywords = list(keywords)
    if not keywords:
        # Nothing to search for: skip creating the API client.
        return ""

    # create an instance of the API class
    api_instance = giphy_client.DefaultApi()
    api_key = os.environ.get('GIPHY_API_KEY', 'dc6zaTOxFJmzC')  # str | Giphy API Key.
    limit = 2  # int | The maximum number of records to return.
    offset = 0  # int | Results offset.
    rating = 'g'  # str | Filters results by specified rating.
    lang = 'en'  # str | 2-letter ISO 639-1 language code.
    fmt = 'json'  # str | Expected response format.

    img_tags = []
    for query in keywords:
        # Search Endpoint
        api_response = api_instance.gifs_search_get(api_key, query, limit=limit, offset=offset, rating=rating, lang=lang, fmt=fmt)
        for result in api_response.data:
            # Escape the URL so quotes or '&' in it cannot break out of the
            # single-quoted src attribute.
            img_tags.append("<img src='" + html.escape(result.images.downsized.url, quote=True) + "'>")

    return "".join(img_tags)

In [15]:
INPUT_CSV = "user_input.csv"

tweet_text = input("Enter your tweet: ")

# preprocess_tweet() appends to its output file, so remove any leftover from
# a previous run; otherwise old tokens would be mixed into this prediction.
if os.path.exists(INPUT_CSV):
    os.remove(INPUT_CSV)

preprocess_tweet(tweet_text, INPUT_CSV)
print("The entered tweet is:", tweet_text, "\n")

# `tweet` holds the parsed per-token records, `data` the model input.
data, tweet = reshape_input(INPUT_CSV)

keywords = make_prediction(model, data, tweet)
print("The predicted keywords are", keywords, "\n")

gif = generate_gif(keywords)
print("Recommended GIFs are ... ")
# Must stay the last top-level expression so the notebook renders the HTML.
HTML(gif)

Enter your tweet: I love hot pot
The entered tweet is: I love hot pot 

The predicted keywords are {'pot'} 

Recommended GIFs are ... 


In [14]:
# Delete the temporary token file. Tolerate it already being gone so this
# cell can be re-run, or run before any tweet was processed, without raising.
try:
    os.remove("user_input.csv")
except FileNotFoundError:
    pass