In [16]:
# import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.python.keras import preprocessing
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [2]:
# read the tab-separated meme metadata, then keep one row per unique HashId
df = pd.read_csv("memes_data.tsv", sep="\t")
df1 = df.drop_duplicates(subset="HashId", keep="first")

In [3]:
# restrict the frame to the rows whose images were downloaded locally:
# the total dataset is <200000 memes, but only 113914 of them were downloaded.
# Dropping duplicates left gaps in the index labels, so the cut-off is the
# original index label 114517 (inclusive) rather than a row count.
df1 = df1[df1.index <= 114517]

In [4]:
# directory holding the locally downloaded meme images (img0.jpg, img1.jpg, ...)
IMAGE_DIR = 'D:/lh_final_data_images'

# one entry per row of df1: the image as a numpy array, or NaN when it could not be loaded
images_as_array = []

# NOTE(review): files are looked up by row position (0 .. len(df1)-1), not by the
# DataFrame index label -- confirm this matches how the images were numbered on download
for i in range(len(df1)):
    try:
        # resize to 64x64 and convert to grayscale to reduce the amount of data
        image = load_img(f'{IMAGE_DIR}/img{i}.jpg', color_mode='grayscale', target_size=(64, 64))
        input_arr = img_to_array(image)
        images_as_array.append(input_arr)
    except Exception:
        # image missing or corrupted: keep a NaN placeholder so the list stays aligned
        # with the rows of df1. Catching Exception instead of using a bare `except`
        # lets KeyboardInterrupt / SystemExit still stop this long-running loop.
        images_as_array.append(np.nan)

In [5]:
# number of entries collected: one per row of df1, counting the NaN placeholders
# appended for images that failed to load
len(images_as_array)

113914

In [6]:
# checking shape of a single numpy array image
# (only works when the first entry is a real array: a NaN placeholder has no .shape)
images_as_array[0].shape

(64, 64, 1)

In [7]:
# attach the per-row image arrays (or NaN placeholders) to the dataframe as a new column
df1 = df1.assign(img_array=images_as_array)

In [8]:
# count missing values per column (a NaN in img_array marks an image that failed to load)
df1.isna().sum()

AltText           0
CaptionText       0
ImageURL          0
HashId            0
MemeLabel         0
img_array      1574
dtype: int64

In [9]:
# remove every row that contains a null value; per the count above these are the
# 1574 rows whose image was corrupted and could not be loaded
df1 = df1.dropna()

In [10]:
# confirm that no missing values remain in any column
df1.isna().sum()

AltText        0
CaptionText    0
ImageURL       0
HashId         0
MemeLabel      0
img_array      0
dtype: int64

In [11]:
# divide every value of each image array by 255 to scale the pixels between 0 and 1
# (re-running this cell divides again, so run it only once per fresh load)
df1['img_array'] = df1['img_array'].div(255)

In [12]:
# keep only alphanumeric characters in the caption (label) column by replacing
# every other character with a space.
# regex=True is passed explicitly because the pattern is a regular expression:
# Series.str.replace treats the pattern as a literal string by default from
# pandas 2.0 on (older versions emit a FutureWarning), which would turn this
# cleaning step into a silent no-op.
df1['CaptionText'] = df1['CaptionText'].str.replace('[^a-zA-Z0-9]', ' ', regex=True)

  df1['CaptionText'] = df1.CaptionText.str.replace('[^a-zA-Z0-9]', ' ')


In [19]:
# trim leading and trailing whitespace from every caption
df1['CaptionText'] = df1.CaptionText.str.strip()

In [31]:
# character-level tokenization of the captions (case is preserved): every caption
# becomes a sequence of integer character ids, zero-padded at the end to maxlen=128
# NOTE(review): pad_sequences truncates from the front by default (truncating='pre'),
# so captions longer than maxlen lose their first characters -- confirm this is intended
# NOTE(review): the recorded output of this cell shows a width of 179 rather than 128,
# so it appears to come from a run with a different maxlen -- re-run to refresh it
maxlen = 128
t = Tokenizer(lower=False, char_level=True)
t.fit_on_texts(df1['CaptionText'])
tokenized = t.texts_to_sequences(df1['CaptionText'])
padded_names = preprocessing.sequence.pad_sequences(
    tokenized,
    padding='post',
    maxlen=maxlen,
)
print(padded_names.shape)  # shape of the padded text labels

(112340, 179)


In [32]:
# show the character -> integer id mapping built by the tokenizer
print(t.word_index)

{' ': 1, 'E': 2, 'O': 3, 'T': 4, 'A': 5, 'I': 6, 'N': 7, 'S': 8, 'R': 9, 'e': 10, 'H': 11, 'L': 12, 'M': 13, 'D': 14, 'U': 15, 'o': 16, 'Y': 17, 't': 18, 'a': 19, 'G': 20, 'C': 21, 'n': 22, 'i': 23, 'W': 24, 's': 25, 'r': 26, 'P': 27, 'F': 28, 'B': 29, 'h': 30, 'l': 31, 'K': 32, 'u': 33, 'd': 34, 'm': 35, 'y': 36, 'V': 37, 'g': 38, 'c': 39, 'p': 40, 'w': 41, 'f': 42, 'b': 43, 'k': 44, 'v': 45, '0': 46, 'J': 47, '1': 48, 'X': 49, '2': 50, 'Z': 51, '9': 52, '5': 53, 'x': 54, '3': 55, 'Q': 56, 'j': 57, '4': 58, 'z': 59, '6': 60, '8': 61, '7': 62, 'q': 63}


In [52]:
# inspect the padded integer sequence of the first caption (trailing zeros are padding)
padded_names[0]

array([35, 36,  1, 35, 16, 35,  1, 41, 30, 10, 22,  1, 23,  1, 40, 31, 19,
       36,  1, 41, 23, 18, 30,  1, 35, 36,  1, 40, 30, 16, 22, 10,  1, 42,
       16, 26,  1, 55, 46,  1, 35, 23, 22, 33, 18, 10, 25,  1, 35, 36,  1,
       35, 16, 35,  1, 41, 30, 10, 22,  1, 25, 30, 10,  1, 31, 16, 16, 44,
       25,  1, 19, 18,  1, 23, 22, 25, 18, 19, 38, 26, 19, 35,  1, 40, 30,
       16, 22, 10,  1, 42, 16, 26,  1, 18, 41, 16,  1, 30, 16, 33, 26, 25,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0])

In [36]:
# number of padded caption sequences
len(padded_names)

112340

In [53]:
# keep just the two columns to export: the caption text (label) and the image array (input)
df2 = df1.loc[:, ['CaptionText', 'img_array']]

In [54]:
# display the label/input dataframe
df2

Unnamed: 0,CaptionText,img_array
0,my mom when i play with my phone for 30 minute...,"[[[0.5882353], [0.59607846], [0.59607846], [0...."
1,Doing your own research for a test Copy and pa...,"[[[0.5803922], [0.5921569], [0.59607846], [0.6..."
2,1 000 000 followers on tik tok 1 point on imgflip,"[[[0.58431375], [0.5921569], [0.59607846], [0...."
3,Making original memes Following the idea of an...,"[[[0.5803922], [0.5921569], [0.59607846], [0.6..."
4,M,"[[[0.58431375], [0.5921569], [0.59607846], [0...."
...,...,...
114513,REALIZES SUCH A FAILURE I AM WITH ONLY 2 000 P...,"[[[0.36862746], [0.44313726], [0.3529412], [0...."
114514,HERE I HEARD YOU ARE FROM NEW HAMPSHIRE,"[[[0.36862746], [0.4509804], [0.3529412], [0.1..."
114515,THAT MOMENT WHEN YOUR LEAST FAVORITE CHARACTER...,"[[[0.36862746], [0.4509804], [0.3529412], [0.1..."
114516,I M GONNA FOLLOW JESUS EXAMPLE SHOW UP TO SO...,"[[[0.36862746], [0.4509804], [0.3254902], [0.1..."


In [55]:
# write label and input data dataframe into tsv file
# NOTE: each img_array cell is written as str(ndarray), and numpy abbreviates arrays
# of more than 1000 elements with '...'. The 64x64 (4096-value) images are therefore
# NOT recoverable from the TSV; the file is kept unchanged for backward compatibility.
df2.to_csv('train_test_memes.tsv', index=False, sep='\t')

# lossless copy of the pixel data: one stacked array whose entry k is the image that
# belongs to data row k of the TSV written above (needs enough memory for one
# contiguous copy of all images)
np.save('train_test_memes_images.npy', np.stack(df2['img_array'].tolist()))