# Imports

In [2]:
import pandas as pd
from google.colab import drive
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pickle

# Load Data

In [3]:
# Mount Google Drive inside the Colab VM; later cells read from and write to
# paths below this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Location of the cleaned comments CSV on the mounted Drive.
file_path = '/content/drive/My Drive/hate_cleaned.csv'

# Read the CSV into a DataFrame; the bare name on the last line makes the
# notebook render a preview of it as the cell output.
data = pd.read_csv(filepath_or_buffer=file_path)
data


Unnamed: 0,comment,label,cleaned_comment,comment_length,comment_length_clean,cleaned_comment_string
0,dalits are lowlives,N,"['davit', 'lowlif']",19,2,davit lowlif
1,gay people are a burden to our society,N,"['gay', 'peopl', 'burden', 'societi']",38,4,gay peopl burden societi
2,Arabs are not welcome,N,"['arab', 'welcom']",21,2,arab welcom
3,I'm not saying we should actually eliminate he...,N,"['im', 'say', 'actual', 'elimin', 'herb', 'wis...",93,9,im say actual elimin herb wish natur becam ext...
4,bananas are for black people,N,"['banana', 'black', 'peopl']",28,3,banana black peopl
...,...,...,...,...,...,...
41139,Indian culture is so backwards,N,"['indian', 'cultur', 'backward']",30,3,indian cultur backward
41140,"Women don't want equality, they just want to b...",N,"['women', 'dont', 'want', 'equal', 'want', 'ch...",57,6,women dont want equal want charg
41141,fuck covid,P,"['fuck', 'covid']",10,2,fuck covid
41142,This computer is garbage,P,"['comput', 'garbag']",24,2,comput garbag


In [None]:
# Pull out the cleaned-comment column and cast every value to str so the
# downstream embedding code only ever sees Python strings.
comment_column = data['cleaned_comment_string']
sentences = comment_column.astype(str)
sentences


In [6]:
# Use the first GPU TensorFlow can see, if there is one.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    first_gpu = gpus[0]
    try:
        # Enable memory growth on that device, then make it the only GPU
        # visible to TensorFlow.
        tf.config.experimental.set_memory_growth(first_gpu, True)
        tf.config.set_visible_devices(first_gpu, 'GPU')
        print("Using GPU:", first_gpu)
    except RuntimeError as err:
        # Report the problem and carry on without changing the device setup.
        print(err)


Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [49]:

# Fetch the ELMo v3 module from TensorFlow Hub once; it is reused for every
# batch embedded below.
elmo = hub.load("https://tfhub.dev/google/elmo/3")


def elmo_embedding_batch(sentences):
    """Embed a batch of sentences with the module-level ``elmo`` model.

    Args:
        sentences: sequence of strings; it is wrapped in a ``tf.constant``
            and passed as the ``text`` input of the model's ``'default'``
            signature.

    Returns:
        The ``'elmo'`` entry of the signature output, converted to a numpy
        array. Callers in this notebook treat it as 3-D with the token
        axis at position 1.
    """
    text_tensor = tf.constant(sentences)
    default_signature = elmo.signatures['default']
    outputs = default_signature(text=text_tensor)
    return outputs['elmo'].numpy()

def _fit_to_length(embeddings, fixed_length):
    """Truncate or zero-pad a (rows, tokens, dim) array to ``fixed_length`` tokens."""
    current_length = embeddings.shape[1]
    if current_length > fixed_length:
        return embeddings[:, :fixed_length, :]
    if current_length < fixed_length:
        pad_width = fixed_length - current_length
        return np.pad(embeddings,
                      ((0, 0), (0, pad_width), (0, 0)),
                      mode='constant')
    return embeddings


# generate embeddings for whole series in batches with batch_size
def generate_elmo_embeddings(series, batch_size=100, fixed_length=128, embed_fn=None):
    """Embed every entry of ``series`` and return one row-aligned array.

    Row ``k`` of the result always belongs to entry ``k`` of ``series``, so
    the output can be paired positionally with labels taken from the same
    rows. Entries that are blank after stripping, and entries belonging to
    a batch whose embedding call failed, are kept as all-zero rows rather
    than being dropped.

    Args:
        series: pandas Series of sentences. Values are cast to ``str`` and
            stripped of surrounding whitespace.
            NOTE(review): ``astype(str)`` turns missing values into the
            literal text ``'nan'``, which is then embedded like any other
            word - confirm the input has no NaN or fill them upstream.
        batch_size: number of non-blank sentences per embedding call.
        fixed_length: size of axis 1 of the result; each batch is truncated
            or zero-padded to this many positions.
        embed_fn: callable taking a list of strings and returning a 3-D
            array with one row per string. Defaults to the notebook's
            ``elmo_embedding_batch``.

    Returns:
        numpy array of shape ``(len(series), fixed_length, dim)``, or
        ``None`` when no batch could be embedded (for example when every
        entry is blank).
    """
    if embed_fn is None:
        embed_fn = elmo_embedding_batch

    # clean series input
    sentences = [s.strip() for s in series.astype(str).tolist()]

    # remember where each non-blank sentence lives so rows can be put back
    # at their original position
    kept_positions = [pos for pos, text in enumerate(sentences) if text != ""]

    # embed in batches
    embedded_chunks = []
    for start in range(0, len(kept_positions), batch_size):
        positions = kept_positions[start:start + batch_size]
        batch = [sentences[pos] for pos in positions]
        try:
            embeddings = np.asarray(embed_fn(batch))
            if embeddings.shape[0] != len(positions):
                raise ValueError(
                    f"expected {len(positions)} rows, got {embeddings.shape[0]}")

            # truncate or pad embeddings as needed
            embeddings = _fit_to_length(embeddings, fixed_length)
        except Exception as e:
            # best effort: report the batch and leave its rows as zeros
            print(f"error in batch: {start // batch_size + 1}: {e}")
            continue

        embedded_chunks.append((positions, embeddings))

    if not embedded_chunks:
        return None

    # scatter every embedded batch into a zero-initialised, row-aligned array
    template = embedded_chunks[0][1]
    elmo_embeddings_array = np.zeros(
        (len(sentences), fixed_length) + template.shape[2:],
        dtype=template.dtype)
    for positions, embeddings in embedded_chunks:
        elmo_embeddings_array[positions] = embeddings
    return elmo_embeddings_array




In [51]:

def save_pickle(data, file_path):
    """Pickle ``data`` into the file at ``file_path``.

    The file is opened in binary write mode, so any existing file at that
    path is replaced. Returns ``None``.
    """
    with open(file_path, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

# Embed the cleaned comments in chunks of BATCH_SIZE rows and write each
# chunk to its own pickle, named after the chunk's starting row offset.
sentences = data['cleaned_comment_string']
BATCH_SIZE = 1000
for i in range(0, len(sentences), BATCH_SIZE):
    # positional slice, independent of the Series' index labels
    batch = sentences.iloc[i:i + BATCH_SIZE]
    elmo_embeddings = generate_elmo_embeddings(batch, batch_size=100)

    # generate_elmo_embeddings returns None when nothing could be embedded;
    # skip the chunk instead of failing on `.shape` in the middle of the run
    if elmo_embeddings is None:
        print(f"no embeddings produced for rows starting at {i}; skipping")
        continue

    print(elmo_embeddings.shape)

    save_pickle(elmo_embeddings, f'/content/drive/My Drive/Data_Results_New/elmo_embeddings_{i}.pkl')



(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(1000, 128, 1024)
(144, 128, 1024)
