In [None]:
#### GPU Check ####
# List the GPU devices visible to this runtime (`nvidia-smi -L` prints one line per GPU).
# NOTE(review): the model cell below builds CuDNNLSTM layers, which presumably need a
# GPU runtime -- confirm one is listed here before training.

!nvidia-smi -L

In [34]:
#### Setup Kaggle ####

# Mount Google Drive so the project folder is reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

import os
# Tell the Kaggle CLI where to look for its configuration.
# NOTE(review): presumably this folder holds the kaggle.json API token -- verify.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Kaggle"
# Make the project folder the working directory for the following cells.
%cd '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/COMPSCI703 - PG/Colab


In [None]:
#### Downloading Files ####
# Every download command in this cell is commented out; as written, the cell only
# changes the working directory. Uncomment the command needed for a fresh setup.

# Kaggle-hosted embeddings are downloaded into Datasets/Kaggle/.
%cd '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/Kaggle/'


# ! kaggle datasets download -d iezepov/gensim-embeddings-dataset

# ETHOS data and the fastText vectors are downloaded into Datasets/.
%cd '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/'


# NOTE(review): the first URL below is a GitHub "blob" page, so wget would save the HTML
# page rather than the CSV; presumably the raw.githubusercontent.com form (as used on the
# second line) is what is wanted -- confirm before uncommenting.
# ! wget 'https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/hs_data/en_dataset_with_stop_words.csv'
# ! wget 'https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/master/ethos/ethos_data/Ethos_Dataset_Multi_Label.csv'
# ! wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'


# Return to the project folder.
%cd '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/'

# Clean-up if required.

# rm -rf


In [None]:
#### Unzipping Files #####

# Work inside the datasets folder, where the downloaded archives live.
%cd '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/'

# The escaped wildcard (\*.zip) is expanded by unzip itself, so every archive in the
# folder is extracted; the archives are deleted only if extraction succeeded (&&).
! unzip \*.zip  && rm *.zip

# Return to the project folder (same 'MyDrive' spelling as every other path in this notebook).
%cd '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/'

In [48]:
##### TODO
## 1 - FIX GloVe embedding UTF-8 errors
## 2 - FIX output prediction
## 3 - FIX output prediction .CSV
## 4 - FIX unknown error with modelling leading to averaged predictions across all IDs
  

##### IMPORT BIN #####

import gc
import pickle
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from keras.losses import binary_crossentropy
from keras import backend as K

######################


##### PRE-TRAINED INPUTS #####
# Text-format word-vector files. One embedding matrix is built per file and the
# matrices are concatenated along the feature axis further down in this cell.
EMBEDDING_FILES = [
    '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/Fasttext/crawl-300d-2M.vec',
    # NOTE(review): the entry below is a `.npy` file, presumably a binary NumPy dump;
    # load_embeddings() parses space-separated text lines, so it likely cannot read
    # this file as-is (possibly the "UTF-8 errors" from the TODO list) -- confirm.
    # '/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/Kaggle/gensim-embeddings/glove.840B.300d.gensim.vectors.npy'
]
######################


##### Model Settings #####
# Number of independently built models whose predictions are blended.
NUM_MODELS = 2
# Mini-batch size passed to model.fit().
BATCH_SIZE = 512
# Units per direction in each CuDNNLSTM layer.
LSTM_UNITS = 128
# Width of the dense residual blocks in build_model(). It has to equal the width of the
# concatenated max+average pooling output so the `add` skip connections line up.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Training epochs per model; predictions are taken after every epoch.
EPOCHS = 4
# Token sequences are padded/truncated to this length.
MAX_LEN = 220
######################


##### DEF Party #####
# Splits one parsed embedding line into its word and a float32 coefficient vector
def get_coefs(word, *arr):
    """Pair a word with its embedding coefficients.

    Args:
        word: the token the coefficients belong to.
        *arr: the coefficient values (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 NumPy array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector


# Loads a text-format embedding file into a "dict" of word -> float32 vector
def load_embeddings(path):
    """Read a whitespace-separated word-vector file into a dictionary.

    Each data line is expected to look like ``word v1 v2 ... vN``.

    Args:
        path: location of the text embedding file.

    Returns:
        dict mapping each word to the array produced by ``get_coefs``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8 text.
        ValueError: if a coefficient cannot be converted to a float.
    """
    embeddings = {}
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            # Only trim the right-hand side: trailing spaces/newlines are noise, but a
            # token made of leading whitespace must stay in the "word" position.
            stripped = line.rstrip()
            if not stripped:
                continue  # blank line
            tokens = stripped.split(' ')
            # fastText-style .vec files start with a "<vocab_size> <dim>" header line;
            # it is metadata, not an embedding.
            if line_no == 0 and len(tokens) == 2 and all(t.isdigit() for t in tokens):
                continue
            word, vector = get_coefs(*tokens)
            embeddings[word] = vector
    return embeddings

# Builds the embedding matrix for a tokenizer vocabulary, returning "embedding_matrix";
# words without a usable pre-trained vector keep an all-zero row
def matrix_builder(word_index, path, embedding_dim=300):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Args:
        word_index: mapping of word -> integer row index (row 0 is left unused,
            hence the ``+ 1`` in the row count).
        path: embedding file to load via ``load_embeddings``.
        embedding_dim: expected vector length; defaults to 300.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows for words that
        are missing from the embedding file stay zero.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is None:
            continue  # out-of-vocabulary word: keep the zero row
        # A vector of the wrong length must not be assigned: a length-1 entry (e.g. a
        # file header parsed as a word) would silently broadcast across the whole row.
        if np.shape(vector) != (embedding_dim,):
            continue
        embedding_matrix[i] = vector
    return embedding_matrix


# Sample-weighted binary cross-entropy used as the loss of the main output
def loss_rate(y_true, y_pred):
    """Binary cross-entropy scaled by a per-sample weight.

    Args:
        y_true: 2-column tensor; column 0 is the binary label and column 1 the
            weight applied to that sample's loss.
        y_pred: predictions of the main output.

    Returns:
        The cross-entropy between the labels and ``y_pred``, multiplied by the
        weights in column 1.
    """
    labels = K.reshape(y_true[:, 0], (-1, 1))
    sample_weights = y_true[:, 1]
    return binary_crossentropy(labels, y_pred) * sample_weights

# Assembles and compiles the two-output bidirectional LSTM classifier, returning the "model"
def build_model(embedding_matrix, num_alt_targets, loss_weight):
    """Create and compile the network.

    Args:
        embedding_matrix: 2-D array of pre-trained word vectors; used as frozen
            (non-trainable) weights of the embedding layer.
        num_alt_targets: number of units in the auxiliary sigmoid output.
        loss_weight: weight of the main output's loss (the auxiliary loss has weight 1.0).

    Returns:
        A compiled Keras ``Model`` with outputs ``[result, alt_result]``.
    """
    vocab_size, embedding_dim = embedding_matrix.shape

    # Padded token ids -> frozen pre-trained embeddings -> spatial dropout
    token_ids = Input(shape=(MAX_LEN,))
    embedded = Embedding(vocab_size, embedding_dim,
                         weights=[embedding_matrix], trainable=False)(token_ids)
    sequence_features = SpatialDropout1D(0.3)(embedded)

    # Two stacked bidirectional LSTM layers, each returning the full sequence
    for _ in range(2):
        sequence_features = Bidirectional(
            CuDNNLSTM(LSTM_UNITS, return_sequences=True))(sequence_features)

    # Collapse the time axis with max and average pooling, side by side
    max_pooled = GlobalMaxPooling1D()(sequence_features)
    avg_pooled = GlobalAveragePooling1D()(sequence_features)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two dense blocks with skip connections
    for _ in range(2):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
        hidden = add([hidden, dense_out])

    # Main output: a single sigmoid unit
    result = Dense(1, activation='sigmoid')(hidden)
    # Auxiliary output: one sigmoid unit per alternative target
    alt_result = Dense(num_alt_targets, activation='sigmoid')(hidden)

    # Main output uses the sample-weighted loss; everything is optimised with Adam
    model = Model(inputs=token_ids, outputs=[result, alt_result])
    model.compile(
        loss=[loss_rate, 'binary_crossentropy'],
        loss_weights=[loss_weight, 1.0],
        optimizer='adam',
    )
    return model

# Pre-process used to sanitize text data for use in the model.
def preprocess(data):
    """Replace punctuation and special symbols with spaces.

    Args:
        data: pandas Series of comments; every value is converted to ``str`` first.

    Returns:
        A pandas Series (same index) in which each character listed in ``punct``
        has been replaced by a single space.
    """
    ## Credit for preprocess goes to Gabriel Preda @ https://www.kaggle.com/gpreda via https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    # The backslash before the multiplication sign is written as an explicit '\\': the
    # bare '\×' form is an invalid escape sequence that newer Python versions warn about.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + \
        '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # One translation table does all replacements in a single pass per string,
    # instead of one str.replace() call per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))

    return data.astype(str).apply(lambda comment: comment.translate(to_space))


def testing(predictions):
    """Print the given predictions to standard output (debugging aid)."""
    print(predictions)
    
######################


##### INPUT DATASET #####
# ETHOS binary dataset: ';'-separated, with a text column `comment` and a float label `isHate`.
ethos_dataset = pd.read_csv(
    "/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/ETHOS/Ethos_Dataset_Binary.csv",
    sep=";",
    dtype={'comment': 'str', 'isHate': 'float'},
)

# NOTE: the "test" comments are the very same rows that are used for training.
train = ethos_dataset
test = ethos_dataset['comment']

# Columns treated as identity/subgroup flags when computing the sample weights.
identity_columns = ['isHate']
training_data = preprocess(train['comment'])
#########################


##### Weights Settings #####
# Per-sample loss weights, built up in steps of 1/4. The built-in `int` is used for the
# casts because the `np.int` alias no longer exists in current NumPy releases.
target_positive = train['isHate'].values >= 0.5
target_negative = train['isHate'].values < 0.5
identity_values = train[identity_columns].fillna(0).values
any_identity_positive = (identity_values >= 0.5).any(axis=1)
any_identity_negative = (identity_values < 0.5).any(axis=1)

# Overall weights
weights = np.ones((len(training_data),)) / 4
# Subgroup
weights += any_identity_positive.astype(int) / 4
# Background Positive, Subgroup Negative
weights += (target_positive & any_identity_negative).astype(int) / 4
# Background Negative, Subgroup Positive
weights += (target_negative & any_identity_positive).astype(int) / 4
# Scale factor for the main loss so that the average sample weight becomes 1.
loss_weight = 1.0 / weights.mean()
#########################


##### TARGETS, TOKENISATION AND EMBEDDINGS #####
# NOTE: despite the "testing" in their names, `main_testing_data` and `alt_testing_data`
# are the training targets handed to model.fit().
# Main target: column 0 = binary label, column 1 = per-sample loss weight (see loss_rate).
# The built-in `int` replaces the `np.int` alias, which current NumPy releases no longer provide.
main_testing_data = np.vstack(
    [(train['isHate'].values >= 0.5).astype(int), weights]).T
# Auxiliary target: the raw `isHate` score as a single-column array.
alt_testing_data = train[['isHate']].values
testing_data = preprocess(test)

# Fit one vocabulary over both the training and the prediction comments.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(training_data) + list(testing_data))

# Convert comments to integer sequences, padded/truncated to MAX_LEN tokens.
training_data = sequence.pad_sequences(
    tokenizer.texts_to_sequences(training_data), maxlen=MAX_LEN)
testing_data = sequence.pad_sequences(
    tokenizer.texts_to_sequences(testing_data), maxlen=MAX_LEN)
#########################

# One embedding matrix per file in EMBEDDING_FILES, joined along the feature axis.
embedding_matrix = np.concatenate(
    [matrix_builder(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)



#### MODELING WORK ####
# Pickle is used to dump the prediction inputs to a file -- reducing memory usage with large datasets.
# Model based on that found by Tanrei(nama) @ https://www.kaggle.com/tanreinama/ via https://www.kaggle.com/tanreinama/simple-lstm-using-identity-parameters-solution/
#
# Park the padded prediction sequences on disk; they are reloaded only for each predict() call.
with open('tmp.pickle', mode='wb') as f:
    pickle.dump(testing_data, f)  # use tmp file to reduce memory

# Free everything that is no longer needed before training starts.
del identity_columns, weights, tokenizer, train, test, testing_data
gc.collect()


# One prediction vector per (model, epoch) checkpoint ...
checkpoint_predictions = []
# ... and the blending weight of that checkpoint (the name `weights` is reused here;
# the per-sample weights were deleted above).
weights = []

for model_idx in range(NUM_MODELS):
    # A fresh model per round; the auxiliary head gets one unit per auxiliary target column.
    model = build_model(
        embedding_matrix, alt_testing_data.shape[-1], loss_weight)
    for global_epoch in range(EPOCHS):
        # Train exactly one epoch per fit() call so predictions can be taken after each epoch.
        model.fit(
            training_data,
            [main_testing_data, alt_testing_data],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=1,
            callbacks=[
                # Learning rate starts at 1e-3 and is multiplied by 0.6 per outer epoch;
                # the scheduler's own `epoch` argument is ignored.
                LearningRateScheduler(
                    lambda epoch: 1e-3 * (0.6 ** global_epoch))
            ]
        )
        with open('tmp.pickle', mode='rb') as f:
            testing_data = pickle.load(f)  # use tmp file to reduce memory
        # Keep only the first model output (the main `result` head), flattened to 1-D.
        checkpoint_predictions.append(model.predict(
            testing_data, batch_size=2048)[0].flatten())
        del testing_data
        gc.collect()
        # Later epochs get exponentially larger weight in the final average.
        weights.append(2 ** global_epoch)
    del model
    gc.collect()
    
############################


# Blend all checkpoint predictions into one score per comment, weighting each
# checkpoint by the matching entry of `weights`.
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)


## Re-load the dataset for printing and writing (the original frames were deleted before training)
ethos_dataset2 = pd.read_csv("/content/gdrive/MyDrive/COMPSCI703 - PG/Colab/Datasets/ETHOS/Ethos_Dataset_Binary.csv",
                             sep=";", dtype={'comment': 'str', 'isHate': 'float'})
testset = ethos_dataset2['comment']

# Predictions were produced in dataset row order, so they pair up with the comments
# one-to-one; fail loudly if the two ever get out of step.
if len(testset) != len(predictions):
    raise ValueError(
        'Got %d predictions for %d comments' % (len(predictions), len(testset)))

# One row per comment: the comment text serves as the id, next to its own prediction.
final = pd.DataFrame({
    'id': testset.values,
    'prediction': predictions,
})

testing(predictions)

final.to_csv('prediction.csv', index=False)

# Done

[0.5493085  0.56906547 0.65150362 0.55820562 0.5648252  0.64933557
 0.65688896 0.65205853 0.58114767 0.67706198 0.68913924 0.63175967
 0.63509474 0.62590588 0.5885163  0.71767389 0.70449015 0.62978065
 0.6920278  0.61460235 0.80613761 0.76422913 0.68905533 0.53099745
 0.65014579 0.60607123 0.61637787 0.6126469  0.55099254 0.63091412
 0.71317287 0.56832383 0.64962908 0.59947742 0.527946   0.68464074
 0.66781985 0.63647955 0.61131734 0.69991647 0.62962679 0.6662313
 0.70274416 0.8333305  0.75393456 0.61134636 0.57187253 0.64203812
 0.73946496 0.61010085 0.59993409 0.60778453 0.64224785 0.68172192
 0.61011011 0.56986658 0.70862737 0.67010986 0.51561591 0.58736236
 0.71637002 0.60347387 0.72936154 0.77571994 0.60405012 0.70883035
 0.69134007 0.58963975 0.64441389 0.61895745 0.61701154 0.58375188
 0.5877177  0.56750155 0.68033997 0.69395009 0.59538788 0.58759862
 0.59560099 0.58698346 0.74652044 0.59950228 0.58796513 0.5843694
 0.65583253 0.70355455 0.63563458 0.74990367 0.68788028 0.569217