# PIP INSTALLS & IMPORTS NEEDED

In [None]:
# Install the TensorFlow add-on packages imported by this notebook
!pip install tensorflow-hub
!pip install tensorflow-text # Needed for loading universal-sentence-encoder-cmlm/multilingual-preprocess
!pip install tf-models-official
!pip install bert-for-tf2


Collecting tensorflow-text
  Downloading tensorflow_text-2.8.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.3 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 57.5 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.1 tf-estimator-nightly-2.8.0.dev2021122109
Collecting tf-models-official
  Downloading tf_models_official-2.8.0-py2.py3-none-any.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 4.0 MB/s 
Collecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 10.6 MB/s 
[?25hCollecting tf-slim>=1.1.0
  Downloading tf_slim-1.1.0-py2.py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 70.8 MB/s 
[?25hColl

## VARIOUS IMPORTS

In [None]:
#  Various other import statements
import bert
from official.nlp import optimization
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 

import tensorflow.keras as keras
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
import os
from keras.layers import Input, Lambda, Dense, Dropout
from tensorflow.keras.metrics import Metric

# early callback
from tensorflow.keras.callbacks import EarlyStopping



In [None]:
# Additional imports
import csv
import pandas as pd
import numpy as np
import json



# CUSTOM METRIC METHODS

## A CUSTOM CLASS FOR CALCULATING F1 SCORE

In [None]:
class StatefullBinaryFBeta(Metric):
  """Streaming F-beta score for binary classification.

  Running totals of true positives, actual positives and predicted positives
  are accumulated over batches; precision, recall and F-beta are derived from
  those totals in `result()`.

  Args:
    name: metric name reported by Keras.
    beta: weight of recall relative to precision; `beta=1` gives the F1 score.
    threshold: a prediction counts as positive when it is >= this value.
    epsilon: small constant added to denominators to avoid division by zero.
    **kwargs: forwarded to the base `Metric` constructor.
  """

  def __init__(self, name='state_full_binary_fbeta', beta=1, threshold=0.5, epsilon=1e-7, **kwargs):
    super(StatefullBinaryFBeta, self).__init__(name=name, **kwargs)

    # Running totals; each variable is named after the quantity it holds
    self.tp = self.add_weight(name='tp', initializer='zeros')
    self.actual_positive = self.add_weight(name='actual_positive', initializer='zeros')
    self.predicted_positive = self.add_weight(name='predicted_positive', initializer='zeros')

    # Fixed configuration of this metric instance
    self.beta_squared = beta**2
    self.threshold = threshold
    self.epsilon = epsilon

  def update_state(self, ytrue, ypred, sample_weight=None):
    """Add one batch of labels and predictions to the running totals.

    `sample_weight` is accepted for Keras API compatibility but is not used.
    """
    # Flatten both tensors so that labels of shape (batch,) and predictions of
    # shape (batch, 1) are multiplied element-wise instead of broadcasting to
    # a (batch, batch) matrix.
    ytrue = tf.reshape(tf.cast(ytrue, tf.float32), [-1])
    ypred = tf.reshape(tf.cast(ypred, tf.float32), [-1])

    # Binarize the predictions. The threshold is passed as a plain Python
    # number so an integer threshold is converted to float32 as well.
    ypred = tf.cast(tf.greater_equal(ypred, self.threshold), tf.float32)

    self.tp.assign_add(tf.reduce_sum(ytrue*ypred))
    self.predicted_positive.assign_add(tf.reduce_sum(ypred))
    self.actual_positive.assign_add(tf.reduce_sum(ytrue))

  def result(self):
    """Return F-beta computed from the totals accumulated so far."""
    self.precision = self.tp/(self.predicted_positive+self.epsilon)
    self.recall = self.tp/(self.actual_positive+self.epsilon)

    self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)

    return self.fb

  def reset_state(self):
    """Reset all running totals to zero."""
    self.tp.assign(0.)
    self.predicted_positive.assign(0.)
    self.actual_positive.assign(0.)

## CUSTOM CLASS FOR MACRO F1

In [None]:
class StatefullMultiClassFBeta(Metric):
  """Streaming per-class F-beta score with 'macro', 'weighted' or 'raw' averaging.

  Labels are expected as indicator columns with `n_class` entries per sample.
  Per-class totals of true positives, actual positives and predicted positives
  are accumulated over batches.

  Args:
    name: metric name reported by Keras.
    beta: weight of recall relative to precision; `beta=1` gives the F1 score.
    n_class: number of label/prediction columns.
    average: 'weighted' (weighted by actual positives per class), 'raw'
      (vector of per-class scores); any other value gives the unweighted mean.
    epsilon: small constant added to denominators to avoid division by zero.
    threshold: decision threshold, used only when `n_class == 1`.
    **kwargs: forwarded to the base `Metric` constructor.
  """

  def __init__(self, name='state_full_binary_fbeta_macro', beta=1, n_class=1, average='macro', epsilon=1e-7, threshold=0.5, **kwargs):
    super(StatefullMultiClassFBeta, self).__init__(name=name, **kwargs)

    # Per-class running totals
    self.tp = self.add_weight(name='tp', shape=(n_class,), initializer='zeros')
    self.actual_positives = self.add_weight(name='ap', shape=(n_class,), initializer='zeros')
    self.predicted_positives = self.add_weight(name='pp', shape=(n_class,), initializer='zeros')

    # Fixed configuration of this metric instance
    self.beta_squared = beta**2
    self.n_class = n_class
    self.average = average
    self.epsilon = epsilon
    self.threshold = threshold

  def update_state(self, ytrue, ypred, sample_weight=None):
    """Add one batch of labels and predictions to the per-class totals.

    `sample_weight` is accepted for Keras API compatibility but is not used.
    """
    # Bring both tensors to (samples, n_class) so the per-class sums below
    # always match the shape of the state variables.
    ytrue = tf.reshape(tf.cast(ytrue, tf.float32), [-1, self.n_class])
    ypred = tf.reshape(tf.cast(ypred, tf.float32), [-1, self.n_class])

    if self.n_class == 1:
      # A single column is always its own maximum, so an arg-max encoding would
      # mark every sample as positive; threshold the score instead.
      ypred = tf.cast(tf.greater_equal(ypred, self.threshold), tf.float32)
    else:
      # One-hot encode the arg-max class; on ties only the first maximum is kept,
      # so every sample contributes exactly one predicted class.
      ypred = tf.one_hot(tf.argmax(ypred, axis=-1), depth=self.n_class, dtype=tf.float32)

    self.tp.assign_add(tf.reduce_sum(ytrue*ypred, axis=0))
    self.predicted_positives.assign_add(tf.reduce_sum(ypred, axis=0))
    self.actual_positives.assign_add(tf.reduce_sum(ytrue, axis=0))

  def result(self):
    """Return the F-beta score aggregated according to `average`."""
    self.precision = self.tp/(self.predicted_positives+self.epsilon)
    self.recall = self.tp/(self.actual_positives+self.epsilon)

    self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)

    if self.average == 'weighted':
      # divide_no_nan yields 0 instead of NaN when no positives have been seen yet
      class_weights = tf.math.divide_no_nan(self.actual_positives, tf.reduce_sum(self.actual_positives))
      return tf.reduce_sum(self.fb*class_weights)

    elif self.average == 'raw':
      return self.fb

    return tf.reduce_mean(self.fb)

  def reset_state(self):
    """Reset all per-class totals to zero."""
    self.tp.assign(tf.zeros(self.n_class))
    self.predicted_positives.assign(tf.zeros(self.n_class))
    self.actual_positives.assign(tf.zeros(self.n_class))

## CUSTOM CLASS FOR WEIGHTED F1

In [None]:
class StatefullMultiClassFBetaWeighted(Metric):
  """Streaming per-class F-beta score that defaults to 'weighted' averaging.

  Labels are expected as indicator columns with `n_class` entries per sample.
  Per-class totals of true positives, actual positives and predicted positives
  are accumulated over batches.

  Args:
    name: metric name reported by Keras.
    beta: weight of recall relative to precision; `beta=1` gives the F1 score.
    n_class: number of label/prediction columns.
    average: 'weighted' (weighted by actual positives per class), 'raw'
      (vector of per-class scores); any other value gives the unweighted mean.
    epsilon: small constant added to denominators to avoid division by zero.
    threshold: decision threshold, used only when `n_class == 1`.
    **kwargs: forwarded to the base `Metric` constructor.
  """

  def __init__(self, name='state_full_binary_fbeta_weighted', beta=1, n_class=1, average='weighted', epsilon=1e-7, threshold=0.5, **kwargs):
    super(StatefullMultiClassFBetaWeighted, self).__init__(name=name, **kwargs)

    # Per-class running totals
    self.tp = self.add_weight(name='tp', shape=(n_class,), initializer='zeros')
    self.actual_positives = self.add_weight(name='ap', shape=(n_class,), initializer='zeros')
    self.predicted_positives = self.add_weight(name='pp', shape=(n_class,), initializer='zeros')

    # Fixed configuration of this metric instance
    self.beta_squared = beta**2
    self.n_class = n_class
    self.average = average
    self.epsilon = epsilon
    self.threshold = threshold

  def update_state(self, ytrue, ypred, sample_weight=None):
    """Add one batch of labels and predictions to the per-class totals.

    `sample_weight` is accepted for Keras API compatibility but is not used.
    """
    # Bring both tensors to (samples, n_class) so the per-class sums below
    # always match the shape of the state variables.
    ytrue = tf.reshape(tf.cast(ytrue, tf.float32), [-1, self.n_class])
    ypred = tf.reshape(tf.cast(ypred, tf.float32), [-1, self.n_class])

    if self.n_class == 1:
      # A single column is always its own maximum, so an arg-max encoding would
      # mark every sample as positive; threshold the score instead.
      ypred = tf.cast(tf.greater_equal(ypred, self.threshold), tf.float32)
    else:
      # One-hot encode the arg-max class; on ties only the first maximum is kept,
      # so every sample contributes exactly one predicted class.
      ypred = tf.one_hot(tf.argmax(ypred, axis=-1), depth=self.n_class, dtype=tf.float32)

    self.tp.assign_add(tf.reduce_sum(ytrue*ypred, axis=0))
    self.predicted_positives.assign_add(tf.reduce_sum(ypred, axis=0))
    self.actual_positives.assign_add(tf.reduce_sum(ytrue, axis=0))

  def result(self):
    """Return the F-beta score aggregated according to `average`."""
    self.precision = self.tp/(self.predicted_positives+self.epsilon)
    self.recall = self.tp/(self.actual_positives+self.epsilon)

    self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)

    if self.average == 'weighted':
      # divide_no_nan yields 0 instead of NaN when no positives have been seen yet
      class_weights = tf.math.divide_no_nan(self.actual_positives, tf.reduce_sum(self.actual_positives))
      return tf.reduce_sum(self.fb*class_weights)

    elif self.average == 'raw':
      return self.fb

    return tf.reduce_mean(self.fb)

  def reset_state(self):
    """Reset all per-class totals to zero."""
    self.tp.assign(tf.zeros(self.n_class))
    self.predicted_positives.assign(tf.zeros(self.n_class))
    self.actual_positives.assign(tf.zeros(self.n_class))

# MOUNT GOOGLE DRIVE IN COLAB FOR FILES

In [None]:
# Mount Google Drive in the Colab runtime; the CSV files read below live under /content/gdrive.

from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


# PREPARE STANDARD KOREAN DATA

In [None]:
# Load the standardized Korean splits used for every run.
# Each CSV is read with string columns, "text" is renamed to "comments",
# and "label" is converted to a numeric dtype.

# Korean train split
train_df_korean = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_train.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

# Korean dev/validation split
dev_df_korean = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_dev.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

# Korean test split
test_df_korean = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_test.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

print(test_df_korean.head())

                                            comments  label
0  팀으로 데뷔한거면 개인활동 했어도 N빵 해야지... 그게 팀을 위해서도 맞는거고~~...      0
1                  왕지혜 34살이지만 외모는 인정한다.여자라면 이정도는 되야지      0
2                           여자들이 80프로잉 악플 남자는 여자욕 안해      1
3                     설현이 떨고 있다... 아니겠지 아닐거야 그것만은 안돼      0
4                 다된 기생충 잔치에재 뿌린 방가방송과 안현모 다신 나서지 말자      1


## PRINT INFORMATION ABOUT STANDARD KOREAN DATA

In [None]:
# Report the shape and the first five rows of every standardized Korean split
korean_splits = (("train", train_df_korean), ("dev", dev_df_korean), ("test", test_df_korean))
for position, (split_name, split_df) in enumerate(korean_splits):
  # Only the very first line is printed without the leading blank lines
  lead = "" if position == 0 else "\n\n"
  print(f"{lead}Korean {split_name} data has shape", split_df.shape)
  print(f"\n\nSamples from Korean {split_name} data")
  print(split_df.head(5))

Korean train data has shape (5833, 2)


Samples from Korean train data
                                            comments  label
0                                    불쌍해 보이는 이윤 뭘까?~      0
1                                            독과점의 결과      0
2                                    별 시덥지않은 악플들은 모냐      0
3  사랑의 불시착 하는시간인데 이상한 노잼 드라마가 하고있다 ㅡㅡ 어딨는거냐 표치수~~~~~      1
4                          저밖에몰라..남은사람 어쩌라고. 참 이기적이네      0


Korean dev data has shape (729, 2)


Samples from Korean dev data
                                            comments  label
0  다 그러고 애낳고 키웠고 다 그러고 산다예전에 우리엄마들은 어떻게 애키우며 밭일하고...      1
1                                이제 별 감흥도 없는 애를 멀...      1
2  홍상수 김민희 좋아하는 감독도 배우도 아니지만 남여관계 모르는 거다. 이렇게 비난받...      0
3                                           이쁘게 컸네^^      0
4                        아니 저 금동현 자리에 진우가 있어야 한다며 ??      0


Korean test data has shape (730, 2)


Samples from Korean test data
                                            comments  label
0  팀으로 데뷔한거

## CREATE A METHOD TO CREATE DATASETS FOR THE PIPELINE TO THE MODELS

In [None]:
# This is a way to get train and validation data ready to be embedded

# A utility method to create a tf.data dataset from a pandas dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Convert a pandas DataFrame into a batched tf.data dataset of (features, label) pairs.

  Args:
    dataframe: frame with a 'label' column; all remaining columns are the features.
    shuffle: when true, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    The batched (and optionally shuffled) dataset. The caller's frame is left
    untouched because the function works on a copy.
  """
  features = dataframe.copy()
  targets = features.pop('label')
  dataset = tf.data.Dataset.from_tensor_slices((features, targets))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)


## CREATE DATASETS FOR KOREAN TRAIN, DEV, TEST

In [None]:
# Build the batched tf.data pipelines for the Korean splits.
# Only the training split is shuffled; dev and test keep the dataframe row order.
batch_size = 30
train_ds = df_to_dataset(train_df_korean, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split_df, shuffle=False, batch_size=batch_size)
    for split_df in (dev_df_korean, test_df_korean)
)


In [None]:
# Peek at the first batch of the Korean training dataset.
# The disabled variant below looks the features up by column name:
#for feature_batch, label_batch in train_ds.take(1):
  #print('Every feature:', list(feature_batch.keys()))
  #print('A batch of comments:', feature_batch['comments'])
  #print('A batch of targets:', label_batch )


# Active variant: print the feature tensor and the label tensor as they are
for feature_batch, label_batch in train_ds.take(1):
  print('A batch of comments: ', feature_batch)
  print('A batch of targets', label_batch)


A batch of comments:  tf.Tensor(
[[b'\xec\x8b\xac\xec\x9e\xa5\xeb\xa7\x88\xeb\xb9\x84\xeb\xa1\x9c \xea\xb8\xb0\xec\x82\xac \xec\x8d\xa8 \xeb\x8b\xac\xeb\x9d\xbc \xed\x96\x88\xea\xb2\xa0\xec\xa7\x80.\xec\x83\x9d\xed\x99\x9c\xea\xb3\xa0\xec\x97\x90 \xec\x8b\x9c\xeb\x8b\xac\xeb\xa0\xa4 \xeb\x82\x98\xec\x81\x9c \xec\x84\xa0\xed\x83\x9d\xed\x95\x9c \xea\xb2\x8c \xeb\xb6\x84\xeb\xaa\x85\xed\x95\x9c\xeb\x8d\xb0..\xe3\x85\x9c\xe3\x85\x9c\xec\x95\x94\xed\x8a\xbc \xec\x82\xbc\xea\xb0\x80 \xea\xb3\xa0\xec\x9d\xb8\xec\x9d\x98 \xeb\xaa\x85\xeb\xb3\xb5\xec\x9d\x84 \xeb\xb9\x95\xeb\x8b\x88\xeb\x8b\xa4.']
 [b'\xec\x9d\xb4\xeb\xaf\xbc\xed\x98\xb8 \xec\xb8\xa1\xea\xb7\xbc??\xec\x9d\xb4 \xea\xb8\xb0\xec\x82\xac\xeb\x8a\x94 \xeb\xad\x90\xeb\x83\x90?\xec\x9d\xb4\xeb\xaf\xbc\xed\x98\xb8 \xec\xb0\x8c\xec\xa7\x88\xeb\x82\xa8\xec\x9e\x84?\xec\x9d\xb4\xeb\xaf\xbc\xed\x98\xb8\xea\xb0\x80 \xed\x9d\x98\xeb\xa6\xb0\xea\xb1\xb0\xec\x95\xbc? \xec\xa0\x9c\xeb\xb0\x9c \xec\x88\x98\xec\xa7\x80\xec\xa2\x80 \xeb\x82\x98\x

In [None]:
# Count the batches in the Korean train dataset.
# cardinality() counts dataset elements, and every element of train_ds is one batch,
# so the printed value is the number of batches rather than a per-batch size.
ds_size = tf.data.experimental.cardinality(
    train_ds
)
print("Number of batches in Korean train data: ", ds_size)

Number of tensors per batch for Korean train data:  tf.Tensor(195, shape=(), dtype=int64)


In [None]:
# Count the batches in the Korean dev/val and test datasets.
# cardinality() counts dataset elements, and every element is one batch,
# so the printed values are numbers of batches rather than per-batch sizes.

ds_size_dev = tf.data.experimental.cardinality(
    val_ds
)
print("Number of batches in Korean validation data: ", ds_size_dev)

# test dataset
ds_size_test = tf.data.experimental.cardinality(
    test_ds
)
print("Number of batches in Korean test data: ", ds_size_test)

Number of tensors per batch for Korean validation data:  tf.Tensor(25, shape=(), dtype=int64)
Number of tensors per batch for Korean test data:  tf.Tensor(25, shape=(), dtype=int64)


# PREPARE STANDARD HINDI DATA

## TAKE STANDARDIZED HINDI DATA

In [None]:
# Load the standardized Hindi splits used for every run.
# Each CSV is read with string columns, "text" is renamed to "comments",
# and "label" is converted to a numeric dtype.

# Hindi train split
train_df_hindi = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_train.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

# Hindi dev/validation split
dev_df_hindi = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_dev.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

# Hindi test split
test_df_hindi = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_test.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

print(test_df_hindi.head())


                                            comments  label
0  @zishanAliRJD @iAnantSingh_ *ओसामा साहब ने सिर...      0
1  @China_Amb_India @narendramodi I am shocked th...      0
2  कल से 18 से ऊपर वालो को हवा की वैक्सीन लगेगी.....      0
3  इधर की बात उधर करने में \nआज भी जीमेल  से आगे ...      0
4  ☯️ मोदी जी ➡️ सोंगंद मुझे इस मिट्टी की मै देश ...      1


## PRINT INFORMATION ABOUT STANDARD HINDI DATA

In [None]:
# Report the shape and the first five rows of every standardized Hindi split
hindi_splits = (("train", train_df_hindi), ("dev", dev_df_hindi), ("test", test_df_hindi))
for position, (split_name, split_df) in enumerate(hindi_splits):
  # Only the very first line is printed without the leading blank lines
  lead = "" if position == 0 else "\n\n"
  print(f"{lead}Hindi {split_name} data has shape", split_df.shape)
  print(f"\n\nSamples from Hindi {split_name} data")
  print(split_df.head(5))

Hindi train data has shape (4235, 2)


Samples from Hindi train data
                                            comments  label
0  RT @RishiPrasadOrg: ईश्वर किसीको प्राप्त नहीं ...      0
1            @BBCHindi इसे कहते हैं खतरो का खिलाड़ी।      0
2      ७० साल जतन के ७ साल पतन के  #IndiaCovidCrisis      0
3  RT @AagayiNavya: चंपारण में एक चीनी मिल बंद पड...      1
4  सिर्फ़ सरकारें बदलती हैं हालात नहीं बदलते l \n...      1


Hindi dev data has shape (529, 2)


Samples from Hindi dev data
                                            comments  label
0  #ModiKaVaccineJumla जुमला वाला प्रधानमंत्री,  ...      1
1                    Mar javunga aaj me khushi me 😍😍      0
2  RT @Sonam_Mumbaikar: तेरे इश्क़ में हद से गुजर...      0
3    @Physicsgaurav Ye sale pde likhe chutiya hai 😜🤘      1
4  @RachnaSinghSP अरे चाची हो चाची..  जब सैफई में...      1


Hindi test data has shape (530, 2)


Samples from Hindi test data
                                            comments  label
0  @zishanAliRJD 

## CREATE DATASETS FOR HINDI TRAIN, DEV, TEST

In [None]:
# Batched tf.data pipelines for the Hindi train and dev/val splits.
# The shuffle flag is spelled out for both: train is shuffled, dev/val is not.
batch_size = 30
train_ds_hindi = df_to_dataset(dataframe=train_df_hindi, shuffle=True, batch_size=batch_size)
val_ds_hindi = df_to_dataset(dataframe=dev_df_hindi, shuffle=False, batch_size=batch_size)

In [None]:
# Batched, unshuffled tf.data pipeline for the Hindi test split
batch_size = 30
test_ds_hindi = df_to_dataset(dataframe=test_df_hindi, shuffle=False, batch_size=batch_size)

# PREPARE STANDARD ENGLISH HATE DATA. THIS WILL BE USED FOR ZERO-SHOT MODELS.


In [None]:
# Load the standardized English train and dev/val splits.
# Each CSV is read with string columns, "text" is renamed to "comments",
# and "label" is converted to a numeric dtype.

# English train split
train_df_english = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_english_data/english_train.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

# English dev/validation split
dev_df_english = (
    pd.read_csv(
        '/content/gdrive/My Drive/266_datasets/standard_english_data/english_dev.csv',
        sep=',',
        dtype={'text': str, 'label': str},
    )
    .rename(columns={"text": "comments"})
    .assign(label=lambda frame: pd.to_numeric(frame["label"]))
)

print(train_df_english.head())
print(dev_df_english.head())

                                            comments  label
0  The user doing this is happy to see the gutted...      0
1  That page was listed a couple of days ago, but...      0
2                      I saw that, thanks. –  (talk)      0
3                         Fuck you you bitch ass ho.      1
4  I really hate you \n\nAnd I want to do nasty t...      1
                                            comments  label
0  Mortgage lending \n\nIs this the Commerce Bank...      0
1  Restore Irish E-Sports page \n\nBased  prior d...      0
2             Sorry, I thought no one would read it.      0
3  You have made five edits today, all were silly...      1
4  "\n\nImage copyright problem with Image:AUSEPa...      0


In [None]:
# Batched tf.data pipelines for the English train and dev/val splits.
# The shuffle flag is spelled out for both: train is shuffled, dev/val is not.
batch_size = 30
train_ds_english = df_to_dataset(dataframe=train_df_english, shuffle=True, batch_size=batch_size)
val_ds_english = df_to_dataset(dataframe=dev_df_english, shuffle=False, batch_size=batch_size)


In [None]:
# Print the first batch of the English training dataset (features, then labels)
# to see what the model will receive.
for feature_batch_english, label_batch_english in train_ds_english.take(1):
  print('A batch of comments: ', feature_batch_english)
  print('A batch of targets', label_batch_english)

A batch of comments:  tf.Tensor(
[[b'"\n\n Please do not vandalize pages, as you did with this edit to Mo. If you continue to do so, you will be blocked from editing.    "']
 [b'YOU PROVE MY POINT \n\nIF YOU BLOCK ME, YOU WOULD ONLY MAKE ME RIGHT. ALL I EVER ASKED WAS FOR YOU TO LEAVE ME ALONE, SO GO AWAY OR BLOCK ME SHIT BAG, IM TIRED OF THIS DISCUSSION ALREADY.']
 [b'"You have new messages (last change).\n"']
 [b'"\nLol, okay so that explains it. Thanks. ) a.k.a.Depu\xc2\xa0Joseph\xc2\xa0|TALK"']
 [b"How pathetic Dencord at you want to go around bullying just cause you can't get your own way. Grow up."]
 [b'"\n\n listen up, you narrow-eyed teacher living on minimum wage \n\nyou can keep on doing your ""buck buck"" chicken trash talk, or perhaps you might wanna consider moving back to lemuria - the land where your kind of scum roams."']
 [b'Hey asshole, nice try with the speedy deletion.']
 [b'"\n\n To whom it may concern \n\nOh go ahead and revert it.  Go ahead and increase the block

# GET POINTERS TO PRE-TRAINED LaBSE

In [None]:
# TF Hub handles for the pretrained LaBSE encoder and its text preprocessor

# LaBSE encoder, version 2

tfhub_handle_encoder = "https://tfhub.dev/google/LaBSE/2"

# Preprocessing model; build_lab feeds its output into the encoder

tfhub_handle_preprocess = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"


# CREATE A FUNCTION SO CAN SAVE SETTINGS AND RESULTS HISTORY

In [None]:
#Function to add a record to a json file

def append_record(record, path='/content/gdrive/MyDrive/266_datasets/settings/settings_history.txt'):
    """Append `record` to the history file as one JSON document per line.

    Args:
        record: JSON-serializable object (in this notebook: dicts holding
            hyperparameter settings and metric histories).
        path: file to append to; defaults to the shared settings history on Drive.

    Raises:
        TypeError: if `record` contains a value that `json` cannot serialize.
            Nothing is written to the file in that case.
    """
    # Serialize before opening the file so a record that fails to serialize
    # never leaves a truncated line behind.
    line = json.dumps(record)

    # Text mode already translates "\n" to the platform line ending; writing
    # os.linesep here would produce "\r\r\n" on Windows.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + "\n")

# BUILD LaBSE ONLY MODEL

## CREATE INSTANCE OF LaBSE ONLY MODEL FOR KOREAN DATA

In [None]:
# F-beta metric instances for the Korean model, all created with their default
# arguments (so both multi-class metrics use n_class=1).
statefull_binary_fbeta_korean_lab = StatefullBinaryFBeta() 
statefull_multi_class_fbeta_korean_lab = StatefullMultiClassFBeta()

statefull_multi_class_fbeta_korean_lab_weighted = StatefullMultiClassFBetaWeighted()

In [None]:
# Use this method to build the model with just LaBSE

def build_lab(lr, first_layer, second_layer, first_drop, second_drop, third_drop, first_metric, second_metric, third_metric):
  """Build and compile the LaBSE-only binary classifier.

  Raw text is preprocessed and encoded by the frozen TF Hub LaBSE encoder. The
  pooled output then passes through two (dropout -> dense/ReLU -> layer
  normalization) stages, a final dropout and a single sigmoid unit.

  Args:
    lr: learning rate for the Adam optimizer.
    first_layer: number of units in the first dense layer.
    second_layer: number of units in the second dense layer.
    first_drop: dropout rate applied to the pooled encoder output.
    second_drop: dropout rate applied after the first dense stage.
    third_drop: dropout rate applied after the second dense stage.
    first_metric: first metric passed to `compile`.
    second_metric: second metric passed to `compile`.
    third_metric: third metric passed to `compile`.

  Returns:
    A `(model, settings)` tuple: the compiled Keras model and a dict with the
    hyperparameters that were used, ready to be logged.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name= 'comments')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name= 'preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  # The encoder weights stay frozen; only the classification head is trained
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='LaBSE_encoder')

  # The encoder returns all of its outputs as a dictionary
  outputs = encoder(encoder_inputs)

  # The pooled output represents the entire example that was passed in
  net = outputs['pooled_output']

  # L2 normalization of the embedding is deliberately not applied here:
  # normalized_sentence_representation = tf.nn.l2_normalize(net, axis=-1)

  # First stage: dropout -> dense/ReLU -> layer normalization
  net = tf.keras.layers.Dropout(first_drop)(net)
  net = tf.keras.layers.Dense(first_layer, activation='relu', kernel_initializer='he_uniform')(net)
  net = tf.keras.layers.LayerNormalization()(net)

  # Second stage: dropout -> dense/ReLU -> layer normalization
  net = tf.keras.layers.Dropout(second_drop)(net)
  net = tf.keras.layers.Dense(second_layer, activation='relu')(net)
  net = tf.keras.layers.LayerNormalization()(net)

  # Final dropout and a single sigmoid unit for the binary decision
  net = tf.keras.layers.Dropout(third_drop)(net)
  net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)

  labse_model = tf.keras.Model(text_input, net)

  # The classifier already outputs probabilities, hence from_logits=False
  labse_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                      optimizer=tf.keras.optimizers.Adam(lr),
                      metrics=[first_metric, second_metric, third_metric])

  # Hyperparameters that were used; every key follows the same snake_case
  # pattern ('third_drop' was previously written as 'third drop').
  settings = {'lr': lr,
              'first_layer_neurons': first_layer,
              'second_layer_neurons': second_layer,
              'first_drop': first_drop,
              'second_drop': second_drop,
              'third_drop': third_drop,
              }
  return labse_model, settings


##  CALLBACK FOR EARLY STOP

In [None]:
# Early stopping on the validation value of the metric named 'state_full_binary_fbeta'
# (the default name of StatefullBinaryFBeta); higher is better, patience is 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_state_full_binary_fbeta', mode="max", patience=3)

# LaBSE ONLY KOREAN MODEL INSTANTIATE

In [None]:
# Build the Korean LaBSE-only model with build_lab.
# Positional arguments: learning rate, first dense units, second dense units,
# first dropout rate, second dropout rate, third dropout rate
#args = (.001, 200, 128,  0.1, 0.2, 0) # These settings work better
args = (.001, 256, 128,  0, 0, 0) # These values are like the control mBERT settings.

# Metric instances handed to build_lab as keyword arguments
kwargs = {"first_metric": statefull_binary_fbeta_korean_lab, "second_metric": statefull_multi_class_fbeta_korean_lab , "third_metric": statefull_multi_class_fbeta_korean_lab_weighted}

# build_lab returns the compiled model and the settings dict it was built with
model_korean, settings = build_lab(*args, **kwargs)

## FIT  LaBSE ONLY MODEL FOR KOREAN TRAIN DATA

In [None]:
# NOTE(review): Model.reset_states() presumably only affects stateful layers, and
# build_lab adds none -- confirm whether this call is still needed.
model_korean.reset_states()

In [None]:
# Fit the LaBSE-only model on the Korean train split, validating on the dev split.
# NOTE(review): train_ds and val_ds are already batched by df_to_dataset, so the
# batch_size argument passed to fit() looks redundant -- confirm.
history_korean_lab = model_korean.fit(x=train_ds,validation_data=val_ds,
                               epochs=15, batch_size=30, callbacks=[callback])

# Log the hyperparameters and the per-epoch history of this run.
# NOTE(review): the record key is spelled 'setings'; anything reading the log must use that spelling.
append_record({'language':"korean", 'setings':settings, 'history':history_korean_lab.history})

Epoch 1/15

  m.reset_state()
  m.reset_state()
  m.reset_state()


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15


## MAKE SURE NO OVERFITTING ON KOREAN DATA

In [None]:
# Look at a few predictions on the training split as a sanity check.
# train_ds is shuffled, so predictions made from it cannot be lined up with the
# rows of train_df_korean; predict on an unshuffled dataset of the same frame instead.
train_ds_ordered = df_to_dataset(train_df_korean, shuffle=False, batch_size=batch_size)
korean_lab_preds = model_korean.predict(train_ds_ordered)


In [None]:
# Wrap the training-split predictions in a one-column DataFrame for display
korean_lab_preds_df = pd.DataFrame(
    data=korean_lab_preds,
    columns=['predicted_train_vals'],
)

In [None]:
# Show the first 20 predicted values for the training split
print(korean_lab_preds_df.head(20))

    predicted_train_vals
0               0.018118
1               0.082727
2               0.871191
3               0.986051
4               0.917859
5               0.008914
6               0.002901
7               0.949750
8               0.999072
9               0.977956
10              0.953331
11              0.976660
12              0.517538
13              0.981117
14              0.016906
15              0.137702
16              0.061911
17              0.999131
18              0.004190
19              0.004186


## EVAL LaBSE ONLY MODEL ON KOREAN TEST DATA

In [None]:
# Score the Korean-trained model on the held-out Korean test split.
# The options that are no longer listed were all being passed at their Keras default values.
history_korean_lab_test = model_korean.evaluate(x=test_ds, verbose=1, return_dict=False)

# Log the evaluate() result (a plain list because return_dict=False)
append_record({'language':"korean_test", 'history':history_korean_lab_test})

  m.reset_state()
  m.reset_state()
  m.reset_state()




## CREATE A FILE FOR KOREAN TEST DATA THAT HAS ALL THE PREDICTIONS

In [None]:
# Predict on the Korean test split (built with shuffle=False) and keep the scores in a DataFrame.
# NOTE(review): test_ds is already batched by df_to_dataset, so batch_size here looks redundant -- confirm.
korean_lab_preds_test = model_korean.predict(test_ds, batch_size = 10)
korean_lab_preds_test_df = pd.DataFrame(korean_lab_preds_test, columns=['predicted_test_vals'])

# Save the predictions next to the Korean data on Drive
korean_lab_preds_test_df.to_csv('/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_lab_predictions_two.csv')

# FIT A LaBSE ONLY MODEL WITH HINDI

In [None]:
# F-beta metric instances for the Hindi model, all created with their default
# arguments (so both multi-class metrics use n_class=1).

statefull_binary_fbeta_hindi_lab = StatefullBinaryFBeta() 

statefull_multi_class_fbeta_hindi_lab = StatefullMultiClassFBeta()

statefull_multi_class_fbeta_hindi_lab_weighted = StatefullMultiClassFBetaWeighted()

In [None]:
# Build the LaBSE-only model that will be trained on the Hindi data


# Positional arguments for build_lab: learning rate, first dense units,
# second dense units, first/second/third dropout rates
#args = (.001, 256, 128,  0, 0, 0)
args = (.001, 200, 128,  0.1, 0.2, 0)

# Metric instances handed to build_lab as keyword arguments
kwargs = {"first_metric": statefull_binary_fbeta_hindi_lab, "second_metric": statefull_multi_class_fbeta_hindi_lab , "third_metric": statefull_multi_class_fbeta_hindi_lab_weighted}
model_hindi, settings = build_lab(*args, **kwargs)

In [None]:
# NOTE(review): Model.reset_states() presumably only affects stateful layers, and
# build_lab adds none -- confirm whether this call is still needed.
model_hindi.reset_states()

In [None]:
# Fit the LaBSE-only model on the Hindi train split, validating on the dev split.
# NOTE(review): both datasets are already batched by df_to_dataset, so the
# batch_size argument passed to fit() looks redundant -- confirm.

history_hindi_lab = model_hindi.fit(x=train_ds_hindi,validation_data=val_ds_hindi,
                               epochs=15, batch_size=30, callbacks=[callback])

# Log the hyperparameters and the per-epoch history of this run.
# NOTE(review): the record key is spelled 'setings'; anything reading the log must use that spelling.
append_record({'language':"hindi", 'setings':settings, 'history':history_hindi_lab.history})

Epoch 1/15

  m.reset_state()
  m.reset_state()
  m.reset_state()


Epoch 2/15
Epoch 3/15
Epoch 4/15


In [None]:
# Score the Hindi-trained model on the held-out Hindi test split.
# The options that are no longer listed were all being passed at their Keras default values.
history_hindi_lab_test = model_hindi.evaluate(x=test_ds_hindi, verbose=1, return_dict=False)

# Log the evaluate() result (a plain list because return_dict=False)
append_record({'language':"hindi_test", 'history':history_hindi_lab_test})

  m.reset_state()
  m.reset_state()
  m.reset_state()




In [None]:
# Predict on the Hindi test split (built with shuffle=False) and keep the scores in a DataFrame.
# NOTE(review): test_ds_hindi is already batched by df_to_dataset, so batch_size here looks redundant -- confirm.
hindi_lab_preds_test = model_hindi.predict(test_ds_hindi, batch_size = 10)
hindi_lab_preds_test_df = pd.DataFrame(hindi_lab_preds_test, columns=['predicted_test_vals'])

# Save the predictions next to the Hindi data on Drive
hindi_lab_preds_test_df.to_csv('/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_lab_predictions_two.csv')

# DO ZERO SHOT LEARNING WITH LaBSE ONLY MODEL USING ENGLISH TO TRAIN AND KOREAN TO EVAL

In [None]:
# F-beta metric instances for the English-trained model, all created with their
# default arguments (so both multi-class metrics use n_class=1).
statefull_binary_fbeta_english_lab = StatefullBinaryFBeta() 
statefull_multi_class_fbeta_english_lab = StatefullMultiClassFBeta()

statefull_multi_class_fbeta_english_lab_weighted = StatefullMultiClassFBetaWeighted()

In [None]:
# Build the English LaBSE-only model with build_lab.
# Positional arguments: learning rate, first dense units, second dense units,
# first dropout rate, second dropout rate, third dropout rate
args = (.001, 200, 128,  0.1, 0.2, 0) # These settings work better
#args = (.001, 256, 128,  0, 0, 0) # These values are like the control mBERT settings.

# Metric instances handed to build_lab as keyword arguments
kwargs = {"first_metric": statefull_binary_fbeta_english_lab, "second_metric": statefull_multi_class_fbeta_english_lab , "third_metric": statefull_multi_class_fbeta_english_lab_weighted}

# build_lab returns the compiled model and the settings dict it was built with
model_english, settings = build_lab(*args, **kwargs)

KeyboardInterrupt: ignored

## FIT A MODEL ON ENGLISH DATA

In [None]:
# Call Keras reset_states() on the freshly built English model before fitting.
# NOTE(review): this presumably only clears the state of stateful layers, not
# the trained weights — confirm it is needed here.
model_english.reset_states()


In [None]:
# Train the English LaBSE model, validating on the English validation data.
# NOTE(review): Keras docs say not to pass batch_size when x is a
# tf.data.Dataset — confirm how train_ds_english is built.
history_english_lab = model_english.fit(
    x=train_ds_english,
    validation_data=val_ds_english,
    epochs=15,
    batch_size=30,
    callbacks=[callback],
)

# Log the build settings and the per-epoch history.
# NOTE(review): the key is spelled 'setings' across the notebook; kept as-is so
# records stay consistent.
append_record({
    'language':"english",
    'setings':settings,
    'history':history_english_lab.history,
})

## EVALUATE KOREAN TEST DATA ON ENGLISH TRAINED LaBSE

In [None]:
# Zero-shot: the model was trained on English and is evaluated on the Korean test data.
# Only the arguments that matter are passed: everything that was dropped was a
# Keras default, and `max_queue_size`, `workers` and `use_multiprocessing` are
# no longer accepted by Keras 3.
history_eng_korean_lab_test = model_english.evaluate(
    x=test_ds,
    verbose=1,
    return_dict=False,  # keep the plain list of values that is logged below
)

# Label fixed from the misspelled "engligh_korean_test" so it matches the
# "english_korean_test" label used for the LaBSE + CNN zero-shot run.
append_record({'language':"english_korean_test", 'history':history_eng_korean_lab_test})

In [None]:
# Zero-shot predictions: English-trained LaBSE model scored on the Korean test data.
# NOTE(review): Keras docs say not to pass batch_size when the input is a
# tf.data.Dataset — confirm how test_ds is built.
korean_eng_lab_preds_test = model_english.predict(x=test_ds, batch_size=10)

# Single-column frame of raw model outputs, written to Drive with its row index
korean_eng_lab_preds_test_df = pd.DataFrame(data=korean_eng_lab_preds_test, columns=['predicted_test_vals'])
korean_eng_lab_preds_test_df.to_csv(path_or_buf='/content/gdrive/My Drive/266_datasets/standard_english_data/korean_eng_lab_predictions_two.csv')


## EVALUATE HINDI TEST ON ENGLISH TRAINED LaBSE

In [None]:
# Zero-shot: the model was trained on English and is evaluated on the Hindi test data.
# Only the arguments that matter are passed: everything that was dropped was a
# Keras default, and `max_queue_size`, `workers` and `use_multiprocessing` are
# no longer accepted by Keras 3.
history_eng_hindi_lab_test = model_english.evaluate(
    x=test_ds_hindi,
    verbose=1,
    return_dict=False,  # keep the plain list of values that is logged below
)

# Log the values returned by evaluate() under the "english_hindi_test" label
append_record({'language':"english_hindi_test", 'history':history_eng_hindi_lab_test})

In [None]:
# Zero-shot predictions: English-trained LaBSE model scored on the Hindi test data.
# NOTE(review): Keras docs say not to pass batch_size when the input is a
# tf.data.Dataset — confirm how test_ds_hindi is built.
hindi_eng_lab_preds_test = model_english.predict(x=test_ds_hindi, batch_size=10)

# Single-column frame of raw model outputs, written to Drive with its row index
hindi_eng_lab_preds_test_df = pd.DataFrame(data=hindi_eng_lab_preds_test, columns=['predicted_test_vals'])
hindi_eng_lab_preds_test_df.to_csv(path_or_buf='/content/gdrive/My Drive/266_datasets/standard_english_data/hindi_eng_lab_predictions_two.csv')

In [None]:
# Show the first 45 rows of the Hindi test frame as plain text
print(test_df_hindi.head(45))

# DEFINE A MODEL WITH LaBSE AND CNN

## USE CUSTOM F1 METRIC CLASS ON LaBSE WITH CNN KOREAN MODEL

In [None]:
# Instantiate the three custom F-beta metric classes for the Korean
# LaBSE + CNN model; they are handed to build_lab_cnn as compile metrics.
(
    statefull_binary_fbeta_korean_lab_cnn,
    statefull_multi_class_fbeta_korean_lab_cnn,
    statefull_multi_class_fbeta_korean_lab_cnn_weighted,
) = (
    StatefullBinaryFBeta(),
    StatefullMultiClassFBeta(),
    StatefullMultiClassFBetaWeighted(),
)


In [None]:
# Using a model with LaBSE with CNN on top instead

#num_filters = [32, 32, 32, 32, 32]
#kernel_sizes = [1, 2, 3, 4,5]
#dense_layer_dims = [10, 4]

#num_classes = len(ds.target_names)

def build_lab_cnn(lr, first_layer, second_layer, first_drop, num_filters, kernel_sizes, first_metric, second_metric, third_metric):
    """Build and compile a binary classifier: frozen LaBSE encoder with a CNN head.

    The string input goes through the TF-Hub preprocessing layer and the
    encoder (loaded with ``trainable=False``). The encoder's
    ``sequence_output`` feeds one ``Conv1D`` + ``GlobalMaxPooling1D`` branch
    per (kernel size, filter count) pair; the pooled branches are
    concatenated, passed through dropout and a single sigmoid unit.

    Args:
        lr: Learning rate for the Adam optimizer.
        first_layer: Not used by this architecture; only recorded in the
            returned settings.
        second_layer: Not used by this architecture; only recorded in the
            returned settings.
        first_drop: Dropout rate applied to the concatenated CNN features.
        num_filters: Filter count for each convolution branch.
        kernel_sizes: Kernel size for each convolution branch; must have the
            same length as ``num_filters``.
        first_metric: Metric passed to ``compile``.
        second_metric: Metric passed to ``compile``.
        third_metric: Metric passed to ``compile``.

    Returns:
        A ``(model, settings)`` tuple: the compiled Keras model and a dict of
        the hyperparameters it was built with.

    Raises:
        ValueError: If ``num_filters`` and ``kernel_sizes`` differ in length
            (``zip`` would otherwise silently drop the extra entries while the
            settings dict still logged the full lists).
    """
    if len(kernel_sizes) != len(num_filters):
        raise ValueError(
            "kernel_sizes and num_filters must have the same length, got "
            f"{len(kernel_sizes)} and {len(num_filters)}"
        )

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name= 'comments')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name= 'preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='LaBSE_encoder')

    # The encoder returns all of its outputs as a dictionary
    outputs = encoder(encoder_inputs)

    # sequence_output is the contextual embedding of each token
    net = outputs['sequence_output']

    # One convolution branch per kernel size, each max-pooled over the sequence
    conv_layers_for_all_kernel_sizes = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        conv_layer = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(net)
        conv_layer = keras.layers.GlobalMaxPooling1D()(conv_layer)
        conv_layers_for_all_kernel_sizes.append(conv_layer)

    # Concatenate the pooled feature maps of all branches into one vector
    h = keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)

    h = keras.layers.Dropout(rate=first_drop)(h)

    prediction = keras.layers.Dense(1, activation='sigmoid')(h)

    labse_cnn_model = tf.keras.Model(inputs=text_input, outputs=prediction)

    # from_logits=False because the model already ends in a sigmoid.
    # `learning_rate` replaces the deprecated `lr` alias, which triggers a
    # warning in TF 2.x and is rejected by Keras 3.
    labse_cnn_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                            metrics=[first_metric, second_metric, third_metric])

    settings_lab_cnn = {'lr':lr,
                        'first_layer_neurons': first_layer,
                        'second_layer_neurons': second_layer,
                        'first_drop': first_drop,
                        'num_filters': num_filters,
                        'kernel_sizes': kernel_sizes
                       }

    return labse_cnn_model, settings_lab_cnn


## CREATE INSTANCE OF LaBSE WITH CNN MODEL FOR KOREAN DATA

In [None]:
# Build the Korean LaBSE + CNN model with build_lab_cnn.

# One convolution branch per (kernel size, filter count) pair
kernel_sizes = [1, 2, 3, 4,5]
num_filters = [32, 32, 32, 32, 32]

# Positional arguments of build_lab_cnn: lr, first_layer, second_layer, first_drop
args = (0.00002, 0,0,0.7)

# Keyword arguments: the CNN layout plus the three metric objects to compile with
kwargs = {
    "num_filters": num_filters,
    "kernel_sizes": kernel_sizes,
    "first_metric": statefull_binary_fbeta_korean_lab_cnn,
    "second_metric": statefull_multi_class_fbeta_korean_lab_cnn,
    "third_metric": statefull_multi_class_fbeta_korean_lab_cnn_weighted,
}

# build_lab_cnn hands back the compiled model and the settings dict to log
model_korean_two, settings_lab_cnn = build_lab_cnn(*args, **kwargs)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Print the layer-by-layer summary of the Korean LaBSE + CNN model
model_korean_two.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 comments (InputLayer)          [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_type_ids':   0           ['comments[0][0]']               
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

## FIT LaBSE WITH CNN KOREAN MODEL

In [None]:
# Train the Korean LaBSE + CNN model, validating on the Korean validation data.
# NOTE(review): reset_states() presumably only clears stateful-layer state, and
# Keras docs say not to pass batch_size when x is a tf.data.Dataset — confirm both.
model_korean_two.reset_states()
history_lab_cnn_korean = model_korean_two.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=15,
    batch_size=30,
    callbacks=[callback],
)

# Log the build settings and the per-epoch history.
# NOTE(review): the key is spelled 'setings' across the notebook; kept as-is so
# records stay consistent.
append_record({
    'language':"korean",
    'setings':settings_lab_cnn,
    'history':history_lab_cnn_korean.history,
})

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## EVAL KOREAN TEST DATA ON LaBSE WITH CNN MODEL 

In [None]:
# Evaluate the Korean LaBSE + CNN model on the Korean test data.
# Only the arguments that matter are passed: everything that was dropped was a
# Keras default, and `max_queue_size`, `workers` and `use_multiprocessing` are
# no longer accepted by Keras 3.
history_korean_lab_cnn_test = model_korean_two.evaluate(
    x=test_ds,
    verbose=1,
    return_dict=False,  # keep the plain list of values that is logged below
)

# Log the values returned by evaluate() under the "korean_test" label
append_record({'language':"korean_test", 'history':history_korean_lab_cnn_test})



In [None]:
# Score the Korean test data with the Korean LaBSE + CNN model.
# NOTE(review): Keras docs say not to pass batch_size when the input is a
# tf.data.Dataset — confirm how test_ds is built.
korean_lab_cnn_preds_test = model_korean_two.predict(x=test_ds, batch_size=10)

# Single-column frame of sigmoid outputs; the CSV (row index included) is read
# back by the hand-checked F1 cell for the Korean LaBSE + CNN model.
korean_lab_cnn_preds_test_df = pd.DataFrame(data=korean_lab_cnn_preds_test, columns=['predicted_test_vals'])
korean_lab_cnn_preds_test_df.to_csv(path_or_buf='/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_lab_cnn_predictions.csv')

# BUILD A LaBSE PLUS CNN FOR HINDI

## USE CUSTOM F1 METRIC CLASS ON LaBSE WITH CNN HINDI MODEL

In [None]:
# Instantiate the three custom F-beta metric classes for the Hindi
# LaBSE + CNN model; they are handed to build_lab_cnn as compile metrics.
(
    statefull_binary_fbeta_hindi_lab_cnn,
    statefull_multi_class_fbeta_hindi_lab_cnn,
    statefull_multi_class_fbeta_hindi_lab_cnn_weighted,
) = (
    StatefullBinaryFBeta(),
    StatefullMultiClassFBeta(),
    StatefullMultiClassFBetaWeighted(),
)

In [None]:

# Build the Hindi LaBSE + CNN model with build_lab_cnn.

# One convolution branch per (kernel size, filter count) pair
kernel_sizes = [1, 2, 3, 4,5]
num_filters = [32, 32, 32, 32, 32]

# Positional arguments of build_lab_cnn: lr, first_layer, second_layer, first_drop
args = (0.00002, 0,0,0.7)

# Keyword arguments: the CNN layout plus the three metric objects to compile with
kwargs = {
    "num_filters": num_filters,
    "kernel_sizes": kernel_sizes,
    "first_metric": statefull_binary_fbeta_hindi_lab_cnn,
    "second_metric": statefull_multi_class_fbeta_hindi_lab_cnn,
    "third_metric": statefull_multi_class_fbeta_hindi_lab_cnn_weighted,
}

# build_lab_cnn hands back the compiled model and the settings dict to log
model_hindi_two, settings_lab_cnn = build_lab_cnn(*args, **kwargs)




  super(Adam, self).__init__(name, **kwargs)


## FIT LaBSE WITH CNN FOR HINDI

In [None]:
# Train the Hindi LaBSE + CNN model, validating on the Hindi validation data.
# NOTE(review): reset_states() presumably only clears stateful-layer state, and
# Keras docs say not to pass batch_size when x is a tf.data.Dataset — confirm both.
model_hindi_two.reset_states()
history_lab_cnn_hindi = model_hindi_two.fit(
    x=train_ds_hindi,
    validation_data=val_ds_hindi,
    epochs=15,
    batch_size=30,
    callbacks=[callback],
)

# Log the build settings and the per-epoch history.
# NOTE(review): the key is spelled 'setings' across the notebook; kept as-is so
# records stay consistent.
append_record({
    'language':"hindi",
    'setings':settings_lab_cnn,
    'history':history_lab_cnn_hindi.history,
})

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15


In [None]:
# Evaluate the Hindi LaBSE + CNN model on the Hindi test data.
# Only the arguments that matter are passed: everything that was dropped was a
# Keras default, and `max_queue_size`, `workers` and `use_multiprocessing` are
# no longer accepted by Keras 3.
history_hindi_lab_cnn_test = model_hindi_two.evaluate(
    x=test_ds_hindi,
    verbose=1,
    return_dict=False,  # keep the plain list of values that is logged below
)

# Log the values returned by evaluate() under the "hindi_test" label
append_record({'language':"hindi_test", 'history':history_hindi_lab_cnn_test})



In [None]:
# Score the Hindi test data with the Hindi LaBSE + CNN model.
# NOTE(review): Keras docs say not to pass batch_size when the input is a
# tf.data.Dataset — confirm how test_ds_hindi is built.
hindi_lab_cnn_preds_test = model_hindi_two.predict(x=test_ds_hindi, batch_size=10)

# Single-column frame of sigmoid outputs; the CSV (row index included) is read
# back by the hand-checked F1 cell for the Hindi LaBSE + CNN model.
hindi_lab_cnn_preds_test_df = pd.DataFrame(data=hindi_lab_cnn_preds_test, columns=['predicted_test_vals'])
hindi_lab_cnn_preds_test_df.to_csv(path_or_buf='/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_lab_cnn_predictions.csv')

# DO ZERO SHOT LEARNING ON LaBSE WITH CNN MODEL

In [None]:
# Instantiate the three custom F-beta metric classes for the English
# LaBSE + CNN model; they are handed to build_lab_cnn as compile metrics.
(
    statefull_binary_fbeta_english_lab_cnn,
    statefull_multi_class_fbeta_english_lab_cnn,
    statefull_multi_class_fbeta_english_lab_cnn_weighted,
) = (
    StatefullBinaryFBeta(),
    StatefullMultiClassFBeta(),
    StatefullMultiClassFBetaWeighted(),
)

In [None]:
# Build the English LaBSE + CNN model with build_lab_cnn.

# One convolution branch per (kernel size, filter count) pair
kernel_sizes = [1, 2, 3, 4,5]
num_filters = [32, 32, 32, 32, 32]

# Positional arguments of build_lab_cnn: lr, first_layer, second_layer, first_drop
args = (0.00002, 0,0,0.7)

# Keyword arguments: the CNN layout plus the three metric objects to compile with
kwargs = {
    "num_filters": num_filters,
    "kernel_sizes": kernel_sizes,
    "first_metric": statefull_binary_fbeta_english_lab_cnn,
    "second_metric": statefull_multi_class_fbeta_english_lab_cnn,
    "third_metric": statefull_multi_class_fbeta_english_lab_cnn_weighted,
}

# build_lab_cnn hands back the compiled model and the settings dict to log
model_english_two, settings_lab_cnn = build_lab_cnn(*args, **kwargs)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Train the English LaBSE + CNN model, validating on the English validation data.
# NOTE(review): reset_states() presumably only clears stateful-layer state, and
# Keras docs say not to pass batch_size when x is a tf.data.Dataset — confirm both.
model_english_two.reset_states()
history_lab_cnn_english = model_english_two.fit(
    x=train_ds_english,
    validation_data=val_ds_english,
    epochs=15,
    batch_size=30,
    callbacks=[callback],
)

# Log the build settings and the per-epoch history.
# NOTE(review): the key is spelled 'setings' across the notebook; kept as-is so
# records stay consistent.
append_record({
    'language':"english",
    'setings':settings_lab_cnn,
    'history':history_lab_cnn_english.history,
})

Epoch 1/15

  m.reset_state()
  m.reset_state()
  m.reset_state()


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Zero-shot: the LaBSE + CNN model was trained on English and is evaluated on the Korean test data.
# Only the arguments that matter are passed: everything that was dropped was a
# Keras default, and `max_queue_size`, `workers` and `use_multiprocessing` are
# no longer accepted by Keras 3.
history_eng_korean_lab_cnn_test = model_english_two.evaluate(
    x=test_ds,
    verbose=1,
    return_dict=False,  # keep the plain list of values that is logged below
)

# Log the values returned by evaluate() under the "english_korean_test" label
append_record({'language':"english_korean_test", 'history':history_eng_korean_lab_cnn_test})

  m.reset_state()
  m.reset_state()
  m.reset_state()




In [None]:
# Zero-shot predictions: English-trained LaBSE + CNN model scored on the Korean test data.
# NOTE(review): Keras docs say not to pass batch_size when the input is a
# tf.data.Dataset — confirm how test_ds is built.
korean_eng_lab_cnn_preds_test = model_english_two.predict(x=test_ds, batch_size=10)

# Single-column frame of sigmoid outputs, written to Drive with its row index
korean_eng_lab_cnn_preds_test_df = pd.DataFrame(data=korean_eng_lab_cnn_preds_test, columns=['predicted_test_vals'])
korean_eng_lab_cnn_preds_test_df.to_csv(path_or_buf='/content/gdrive/My Drive/266_datasets/standard_english_data/korean_eng_lab_cnn_predictions.csv')

In [None]:
# Zero-shot: the LaBSE + CNN model was trained on English and is evaluated on the Hindi test data.
# Only the arguments that matter are passed: everything that was dropped was a
# Keras default, and `max_queue_size`, `workers` and `use_multiprocessing` are
# no longer accepted by Keras 3.
history_eng_hindi_lab_cnn_test = model_english_two.evaluate(
    x=test_ds_hindi,
    verbose=1,
    return_dict=False,  # keep the plain list of values that is logged below
)

# Log the values returned by evaluate() under the "english_hindi_test" label
append_record({'language':"english_hindi_test", 'history':history_eng_hindi_lab_cnn_test})

  m.reset_state()
  m.reset_state()
  m.reset_state()




In [None]:
# Zero-shot predictions: English-trained LaBSE + CNN model scored on the Hindi test data.
# NOTE(review): Keras docs say not to pass batch_size when the input is a
# tf.data.Dataset — confirm how test_ds_hindi is built.
hindi_eng_lab_cnn_preds_test = model_english_two.predict(x=test_ds_hindi, batch_size=10)

# Single-column frame of sigmoid outputs, written to Drive with its row index
hindi_eng_lab_cnn_preds_test_df = pd.DataFrame(data=hindi_eng_lab_cnn_preds_test, columns=['predicted_test_vals'])
hindi_eng_lab_cnn_preds_test_df.to_csv(path_or_buf='/content/gdrive/My Drive/266_datasets/standard_english_data/hindi_eng_lab_cnn_predictions.csv')

# HAND TEST F1 SCORE FOR LaBSE with CNN FOR KOREAN

In [None]:
# Hand check of the F1 score for the Korean LaBSE + CNN model

# Load the Korean LaBSE + CNN test predictions
test_df_korean_lab_cnn_predicted = pd.read_csv('/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_lab_cnn_predictions.csv',
                       sep = ',', 
                       dtype={'predicted_test_vals':float}
                       )

# Turn the predicted scores into hard 0/1 labels (threshold 0.5)
test_df_korean_lab_cnn_predicted['Actual Predictions Korean Lab CNN'] = np.where(test_df_korean_lab_cnn_predicted['predicted_test_vals'] < 0.5, 0,  1)

# Drop the 'Unnamed: 0' column that read_csv creates for the unlabeled first CSV column
test_df_korean_lab_cnn_predicted = test_df_korean_lab_cnn_predicted.drop(columns=['Unnamed: 0'])

# Put the gold labels and the predictions side by side.
# NOTE(review): concat aligns on the index, so this assumes test_df_korean has a
# default 0..n-1 index in the same row order as the predictions — confirm.
test_df_korean_lab_cnn_concat = pd.concat([test_df_korean, test_df_korean_lab_cnn_predicted], axis=1)

# Tag every row as TP / FP / FN / TN; rows matching none of them stay 'unknown'
pred_korean_lab_cnn = test_df_korean_lab_cnn_concat['Actual Predictions Korean Lab CNN']
gold_korean_lab_cnn = test_df_korean_lab_cnn_concat['label']
test_df_korean_lab_cnn_concat['TP_FP'] = np.select(
    [
        (pred_korean_lab_cnn == 1) & (gold_korean_lab_cnn == 1),
        (pred_korean_lab_cnn == 1) & (gold_korean_lab_cnn == 0),
        (pred_korean_lab_cnn == 0) & (gold_korean_lab_cnn == 1),
        (pred_korean_lab_cnn == 0) & (gold_korean_lab_cnn == 0),
    ],
    ['TP', 'FP', 'FN', 'TN'],
    default='unknown',
)

print(test_df_korean_lab_cnn_concat.head(20))

# Count each outcome with a boolean sum: unlike value_counts()[...], this gives
# 0 instead of a KeyError when an outcome never occurs.
count_tp_korean_lab_cnn = (test_df_korean_lab_cnn_concat['TP_FP'] == 'TP').sum()
print("\n\nCount of TP", count_tp_korean_lab_cnn)

count_fp_korean_lab_cnn = (test_df_korean_lab_cnn_concat['TP_FP'] == 'FP').sum()
print("Count of FP", count_fp_korean_lab_cnn)

count_fn_korean_lab_cnn = (test_df_korean_lab_cnn_concat['TP_FP'] == 'FN').sum()
print("Count of FN", count_fn_korean_lab_cnn)

count_tn_korean_lab_cnn = (test_df_korean_lab_cnn_concat['TP_FP'] == 'TN').sum()
print("Count of TN", count_tn_korean_lab_cnn)

# Precision, recall and F1; each is reported as 0.0 when its denominator is 0
precision_korean_lab_cnn = (
    count_tp_korean_lab_cnn / (count_tp_korean_lab_cnn + count_fp_korean_lab_cnn)
    if (count_tp_korean_lab_cnn + count_fp_korean_lab_cnn) else 0.0
)
print("precision", precision_korean_lab_cnn)

recall_korean_lab_cnn = (
    count_tp_korean_lab_cnn / (count_tp_korean_lab_cnn + count_fn_korean_lab_cnn)
    if (count_tp_korean_lab_cnn + count_fn_korean_lab_cnn) else 0.0
)
print("recall", recall_korean_lab_cnn)

f1_score_korean_lab_cnn = (
    (2 * precision_korean_lab_cnn * recall_korean_lab_cnn) / (precision_korean_lab_cnn + recall_korean_lab_cnn)
    if (precision_korean_lab_cnn + recall_korean_lab_cnn) else 0.0
)

print("F1 score is: ", f1_score_korean_lab_cnn )


                                             comments  label  \
0   팀으로 데뷔한거면 개인활동 했어도 N빵 해야지... 그게 팀을 위해서도 맞는거고~~...      0   
1                   왕지혜 34살이지만 외모는 인정한다.여자라면 이정도는 되야지      0   
2                            여자들이 80프로잉 악플 남자는 여자욕 안해      1   
3                      설현이 떨고 있다... 아니겠지 아닐거야 그것만은 안돼      0   
4                  다된 기생충 잔치에재 뿌린 방가방송과 안현모 다신 나서지 말자      1   
5                              광규형님 성국이형팬입니다 꼭좀 전해주세요      0   
6                    남자가 사회 생활하다 보면 손이 스칠수도 있는거지 거참~~      1   
7         여자도 출신이 의심스러움.사랑했던관계라기보다는남자발목잡으려고 그러는거같은데..      1   
8                                            ? 대세배우 ?      0   
9                                 개그맨 김경민 닮은다고 나만 느낌?      0   
10                                   중환자실이면 살아도 죽은것이여      0   
11  현실적으로 둘다 유명하지 않아서 수입도 불안정하고.. 그냥 헤어지자 해서 헤어지는 듯..      0   
12  이나영 보며 이쁘다..이쁘다..역시이쁘다..어쩜 저리 하나도 안변하고 아이낳고두 이...      0   
13  ㅋㅋ 역시 1화 방영후 말들이 많네 참고로 미스터션샤인도 1화 나오고 재미없다느니 ...      0   
14                                     이

# HAND TEST F1 SCORE FOR LaBSE PLUS CNN FOR HINDI

In [None]:
# Hand check of the F1 score for the Hindi LaBSE + CNN model

# Load the Hindi LaBSE + CNN test predictions
test_df_hindi_lab_cnn_predicted = pd.read_csv('/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_lab_cnn_predictions.csv',
                       sep = ',', 
                       dtype={'predicted_test_vals':float}
                       )

# Turn the predicted scores into hard 0/1 labels (threshold 0.5)
test_df_hindi_lab_cnn_predicted['Actual Predictions Hindi Lab CNN'] = np.where(test_df_hindi_lab_cnn_predicted['predicted_test_vals'] < 0.5, 0,  1)

# Drop the 'Unnamed: 0' column that read_csv creates for the unlabeled first CSV column
test_df_hindi_lab_cnn_predicted = test_df_hindi_lab_cnn_predicted.drop(columns=['Unnamed: 0'])

# Put the gold labels and the predictions side by side.
# NOTE(review): concat aligns on the index, so this assumes test_df_hindi has a
# default 0..n-1 index in the same row order as the predictions — confirm.
test_df_hindi_lab_cnn_concat = pd.concat([test_df_hindi, test_df_hindi_lab_cnn_predicted], axis=1)

# Tag every row as TP / FP / FN / TN; rows matching none of them stay 'unknown'
pred_hindi_lab_cnn = test_df_hindi_lab_cnn_concat['Actual Predictions Hindi Lab CNN']
gold_hindi_lab_cnn = test_df_hindi_lab_cnn_concat['label']
test_df_hindi_lab_cnn_concat['TP_FP'] = np.select(
    [
        (pred_hindi_lab_cnn == 1) & (gold_hindi_lab_cnn == 1),
        (pred_hindi_lab_cnn == 1) & (gold_hindi_lab_cnn == 0),
        (pred_hindi_lab_cnn == 0) & (gold_hindi_lab_cnn == 1),
        (pred_hindi_lab_cnn == 0) & (gold_hindi_lab_cnn == 0),
    ],
    ['TP', 'FP', 'FN', 'TN'],
    default='unknown',
)
print(test_df_hindi_lab_cnn_concat.head(20))

# Count each outcome with a boolean sum: unlike value_counts()[...], this gives
# 0 instead of a KeyError when an outcome never occurs.
count_tp_hindi_lab_cnn = (test_df_hindi_lab_cnn_concat['TP_FP'] == 'TP').sum()
print("\n\nCount of TP",count_tp_hindi_lab_cnn)

count_fp_hindi_lab_cnn = (test_df_hindi_lab_cnn_concat['TP_FP'] == 'FP').sum()
print("Count of FP", count_fp_hindi_lab_cnn)

count_fn_hindi_lab_cnn = (test_df_hindi_lab_cnn_concat['TP_FP'] == 'FN').sum()
print("Count of FN",count_fn_hindi_lab_cnn)

count_tn_hindi_lab_cnn = (test_df_hindi_lab_cnn_concat['TP_FP'] == 'TN').sum()
print("Count of TN",count_tn_hindi_lab_cnn)

# Precision, recall and F1; each is reported as 0.0 when its denominator is 0
precision_hindi_lab_cnn = (
    count_tp_hindi_lab_cnn / (count_tp_hindi_lab_cnn + count_fp_hindi_lab_cnn)
    if (count_tp_hindi_lab_cnn + count_fp_hindi_lab_cnn) else 0.0
)
print("precision", precision_hindi_lab_cnn)

recall_hindi_lab_cnn = (
    count_tp_hindi_lab_cnn / (count_tp_hindi_lab_cnn + count_fn_hindi_lab_cnn)
    if (count_tp_hindi_lab_cnn + count_fn_hindi_lab_cnn) else 0.0
)
print("recall", recall_hindi_lab_cnn)

f1_score_hindi_lab_cnn = (
    (2 * precision_hindi_lab_cnn * recall_hindi_lab_cnn) / (precision_hindi_lab_cnn + recall_hindi_lab_cnn)
    if (precision_hindi_lab_cnn + recall_hindi_lab_cnn) else 0.0
)

print("\n\nF1 score for Hindi Lab CNN is: ", f1_score_hindi_lab_cnn )




                                             comments  label  \
0   @zishanAliRJD @iAnantSingh_ *ओसामा साहब ने सिर...      0   
1   @China_Amb_India @narendramodi I am shocked th...      0   
2   कल से 18 से ऊपर वालो को हवा की वैक्सीन लगेगी.....      0   
3   इधर की बात उधर करने में \nआज भी जीमेल  से आगे ...      0   
4   ☯️ मोदी जी ➡️ सोंगंद मुझे इस मिट्टी की मै देश ...      1   
5   कबीर परमेश्वर जी ने अध्यात्म का विधान बताया है...      0   
6   1990 रिपीट हो चुका है,1 लाख से ज्यादा हिन्दू ल...      0   
7   "MODI MEANS TERROR"  #नरेंद्र_मोदी_ग्लोबल_पप्प...      1   
8                      @Asrathour_143 और हिंदी में 😜😜      0   
9   केंद्र में होते हुए भी, यदि आप कुछ नहीं कर पा ...      0   
10  पप्पू लाल  इस लिए हाथ दिखा रहा है वह जानता है ...      1   
11  @chitraaum बहन आपसे उम्मित हे आप इस विषय पर सं...      0   
12  RT @MdNadim_BGS: ₹21000Cr की लागत से बनी भारत ...      0   
13  It’s Time To Come Together To Save Farmers &am...      0   
14  @anandkalra69 @Vandana__Indian हिन्द

# HAND TEST F1 SCORE LABSE OPTIMIZED HYPERPARAMETERS - KOREAN

In [None]:
# Hand check of the F1 score for the Korean LaBSE model (optimized hyperparameters)

# Load the Korean LaBSE test predictions
test_df_korean_lab_predicted = pd.read_csv('/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_lab_predictions_two.csv',
                       sep = ',', 
                       dtype={'predicted_test_vals':float}
                       )

# Turn the predicted scores into hard 0/1 labels (threshold 0.5)
test_df_korean_lab_predicted['Actual Predictions Korean Lab'] = np.where(test_df_korean_lab_predicted['predicted_test_vals'] < 0.5, 0,  1)

# Drop the 'Unnamed: 0' column that read_csv creates for the unlabeled first CSV column
test_df_korean_lab_predicted = test_df_korean_lab_predicted.drop(columns=['Unnamed: 0'])

# Put the gold labels and the predictions side by side.
# NOTE(review): concat aligns on the index, so this assumes test_df_korean has a
# default 0..n-1 index in the same row order as the predictions — confirm.
test_df_korean_lab_concat = pd.concat([test_df_korean, test_df_korean_lab_predicted], axis=1)

# Tag every row as TP / FP / FN / TN; rows matching none of them stay 'unknown'
pred_korean_lab = test_df_korean_lab_concat['Actual Predictions Korean Lab']
gold_korean_lab = test_df_korean_lab_concat['label']
test_df_korean_lab_concat['TP_FP'] = np.select(
    [
        (pred_korean_lab == 1) & (gold_korean_lab == 1),
        (pred_korean_lab == 1) & (gold_korean_lab == 0),
        (pred_korean_lab == 0) & (gold_korean_lab == 1),
        (pred_korean_lab == 0) & (gold_korean_lab == 0),
    ],
    ['TP', 'FP', 'FN', 'TN'],
    default='unknown',
)

print(test_df_korean_lab_concat.head(20))

# Count each outcome with a boolean sum: unlike value_counts()[...], this gives
# 0 instead of a KeyError when an outcome never occurs.
count_tp_korean_lab = (test_df_korean_lab_concat['TP_FP'] == 'TP').sum()
print("\n\nCount of TP", count_tp_korean_lab)

count_fp_korean_lab = (test_df_korean_lab_concat['TP_FP'] == 'FP').sum()
print("Count of FP", count_fp_korean_lab)

count_fn_korean_lab = (test_df_korean_lab_concat['TP_FP'] == 'FN').sum()
print("Count of FN", count_fn_korean_lab)

count_tn_korean_lab = (test_df_korean_lab_concat['TP_FP'] == 'TN').sum()
print("Count of TN", count_tn_korean_lab)

# Precision, recall and F1; each is reported as 0.0 when its denominator is 0
precision_korean_lab = (
    count_tp_korean_lab / (count_tp_korean_lab + count_fp_korean_lab)
    if (count_tp_korean_lab + count_fp_korean_lab) else 0.0
)
print("precision", precision_korean_lab)

recall_korean_lab = (
    count_tp_korean_lab / (count_tp_korean_lab + count_fn_korean_lab)
    if (count_tp_korean_lab + count_fn_korean_lab) else 0.0
)
print("recall", recall_korean_lab)

f1_score_korean_lab = (
    (2 * precision_korean_lab * recall_korean_lab) / (precision_korean_lab + recall_korean_lab)
    if (precision_korean_lab + recall_korean_lab) else 0.0
)

print("F1 score is: ", f1_score_korean_lab )


                                             comments  label  \
0   팀으로 데뷔한거면 개인활동 했어도 N빵 해야지... 그게 팀을 위해서도 맞는거고~~...      0   
1                   왕지혜 34살이지만 외모는 인정한다.여자라면 이정도는 되야지      0   
2                            여자들이 80프로잉 악플 남자는 여자욕 안해      1   
3                      설현이 떨고 있다... 아니겠지 아닐거야 그것만은 안돼      0   
4                  다된 기생충 잔치에재 뿌린 방가방송과 안현모 다신 나서지 말자      1   
5                              광규형님 성국이형팬입니다 꼭좀 전해주세요      0   
6                    남자가 사회 생활하다 보면 손이 스칠수도 있는거지 거참~~      1   
7         여자도 출신이 의심스러움.사랑했던관계라기보다는남자발목잡으려고 그러는거같은데..      1   
8                                            ? 대세배우 ?      0   
9                                 개그맨 김경민 닮은다고 나만 느낌?      0   
10                                   중환자실이면 살아도 죽은것이여      0   
11  현실적으로 둘다 유명하지 않아서 수입도 불안정하고.. 그냥 헤어지자 해서 헤어지는 듯..      0   
12  이나영 보며 이쁘다..이쁘다..역시이쁘다..어쩜 저리 하나도 안변하고 아이낳고두 이...      0   
13  ㅋㅋ 역시 1화 방영후 말들이 많네 참고로 미스터션샤인도 1화 나오고 재미없다느니 ...      0   
14                                     이

# HAND CALCULATE LaBSE ONLY OPTIMIZED HYPERPARAMETERS - HINDI

In [None]:
# Hand check of the F1 score for the Hindi LaBSE model (optimized hyperparameters)

# Load the Hindi LaBSE test predictions
test_df_hindi_lab_predicted = pd.read_csv('/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_lab_predictions_two.csv',
                       sep = ',', 
                       dtype={'predicted_test_vals':float}
                       )

# Turn the predicted scores into hard 0/1 labels (threshold 0.5)
test_df_hindi_lab_predicted['Actual Predictions Hindi Lab'] = np.where(test_df_hindi_lab_predicted['predicted_test_vals'] < 0.5, 0,  1)

# Drop the 'Unnamed: 0' column that read_csv creates for the unlabeled first CSV column
test_df_hindi_lab_predicted = test_df_hindi_lab_predicted.drop(columns=['Unnamed: 0'])

# Put the gold labels and the predictions side by side.
# NOTE(review): concat aligns on the index, so this assumes test_df_hindi has a
# default 0..n-1 index in the same row order as the predictions — confirm.
test_df_hindi_lab_concat = pd.concat([test_df_hindi, test_df_hindi_lab_predicted], axis=1)

# Tag every row as TP / FP / FN / TN; rows matching none of them stay 'unknown'
pred_hindi_lab = test_df_hindi_lab_concat['Actual Predictions Hindi Lab']
gold_hindi_lab = test_df_hindi_lab_concat['label']
test_df_hindi_lab_concat['TP_FP'] = np.select(
    [
        (pred_hindi_lab == 1) & (gold_hindi_lab == 1),
        (pred_hindi_lab == 1) & (gold_hindi_lab == 0),
        (pred_hindi_lab == 0) & (gold_hindi_lab == 1),
        (pred_hindi_lab == 0) & (gold_hindi_lab == 0),
    ],
    ['TP', 'FP', 'FN', 'TN'],
    default='unknown',
)

print(test_df_hindi_lab_concat.head(20))

# Count each outcome with a boolean sum: unlike value_counts()[...], this gives
# 0 instead of a KeyError when an outcome never occurs.
count_tp_hindi_lab = (test_df_hindi_lab_concat['TP_FP'] == 'TP').sum()
print("\n\nCount of TP",count_tp_hindi_lab)

count_fp_hindi_lab = (test_df_hindi_lab_concat['TP_FP'] == 'FP').sum()
print("Count of FP", count_fp_hindi_lab)

count_fn_hindi_lab = (test_df_hindi_lab_concat['TP_FP'] == 'FN').sum()
print("Count of FN",count_fn_hindi_lab)

count_tn_hindi_lab = (test_df_hindi_lab_concat['TP_FP'] == 'TN').sum()
print("Count of TN",count_tn_hindi_lab)

# Precision, recall and F1; each is reported as 0.0 when its denominator is 0
precision_hindi_lab = (
    count_tp_hindi_lab / (count_tp_hindi_lab + count_fp_hindi_lab)
    if (count_tp_hindi_lab + count_fp_hindi_lab) else 0.0
)
print("precision", precision_hindi_lab)

recall_hindi_lab = (
    count_tp_hindi_lab / (count_tp_hindi_lab + count_fn_hindi_lab)
    if (count_tp_hindi_lab + count_fn_hindi_lab) else 0.0
)
print("recall", recall_hindi_lab)

f1_score_hindi_lab = (
    (2 * precision_hindi_lab * recall_hindi_lab) / (precision_hindi_lab + recall_hindi_lab)
    if (precision_hindi_lab + recall_hindi_lab) else 0.0
)

print("\n\nF1 score for Hindi Lab is: ", f1_score_hindi_lab )


                                             comments  label  \
0   @zishanAliRJD @iAnantSingh_ *ओसामा साहब ने सिर...      0   
1   @China_Amb_India @narendramodi I am shocked th...      0   
2   कल से 18 से ऊपर वालो को हवा की वैक्सीन लगेगी.....      0   
3   इधर की बात उधर करने में \nआज भी जीमेल  से आगे ...      0   
4   ☯️ मोदी जी ➡️ सोंगंद मुझे इस मिट्टी की मै देश ...      1   
5   कबीर परमेश्वर जी ने अध्यात्म का विधान बताया है...      0   
6   1990 रिपीट हो चुका है,1 लाख से ज्यादा हिन्दू ल...      0   
7   "MODI MEANS TERROR"  #नरेंद्र_मोदी_ग्लोबल_पप्प...      1   
8                      @Asrathour_143 और हिंदी में 😜😜      0   
9   केंद्र में होते हुए भी, यदि आप कुछ नहीं कर पा ...      0   
10  पप्पू लाल  इस लिए हाथ दिखा रहा है वह जानता है ...      1   
11  @chitraaum बहन आपसे उम्मित हे आप इस विषय पर सं...      0   
12  RT @MdNadim_BGS: ₹21000Cr की लागत से बनी भारत ...      0   
13  It’s Time To Come Together To Save Farmers &am...      0   
14  @anandkalra69 @Vandana__Indian हिन्द

# HAND TEST F1 SCORE FOR KOREAN LABSE BASELINE PARAMETER MODEL

In [None]:
# Hand check of the F1 score for the Korean LaBSE baseline-parameter model

# Load the Korean LaBSE baseline test predictions
test_df_korean_lab_base_predicted = pd.read_csv('/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_lab_predictions.csv',
                       sep = ',', 
                       dtype={'predicted_test_vals':float}
                       )

# Turn the predicted scores into hard 0/1 labels (threshold 0.5)
test_df_korean_lab_base_predicted['Actual Predictions Korean Lab Base'] = np.where(test_df_korean_lab_base_predicted['predicted_test_vals'] < 0.5, 0,  1)

# Drop the 'Unnamed: 0' column that read_csv creates for the unlabeled first CSV column
test_df_korean_lab_base_predicted = test_df_korean_lab_base_predicted.drop(columns=['Unnamed: 0'])

# Put the gold labels and the predictions side by side.
# NOTE(review): concat aligns on the index, so this assumes test_df_korean has a
# default 0..n-1 index in the same row order as the predictions — confirm.
test_df_korean_lab_base_concat = pd.concat([test_df_korean, test_df_korean_lab_base_predicted], axis=1)

# Tag every row as TP / FP / FN / TN; rows matching none of them stay 'unknown'
pred_korean_lab_base = test_df_korean_lab_base_concat['Actual Predictions Korean Lab Base']
gold_korean_lab_base = test_df_korean_lab_base_concat['label']
test_df_korean_lab_base_concat['TP_FP'] = np.select(
    [
        (pred_korean_lab_base == 1) & (gold_korean_lab_base == 1),
        (pred_korean_lab_base == 1) & (gold_korean_lab_base == 0),
        (pred_korean_lab_base == 0) & (gold_korean_lab_base == 1),
        (pred_korean_lab_base == 0) & (gold_korean_lab_base == 0),
    ],
    ['TP', 'FP', 'FN', 'TN'],
    default='unknown',
)

print(test_df_korean_lab_base_concat.head(20))

# Count each outcome with a boolean sum: unlike value_counts()[...], this gives
# 0 instead of a KeyError when an outcome never occurs.
count_tp_korean_lab_base = (test_df_korean_lab_base_concat['TP_FP'] == 'TP').sum()
print("\n\nCount of TP", count_tp_korean_lab_base)

count_fp_korean_lab_base = (test_df_korean_lab_base_concat['TP_FP'] == 'FP').sum()
print("Count of FP", count_fp_korean_lab_base)

count_fn_korean_lab_base = (test_df_korean_lab_base_concat['TP_FP'] == 'FN').sum()
print("Count of FN", count_fn_korean_lab_base)

count_tn_korean_lab_base = (test_df_korean_lab_base_concat['TP_FP'] == 'TN').sum()
print("Count of TN", count_tn_korean_lab_base)

# Precision, recall and F1; each is reported as 0.0 when its denominator is 0
precision_korean_lab_base = (
    count_tp_korean_lab_base / (count_tp_korean_lab_base + count_fp_korean_lab_base)
    if (count_tp_korean_lab_base + count_fp_korean_lab_base) else 0.0
)
print("precision", precision_korean_lab_base)

recall_korean_lab_base = (
    count_tp_korean_lab_base / (count_tp_korean_lab_base + count_fn_korean_lab_base)
    if (count_tp_korean_lab_base + count_fn_korean_lab_base) else 0.0
)
print("recall", recall_korean_lab_base)

f1_score_korean_lab_base = (
    (2 * precision_korean_lab_base * recall_korean_lab_base) / (precision_korean_lab_base + recall_korean_lab_base)
    if (precision_korean_lab_base + recall_korean_lab_base) else 0.0
)

print("F1 score is: ", f1_score_korean_lab_base )

                                             comments  label  \
0   팀으로 데뷔한거면 개인활동 했어도 N빵 해야지... 그게 팀을 위해서도 맞는거고~~...      0   
1                   왕지혜 34살이지만 외모는 인정한다.여자라면 이정도는 되야지      0   
2                            여자들이 80프로잉 악플 남자는 여자욕 안해      1   
3                      설현이 떨고 있다... 아니겠지 아닐거야 그것만은 안돼      0   
4                  다된 기생충 잔치에재 뿌린 방가방송과 안현모 다신 나서지 말자      1   
5                              광규형님 성국이형팬입니다 꼭좀 전해주세요      0   
6                    남자가 사회 생활하다 보면 손이 스칠수도 있는거지 거참~~      1   
7         여자도 출신이 의심스러움.사랑했던관계라기보다는남자발목잡으려고 그러는거같은데..      1   
8                                            ? 대세배우 ?      0   
9                                 개그맨 김경민 닮은다고 나만 느낌?      0   
10                                   중환자실이면 살아도 죽은것이여      0   
11  현실적으로 둘다 유명하지 않아서 수입도 불안정하고.. 그냥 헤어지자 해서 헤어지는 듯..      0   
12  이나영 보며 이쁘다..이쁘다..역시이쁘다..어쩜 저리 하나도 안변하고 아이낳고두 이...      0   
13  ㅋㅋ 역시 1화 방영후 말들이 많네 참고로 미스터션샤인도 1화 나오고 재미없다느니 ...      0   
14                                     이

# HAND CALCULATE F1 FOR LABSE BASELINE HYPERPARAMETERS FOR HINDI

In [None]:
# Hand check F1 score for Hindi Lab Model
#
# Loads the saved test-set prediction scores, thresholds them at 0.5, joins
# them column-wise with the gold labels in `test_df_hindi`, tags every row as
# TP / FP / FN / TN and derives precision, recall and F1 from those tallies.

# Load the Hindi predictions
test_df_hindi_lab_base_predicted = pd.read_csv(
    '/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_lab_predictions.csv',
    sep=',',
    dtype={'predicted_test_vals': float},
)

# Turn predictions into 0 and 1 (scores below 0.5 become 0, everything else 1)
test_df_hindi_lab_base_predicted['Actual Predictions Hindi Lab Base'] = np.where(
    test_df_hindi_lab_base_predicted['predicted_test_vals'] < 0.5, 0, 1
)

# Drop the unnamed column that was written to the CSV alongside the predictions
test_df_hindi_lab_base_predicted = test_df_hindi_lab_base_predicted.drop(columns=['Unnamed: 0'])

# Concatenate the correct test data values with predicted data values.
# NOTE(review): pd.concat(axis=1) aligns rows on the index, so this presumably
# relies on both frames sharing the same index -- confirm against the cell that
# builds test_df_hindi.
test_df_hindi_lab_base_concat = pd.concat(
    [test_df_hindi, test_df_hindi_lab_base_predicted], axis=1
)

# Tag each row with its confusion-matrix outcome. The four conditions are
# mutually exclusive; a row matching none of them is tagged 'unknown'.
hindi_lab_base_pred = test_df_hindi_lab_base_concat['Actual Predictions Hindi Lab Base']
hindi_lab_base_gold = test_df_hindi_lab_base_concat['label']
test_df_hindi_lab_base_concat['TP_FP'] = np.select(
    [
        (hindi_lab_base_pred == 1) & (hindi_lab_base_gold == 1),  # TP
        (hindi_lab_base_pred == 1) & (hindi_lab_base_gold == 0),  # FP
        (hindi_lab_base_pred == 0) & (hindi_lab_base_gold == 1),  # FN
        (hindi_lab_base_pred == 0) & (hindi_lab_base_gold == 0),  # TN
    ],
    ['TP', 'FP', 'FN', 'TN'],
    default='unknown',
)

print(test_df_hindi_lab_base_concat.head(20))

# Tally the outcomes once. `.get(..., 0)` keeps the cell from raising KeyError
# when an outcome never occurs (e.g. the model produced no false positives).
hindi_lab_base_outcome_counts = test_df_hindi_lab_base_concat['TP_FP'].value_counts()

# Count TP
count_tp_hindi_lab_base = hindi_lab_base_outcome_counts.get('TP', 0)
print("\n\nCount of TP",count_tp_hindi_lab_base)

# Count FP
count_fp_hindi_lab_base = hindi_lab_base_outcome_counts.get('FP', 0)
print("Count of FP", count_fp_hindi_lab_base)

# Count FN
count_fn_hindi_lab_base = hindi_lab_base_outcome_counts.get('FN', 0)
print("Count of FN",count_fn_hindi_lab_base)

# Count TN
count_tn_hindi_lab_base = hindi_lab_base_outcome_counts.get('TN', 0)
print("Count of TN",count_tn_hindi_lab_base)

# Calculate Precision and Recall and F1 Score.
# Each ratio falls back to 0.0 when its denominator is zero instead of
# dividing by zero.
hindi_lab_base_predicted_positive = count_tp_hindi_lab_base + count_fp_hindi_lab_base
precision_hindi_lab_base = (
    count_tp_hindi_lab_base / hindi_lab_base_predicted_positive
    if hindi_lab_base_predicted_positive else 0.0
)
print("precision", precision_hindi_lab_base)

hindi_lab_base_actual_positive = count_tp_hindi_lab_base + count_fn_hindi_lab_base
recall_hindi_lab_base = (
    count_tp_hindi_lab_base / hindi_lab_base_actual_positive
    if hindi_lab_base_actual_positive else 0.0
)
print("recall", recall_hindi_lab_base)

hindi_lab_base_pr_sum = precision_hindi_lab_base + recall_hindi_lab_base
f1_score_hindi_lab_base = (
    (2 * precision_hindi_lab_base * recall_hindi_lab_base) / hindi_lab_base_pr_sum
    if hindi_lab_base_pr_sum else 0.0
)

print("\n\nF1 score for Hindi Lab is: ", f1_score_hindi_lab_base )

                                             comments  label  \
0   @zishanAliRJD @iAnantSingh_ *ओसामा साहब ने सिर...      0   
1   @China_Amb_India @narendramodi I am shocked th...      0   
2   कल से 18 से ऊपर वालो को हवा की वैक्सीन लगेगी.....      0   
3   इधर की बात उधर करने में \nआज भी जीमेल  से आगे ...      0   
4   ☯️ मोदी जी ➡️ सोंगंद मुझे इस मिट्टी की मै देश ...      1   
5   कबीर परमेश्वर जी ने अध्यात्म का विधान बताया है...      0   
6   1990 रिपीट हो चुका है,1 लाख से ज्यादा हिन्दू ल...      0   
7   "MODI MEANS TERROR"  #नरेंद्र_मोदी_ग्लोबल_पप्प...      1   
8                      @Asrathour_143 और हिंदी में 😜😜      0   
9   केंद्र में होते हुए भी, यदि आप कुछ नहीं कर पा ...      0   
10  पप्पू लाल  इस लिए हाथ दिखा रहा है वह जानता है ...      1   
11  @chitraaum बहन आपसे उम्मित हे आप इस विषय पर सं...      0   
12  RT @MdNadim_BGS: ₹21000Cr की लागत से बनी भारत ...      0   
13  It’s Time To Come Together To Save Farmers &am...      0   
14  @anandkalra69 @Vandana__Indian हिन्द

In [None]:
# Combine the Hindi gold test frame with the per-model result frames, side by
# side (column-wise), into a single comparison table.
# NOTE(review): test_df_hindi_lab_base_concat is itself built from
# test_df_hindi, so its columns show up more than once in the combined frame.
hindi_result_frames = [
    test_df_hindi,
    test_df_hindi_lab_base_concat,
    test_df_hindi_lab_concat,
    test_df_hindi_lab_cnn_concat,
]
test_df_hindi_confusion = pd.concat(hindi_result_frames, axis=1)

In [None]:
# Preview the first rows of the combined Hindi table, then write the whole
# frame to a CSV on the mounted Drive.
hindi_confusion_csv_path = '/content/gdrive/My Drive/266_datasets/standard_hindi_data/hindi_confusion.csv'
hindi_confusion_preview = test_df_hindi_confusion.head()
print(hindi_confusion_preview)
test_df_hindi_confusion.to_csv(hindi_confusion_csv_path)

                                            comments  label  \
0  @zishanAliRJD @iAnantSingh_ *ओसामा साहब ने सिर...      0   
1  @China_Amb_India @narendramodi I am shocked th...      0   
2  कल से 18 से ऊपर वालो को हवा की वैक्सीन लगेगी.....      0   
3  इधर की बात उधर करने में \nआज भी जीमेल  से आगे ...      0   
4  ☯️ मोदी जी ➡️ सोंगंद मुझे इस मिट्टी की मै देश ...      1   

                                            comments  label  \
0  @zishanAliRJD @iAnantSingh_ *ओसामा साहब ने सिर...      0   
1  @China_Amb_India @narendramodi I am shocked th...      0   
2  कल से 18 से ऊपर वालो को हवा की वैक्सीन लगेगी.....      0   
3  इधर की बात उधर करने में \nआज भी जीमेल  से आगे ...      0   
4  ☯️ मोदी जी ➡️ सोंगंद मुझे इस मिट्टी की मै देश ...      1   

   predicted_test_vals  Actual Predictions Hindi Lab Base TP_FP  \
0             0.005121                                  0    TN   
1             0.831543                                  1    FP   
2             0.001159                   

In [None]:
# Combine the Korean gold test frame with the per-model result frames, side by
# side (column-wise), into a single comparison table.
# NOTE(review): test_df_korean_lab_base_concat is itself built from
# test_df_korean, so its columns show up more than once in the combined frame.
korean_result_frames = [
    test_df_korean,
    test_df_korean_lab_base_concat,
    test_df_korean_lab_concat,
    test_df_korean_lab_cnn_concat,
]
test_df_korean_confusion = pd.concat(korean_result_frames, axis=1)

In [None]:
# Preview the first rows of the combined Korean table, then write the whole
# frame to a CSV on the mounted Drive.
korean_confusion_csv_path = '/content/gdrive/My Drive/266_datasets/standard_korean_data/korean_confusion.csv'
korean_confusion_preview = test_df_korean_confusion.head()
print(korean_confusion_preview)
test_df_korean_confusion.to_csv(korean_confusion_csv_path)

                                            comments  label  \
0  팀으로 데뷔한거면 개인활동 했어도 N빵 해야지... 그게 팀을 위해서도 맞는거고~~...      0   
1                  왕지혜 34살이지만 외모는 인정한다.여자라면 이정도는 되야지      0   
2                           여자들이 80프로잉 악플 남자는 여자욕 안해      1   
3                     설현이 떨고 있다... 아니겠지 아닐거야 그것만은 안돼      0   
4                 다된 기생충 잔치에재 뿌린 방가방송과 안현모 다신 나서지 말자      1   

                                            comments  label  \
0  팀으로 데뷔한거면 개인활동 했어도 N빵 해야지... 그게 팀을 위해서도 맞는거고~~...      0   
1                  왕지혜 34살이지만 외모는 인정한다.여자라면 이정도는 되야지      0   
2                           여자들이 80프로잉 악플 남자는 여자욕 안해      1   
3                     설현이 떨고 있다... 아니겠지 아닐거야 그것만은 안돼      0   
4                 다된 기생충 잔치에재 뿌린 방가방송과 안현모 다신 나서지 말자      1   

   predicted_test_vals  Actual Predictions Korean Lab Base TP_FP  \
0             0.073717                                   0    TN   
1             0.971796                                   1    FP   
2             0.997700                