## https://wikidocs.net/22891

# 1. Spam Detection

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
import tensorflow as tf
from keras import optimizers
import matplotlib.pyplot as plt
import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
#  https://www.kaggle.com/uciml/sms-spam-collection-dataset
data = pd.read_csv(r'spam.csv', encoding='latin1')
print('sample number:',len(data))

In [None]:
data[:5]

In [None]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']
data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data[:5]

In [None]:
data.info()

In [None]:
# missing?
data.isnull().values.any()

In [None]:
# unique?
data['v2'].nunique()

In [None]:
# delete duplicate
data.drop_duplicates(subset=['v2'], inplace=True)
data.info()

In [None]:
# Renumber the rows 0..n-1 so the index is contiguous.
numlist = list(range(len(data)))
data = data.set_index(pd.Index(numlist))
# Keep only the first 1000 messages.
data = data[:1000]
data

In [None]:
# Class counts before under-sampling.
data['v1'].value_counts()

In [None]:
# Under-sample the ham class: drop every ham (v1 == 0) row found among the
# first 824 rows; rows from position 824 onward are left untouched.
# NOTE(review): 824 is a hard-coded cut-off, presumably tuned to this dataset
# to roughly balance the classes -- confirm before reusing with other data.
data.drop(data[:824][data['v1'][:824] == 0].index, inplace=True)
# Renumber again so the index is contiguous after the drop.
numlist = list(range(len(data)))
data = data.set_index(pd.Index(numlist))
data

In [None]:
# Class counts after under-sampling.
data['v1'].value_counts()

In [None]:
X = data['v2']
y = data['v1']
y = y.astype(float)

In [None]:
print(X)

In [None]:
# integer encoding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_encoded = tokenizer.texts_to_sequences(X)

In [None]:
# index (more frequent, smaller number given)
word_to_index = tokenizer.word_index
# print(len(word_to_index), word_to_index)

In [None]:
# word group for padding
vocab_size = len(word_to_index) + 1
print(vocab_size)

In [None]:
# the longest email
maxlen = 0
for i in range(len(X_encoded)):
    if len(X_encoded[i]) >= maxlen:
        maxlen = len(X_encoded[i])
print(maxlen)

In [None]:
# padding
max_len = maxlen
X_padded = pad_sequences(X_encoded, maxlen = max_len)
X_padded.shape

In [None]:
X = X_padded
X

In [None]:
y_hard = pd.DataFrame(y)
y_hard

In [None]:
# Cross-validation and training configuration for the spam experiments.
from sklearn.model_selection import StratifiedKFold

# Everything Keras-related is taken from the tensorflow.keras namespace so
# that layers, models, optimizers and callbacks all come from the same Keras
# implementation (mixing it with the standalone `keras` package can fail when
# the two installed versions differ).
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, LSTM, GRU
from tensorflow.keras.models import Sequential

# 5-fold stratified CV; the fixed seed makes the fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

epochs = 10          # training epochs per fit
batch = 64           # mini-batch size

embedding_dim = 32   # width of the word-embedding vectors
hidden_units = 32    # units in the recurrent layer

In [None]:
def create_model():
    """Build the uncompiled classifier: Embedding -> LSTM -> one sigmoid unit.

    Layer sizes are read from the module-level ``vocab_size``,
    ``embedding_dim`` and ``hidden_units``. To try another recurrent cell,
    replace ``LSTM`` with ``SimpleRNN`` or ``GRU``.
    """
    layers = [
        Embedding(vocab_size, embedding_dim),
        LSTM(hidden_units),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

# 1-0. Generating Prob_labels

In [None]:
# Fit a reference model on the hard labels, then keep its predicted
# probability for every sample as that sample's "probability label".
gen_model = create_model()
gen_model.compile(
    loss='BinaryCrossentropy',
    optimizer=optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
history = gen_model.fit(
    X, y_hard,
    validation_split=0.2,
    epochs=epochs,
    verbose=0,
    batch_size=batch,
)

# Training-loss curve of the reference model.
bold = {'fontweight': "bold"}
plt.plot(history.history['loss'], label='loss')
plt.ylim([0, 1])
plt.xlabel('Iteration', fontsize=15, **bold)
plt.ylabel('Loss', fontsize=15, **bold)
plt.title("Cost Function", fontsize=20, **bold)
plt.legend()
plt.show()

# Predict on the same X and flatten to one probability per row.
result = gen_model.predict(X, verbose=0)
prob_label = list(result.reshape(len(X)))
y = pd.DataFrame(prob_label)
y

In [None]:
# Flag every sample by comparing the reference model's probability with the
# threshold c:
#   0 = easy sample  (label 0 with prob <= c, or label 1 with prob >= 1 - c)
#   2 = hard sample  (the probability falls on the wrong side of the threshold)
c = 0.5
hard_labels = y_hard['v1'].to_numpy()
probabilities = y[0].to_numpy()
is_negative = hard_labels == 0
if not np.all(is_negative | (hard_labels == 1)):
    # Any other label would leave y_edge shorter than, and misaligned with, X.
    raise ValueError('hard labels must all be 0 or 1')
is_easy = np.where(is_negative, probabilities <= c, probabilities >= 1 - c)
y_edge = pd.DataFrame(np.where(is_easy, 0, 2))
y_edge.value_counts()

In [None]:
#
edge_list = list(y_edge[y_edge[0] == 2].index)
normal_list = list(y_edge[y_edge[0] == 0].index)
print(edge_list)

In [None]:
r = list(y_edge[0]).count(0)/list(y_edge[0]).count(2)   # normal/edge
alpha = (r-1)/(2*r)
print(r, alpha)

# 1-1. Focal(Hard) and SLS(Hard/alpha)

In [None]:
# Repeated 5-fold comparison: focal loss on hard labels vs. binary
# cross-entropy on smoothed (SLS) labels, scored on all / edge / normal
# test samples.
edge_flags = list(y_edge[0])
r = edge_flags.count(0) / edge_flags.count(2)   # normal/edge
alpha = (r - 1) / (2 * r)
B = [0.00, alpha]   # smoothing amounts to try: none, and the ratio-derived alpha


def _subset_accuracy(model, features, labels):
    """Return the accuracy of the rounded predictions, or NaN for an empty subset."""
    if len(features) == 0:
        return np.nan
    predicted = np.round(model.predict(features, verbose=0))
    return metrics.accuracy_score(labels, predicted)


def _cross_validate(train_labels, loss, learning_rate):
    """Run one stratified K-fold pass and return the fold-averaged accuracies.

    Args:
        train_labels: single-column DataFrame of training targets, row-aligned
            with ``X`` (hard labels or smoothed labels).
        loss: Keras loss identifier passed to ``compile``.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        ``[total, edge, normal]``: mean test accuracy over the folds on every
        test sample, on the edge samples only and on the normal samples only.
        Testing always uses the hard labels in ``y_hard``.
    """
    total_scores, edge_scores, normal_scores = [], [], []
    # Folds are stratified by the edge/normal flag, not by the class label.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y_edge), start=1):
        fold_targets = train_labels.iloc[train_index]
        if fold == 1:
            print(fold_targets.value_counts())

        # Set membership keeps the edge/normal split linear in the fold size.
        in_test = set(test_index.tolist())
        test_edge_list = [index for index in edge_list if index in in_test]
        test_normal_list = [index for index in normal_list if index in in_test]

        X_train = np.array(X[train_index])
        y_train = fold_targets.to_numpy(dtype=float)
        X_test = np.array(X[test_index])
        y_test = y_hard.iloc[test_index].to_numpy(dtype=float)

        # Build a fresh model for every fold. Reusing one model across folds
        # would keep weights already fitted on rows that belong to this fold's
        # test split, leaking test data into training and inflating accuracy.
        model = create_model()
        model.compile(loss=loss, optimizer=optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, verbose=0, batch_size=batch)

        total_scores.append(_subset_accuracy(model, X_test, y_test))
        edge_scores.append(_subset_accuracy(
            model,
            np.array(X[test_edge_list]),
            y_hard.iloc[test_edge_list].to_numpy(dtype=float),
        ))
        normal_scores.append(_subset_accuracy(
            model,
            np.array(X[test_normal_list]),
            y_hard.iloc[test_normal_list].to_numpy(dtype=float),
        ))
    # nanmean skips folds whose edge/normal subset was empty.
    return [np.nanmean(total_scores), np.nanmean(edge_scores), np.nanmean(normal_scores)]


# Easy/hard mask, computed once: a sample is easy when the reference
# probability lies on the correct side of the threshold c for its hard label.
hard_labels = y_hard['v1'].to_numpy(dtype=float)
probabilities = np.asarray(prob_label)
is_negative = hard_labels == 0
is_easy = np.where(is_negative, probabilities <= c, probabilities >= 1 - c)

for t in range(10):    # 10 times repeat
    res = pd.DataFrame({'Focal':[0, 0, 0]}, index = ['Total','Edge','Normal'])

    # Focal loss, trained on the hard labels.
    print('#'*50,'Focal','#'*50)
    focal_scores = _cross_validate(y_hard, 'BinaryFocalCrossentropy', 0.001)
    res['Focal'] = focal_scores
    print(focal_scores)

    for b in B:
        print('#'*50,'SLS',b,'#'*50)
        # Easy samples get the smoothed target (b for class 0, 1 - b for
        # class 1); hard samples keep their original 0/1 label.
        smoothed = np.where(is_negative, b, 1 - b)
        y_sls = pd.DataFrame(np.where(is_easy, smoothed, hard_labels))

        sls_scores = _cross_validate(y_sls, 'BinaryCrossentropy', 0.001)
        res['SLS({})'.format(b)] = sls_scores
        print(sls_scores)
    # Appended on purpose: the file accumulates one result table per repeat.
    res.to_csv("RNN_SPAM_5CV(SLS_c0.5)_alphaimproved.csv", mode = 'a', float_format='%.4g')

# 2. Reuters News

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import reuters

In [None]:
vocab_size = 1000

(XX, yy), (X_dummy, y_dummy) = reuters.load_data(num_words=vocab_size, test_split=0)

print(len(XX))
print(len(X_dummy))
num_classes = len(set(yy))
print(num_classes)

In [None]:
print(XX[0])
print(yy[0])

In [None]:
word_to_index = reuters.get_word_index()
# print(len(word_to_index), word_to_index)

In [None]:
# word group for padding
# vocab_size = len(word_to_index) + 3
print(vocab_size)

In [None]:
# the longest news
maxlen = 0
for i in range(len(XX)):
    if len(XX[i]) >= maxlen:
        maxlen = len(XX[i])
        
# maxlen = 100
print(maxlen)

In [None]:
# padding
max_len = maxlen
X_padded = pad_sequences(XX, maxlen = max_len)
X_padded.shape

In [None]:
pd.Series(yy).value_counts()

In [None]:
# Picking only label 3->'0' & 1->'1'
idx_4 = []
idx_3 = []
for i in range(len(yy)):
    if list(yy)[i] == 4:
        idx_4.append(i)
    if list(yy)[i] == 3:
        idx_3.append(i)
print(len(idx_4), len(idx_3))

In [None]:
idx = idx_4[:150] + idx_3[:150]
idx.sort()

In [None]:
X = []
y = []
for i in idx:
    X.append(X_padded[i])
    y.append(yy[i])
X = np.array(X)
y = np.array(y)
print(X.shape, y.shape)

In [None]:
pd.Series(y).value_counts()

In [None]:
y_hard = [0.00 if x==3 else x for x in y]
y_hard = [1.00 if x==4 else x for x in y_hard]
pd.Series(y_hard).value_counts()

In [None]:
# Cross-validation and training configuration for the Reuters experiments.
from sklearn.model_selection import StratifiedKFold

# Everything Keras-related is taken from the tensorflow.keras namespace so
# that layers, models, optimizers and callbacks all come from the same Keras
# implementation (mixing it with the standalone `keras` package can fail when
# the two installed versions differ).
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, LSTM, GRU
from tensorflow.keras.models import Sequential

# 5-fold stratified CV; the fixed seed makes the fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

epochs = 10          # training epochs per fit
batch = 64           # mini-batch size

embedding_dim = 32   # width of the word-embedding vectors
hidden_units = 32    # units in the recurrent layer

In [None]:
def create_model():
    """Build the uncompiled classifier: Embedding -> LSTM -> one sigmoid unit.

    Layer sizes are read from the module-level ``vocab_size``,
    ``embedding_dim`` and ``hidden_units``. To try another recurrent cell,
    replace ``LSTM`` with ``SimpleRNN`` or ``GRU``.
    """
    layers = [
        Embedding(vocab_size, embedding_dim),
        LSTM(hidden_units),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

In [None]:
# Display the feature matrix of the selected articles.
X

In [None]:
# Wrap the hard labels in a single-column DataFrame (the column is named 0).
y_hard = pd.DataFrame(y_hard)
y_hard

# 2-0. Generating Prob_labels

In [None]:
# Fit a reference model on the hard labels, then keep its predicted
# probability for every sample as that sample's "probability label".
gen_model = create_model()
gen_model.compile(
    loss='BinaryCrossentropy',
    optimizer=optimizers.Adam(learning_rate=0.005),
    metrics=['accuracy'],
)
history = gen_model.fit(
    X, y_hard,
    validation_split=0.2,
    epochs=epochs,
    verbose=0,
    batch_size=batch,
)

# Training-loss curve of the reference model.
bold = {'fontweight': "bold"}
plt.plot(history.history['loss'], label='loss')
plt.ylim([0, 1])
plt.xlabel('Iteration', fontsize=15, **bold)
plt.ylabel('Loss', fontsize=15, **bold)
plt.title("Cost Function", fontsize=20, **bold)
plt.legend()
plt.show()

# Predict on the same X and flatten to one probability per row.
result = gen_model.predict(X, verbose=0)
prob_label = list(result.reshape(len(X)))
y = pd.DataFrame(prob_label)
y

In [None]:
# Flag every sample by comparing the reference model's probability with the
# threshold c:
#   0 = easy sample  (label 0 with prob <= c, or label 1 with prob >= 1 - c)
#   2 = hard sample  (the probability falls on the wrong side of the threshold)
c = 0.5
hard_labels = y_hard[0].to_numpy()
probabilities = y[0].to_numpy()
is_negative = hard_labels == 0
if not np.all(is_negative | (hard_labels == 1)):
    # Any other label would leave y_edge shorter than, and misaligned with, X.
    raise ValueError('hard labels must all be 0 or 1')
is_easy = np.where(is_negative, probabilities <= c, probabilities >= 1 - c)
y_edge = pd.DataFrame(np.where(is_easy, 0, 2))
y_edge.value_counts()

In [None]:
#
edge_list = list(y_edge[y_edge[0] == 2].index)
normal_list = list(y_edge[y_edge[0] == 0].index)
print(edge_list)

In [None]:
r = list(y_edge[0]).count(0)/list(y_edge[0]).count(2)   # normal/edge
alpha = (r-1)/(2*r)
print(r, alpha)

# 2-1. Focal(Hard) and SLS(Hard/alpha)

In [None]:
# Repeated 5-fold comparison: focal loss on hard labels vs. binary
# cross-entropy on smoothed (SLS) labels, scored on all / edge / normal
# test samples.
edge_flags = list(y_edge[0])
r = edge_flags.count(0) / edge_flags.count(2)   # normal/edge
alpha = (r - 1) / (2 * r)
B = [0.00, alpha]   # smoothing amounts to try: none, and the ratio-derived alpha


def _subset_accuracy(model, features, labels):
    """Return the accuracy of the rounded predictions, or NaN for an empty subset."""
    if len(features) == 0:
        return np.nan
    predicted = np.round(model.predict(features, verbose=0))
    return metrics.accuracy_score(labels, predicted)


def _cross_validate(train_labels, loss, learning_rate):
    """Run one stratified K-fold pass and return the fold-averaged accuracies.

    Args:
        train_labels: single-column DataFrame of training targets, row-aligned
            with ``X`` (hard labels or smoothed labels).
        loss: Keras loss identifier passed to ``compile``.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        ``[total, edge, normal]``: mean test accuracy over the folds on every
        test sample, on the edge samples only and on the normal samples only.
        Testing always uses the hard labels in ``y_hard``.
    """
    total_scores, edge_scores, normal_scores = [], [], []
    # Folds are stratified by the edge/normal flag, not by the class label.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y_edge), start=1):
        fold_targets = train_labels.iloc[train_index]
        if fold == 1:
            print(fold_targets.value_counts())

        # Set membership keeps the edge/normal split linear in the fold size.
        in_test = set(test_index.tolist())
        test_edge_list = [index for index in edge_list if index in in_test]
        test_normal_list = [index for index in normal_list if index in in_test]

        X_train = np.array(X[train_index])
        y_train = fold_targets.to_numpy(dtype=float)
        X_test = np.array(X[test_index])
        y_test = y_hard.iloc[test_index].to_numpy(dtype=float)

        # Build a fresh model for every fold. Reusing one model across folds
        # would keep weights already fitted on rows that belong to this fold's
        # test split, leaking test data into training and inflating accuracy.
        model = create_model()
        model.compile(loss=loss, optimizer=optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, verbose=0, batch_size=batch)

        total_scores.append(_subset_accuracy(model, X_test, y_test))
        edge_scores.append(_subset_accuracy(
            model,
            np.array(X[test_edge_list]),
            y_hard.iloc[test_edge_list].to_numpy(dtype=float),
        ))
        normal_scores.append(_subset_accuracy(
            model,
            np.array(X[test_normal_list]),
            y_hard.iloc[test_normal_list].to_numpy(dtype=float),
        ))
    # nanmean skips folds whose edge/normal subset was empty.
    return [np.nanmean(total_scores), np.nanmean(edge_scores), np.nanmean(normal_scores)]


# Easy/hard mask, computed once: a sample is easy when the reference
# probability lies on the correct side of the threshold c for its hard label.
hard_labels = y_hard[0].to_numpy(dtype=float)
probabilities = np.asarray(prob_label)
is_negative = hard_labels == 0
is_easy = np.where(is_negative, probabilities <= c, probabilities >= 1 - c)

for t in range(10):    # 10 times repeat
    res = pd.DataFrame({'Focal':[0, 0, 0]}, index = ['Total','Edge','Normal'])

    # Focal loss, trained on the hard labels.
    print('#'*50,'Focal','#'*50)
    focal_scores = _cross_validate(y_hard, 'BinaryFocalCrossentropy', 0.005)
    res['Focal'] = focal_scores
    print(focal_scores)

    for b in B:
        print('#'*50,'SLS',b,'#'*50)
        # Easy samples get the smoothed target (b for class 0, 1 - b for
        # class 1); hard samples keep their original 0/1 label.
        smoothed = np.where(is_negative, b, 1 - b)
        y_sls = pd.DataFrame(np.where(is_easy, smoothed, hard_labels))

        sls_scores = _cross_validate(y_sls, 'BinaryCrossentropy', 0.005)
        res['SLS({})'.format(b)] = sls_scores
        print(sls_scores)
    # Appended on purpose: the file accumulates one result table per repeat.
    # The name carries a single ".csv" extension, like the other experiments.
    res.to_csv("RNN_RNEWS_5CV(SLS_c0.5)_alphaimproved.csv", mode = 'a', float_format='%.4g')

# 3. IMDB Sentiment Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb

In [None]:
# Vocabulary cap handed to the loader, and the fixed sequence length used
# when padding the reviews.
vocab_size = 1000
max_len = 100

# Load the IMDB reviews (as word-id sequences) and their labels.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [None]:
# Number of training reviews and labels.
print(len(X_train), len(y_train))

In [None]:
# Number of reviews per label.
pd.Series(y_train).value_counts()

In [None]:
# Making lists of index
idx_0 = []
idx_1 = []
for i in range(len(y_train)):
    if list(y_train)[i] == 0:
        idx_0.append(i)
    if list(y_train)[i] == 1:
        idx_1.append(i)
print(len(idx_0), len(idx_1))

In [None]:
idx = idx_0[:150] + idx_1[:150]
idx.sort()
# print(len(idx), idx)

In [None]:
X = []
y = []
for i in idx:
    X.append(X_train[i])
    y.append(y_train[i])
# X = np.array(X)
# y = np.array(y)
# print(X.shape, y.shape)

In [None]:
print(pd.Series(y).value_counts())

In [None]:
word_to_index = imdb.get_word_index()
# print(len(word_to_index), word_to_index)

In [None]:
# padding
X_padded = pad_sequences(X, maxlen = max_len)
X_padded.shape

In [None]:
# Cross-validation and training configuration for the IMDB experiments.
from sklearn.model_selection import StratifiedKFold

# Everything Keras-related is taken from the tensorflow.keras namespace so
# that layers, models, optimizers and callbacks all come from the same Keras
# implementation (mixing it with the standalone `keras` package can fail when
# the two installed versions differ).
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, LSTM, GRU
from tensorflow.keras.models import Sequential

# 5-fold stratified CV; the fixed seed makes the fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

epochs = 10          # training epochs per fit
batch = 64           # mini-batch size

embedding_dim = 32   # width of the word-embedding vectors
hidden_units = 32    # units in the recurrent layer

In [None]:
def create_model():
    """Build the uncompiled classifier: Embedding -> LSTM -> one sigmoid unit.

    Layer sizes are read from the module-level ``vocab_size``,
    ``embedding_dim`` and ``hidden_units``. To try another recurrent cell,
    replace ``LSTM`` with ``SimpleRNN`` or ``GRU``.
    """
    layers = [
        Embedding(vocab_size, embedding_dim),
        LSTM(hidden_units),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

In [None]:
X = X_padded
X

In [None]:
y_hard = pd.DataFrame(y_hard)
y_hard

# 3-0. Generating Prob_labels

In [None]:
# Fit a reference model on the hard labels, then keep its predicted
# probability for every sample as that sample's "probability label".
gen_model = create_model()
gen_model.compile(
    loss='BinaryCrossentropy',
    optimizer=optimizers.Adam(learning_rate=0.005),
    metrics=['accuracy'],
)
history = gen_model.fit(
    X, y_hard,
    validation_split=0.2,
    epochs=epochs,
    verbose=0,
    batch_size=batch,
)

# Training-loss curve of the reference model.
bold = {'fontweight': "bold"}
plt.plot(history.history['loss'], label='loss')
plt.ylim([0, 1])
plt.xlabel('Iteration', fontsize=15, **bold)
plt.ylabel('Loss', fontsize=15, **bold)
plt.title("Cost Function", fontsize=20, **bold)
plt.legend()
plt.show()

# Predict on the same X and flatten to one probability per row.
result = gen_model.predict(X, verbose=0)
prob_label = list(result.reshape(len(X)))
y = pd.DataFrame(prob_label)
y

In [None]:
# Flag every sample by comparing the reference model's probability with the
# threshold c:
#   0 = easy sample  (label 0 with prob <= c, or label 1 with prob >= 1 - c)
#   2 = hard sample  (the probability falls on the wrong side of the threshold)
c = 0.5
hard_labels = y_hard[0].to_numpy()
probabilities = y[0].to_numpy()
is_negative = hard_labels == 0
if not np.all(is_negative | (hard_labels == 1)):
    # Any other label would leave y_edge shorter than, and misaligned with, X.
    raise ValueError('hard labels must all be 0 or 1')
is_easy = np.where(is_negative, probabilities <= c, probabilities >= 1 - c)
y_edge = pd.DataFrame(np.where(is_easy, 0, 2))
y_edge.value_counts()

In [None]:
#
edge_list = list(y_edge[y_edge[0] == 2].index)
normal_list = list(y_edge[y_edge[0] == 0].index)
print(edge_list)

In [None]:
r = list(y_edge[0]).count(0)/list(y_edge[0]).count(2)   # normal/edge
alpha = (r-1)/(2*r)
print(r, alpha)

# 3-1. Focal(Hard) and SLS(Hard/alpha)

In [None]:
# Repeated 5-fold comparison: focal loss on hard labels vs. binary
# cross-entropy on smoothed (SLS) labels, scored on all / edge / normal
# test samples.
edge_flags = list(y_edge[0])
r = edge_flags.count(0) / edge_flags.count(2)   # normal/edge
alpha = (r - 1) / (2 * r)
B = [0.00, alpha]   # smoothing amounts to try: none, and the ratio-derived alpha


def _subset_accuracy(model, features, labels):
    """Return the accuracy of the rounded predictions, or NaN for an empty subset."""
    if len(features) == 0:
        return np.nan
    predicted = np.round(model.predict(features, verbose=0))
    return metrics.accuracy_score(labels, predicted)


def _cross_validate(train_labels, loss, learning_rate):
    """Run one stratified K-fold pass and return the fold-averaged accuracies.

    Args:
        train_labels: single-column DataFrame of training targets, row-aligned
            with ``X`` (hard labels or smoothed labels).
        loss: Keras loss identifier passed to ``compile``.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        ``[total, edge, normal]``: mean test accuracy over the folds on every
        test sample, on the edge samples only and on the normal samples only.
        Testing always uses the hard labels in ``y_hard``.
    """
    total_scores, edge_scores, normal_scores = [], [], []
    # Folds are stratified by the edge/normal flag, not by the class label.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y_edge), start=1):
        fold_targets = train_labels.iloc[train_index]
        if fold == 1:
            print(fold_targets.value_counts())

        # Set membership keeps the edge/normal split linear in the fold size.
        in_test = set(test_index.tolist())
        test_edge_list = [index for index in edge_list if index in in_test]
        test_normal_list = [index for index in normal_list if index in in_test]

        X_train = np.array(X[train_index])
        y_train = fold_targets.to_numpy(dtype=float)
        X_test = np.array(X[test_index])
        y_test = y_hard.iloc[test_index].to_numpy(dtype=float)

        # Build a fresh model for every fold. Reusing one model across folds
        # would keep weights already fitted on rows that belong to this fold's
        # test split, leaking test data into training and inflating accuracy.
        model = create_model()
        model.compile(loss=loss, optimizer=optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, verbose=0, batch_size=batch)

        total_scores.append(_subset_accuracy(model, X_test, y_test))
        edge_scores.append(_subset_accuracy(
            model,
            np.array(X[test_edge_list]),
            y_hard.iloc[test_edge_list].to_numpy(dtype=float),
        ))
        normal_scores.append(_subset_accuracy(
            model,
            np.array(X[test_normal_list]),
            y_hard.iloc[test_normal_list].to_numpy(dtype=float),
        ))
    # nanmean skips folds whose edge/normal subset was empty.
    return [np.nanmean(total_scores), np.nanmean(edge_scores), np.nanmean(normal_scores)]


# Easy/hard mask, computed once: a sample is easy when the reference
# probability lies on the correct side of the threshold c for its hard label.
hard_labels = y_hard[0].to_numpy(dtype=float)
probabilities = np.asarray(prob_label)
is_negative = hard_labels == 0
is_easy = np.where(is_negative, probabilities <= c, probabilities >= 1 - c)

for t in range(10):    # 10 times repeat
    res = pd.DataFrame({'Focal':[0, 0, 0]}, index = ['Total','Edge','Normal'])

    # Focal loss, trained on the hard labels.
    print('#'*50,'Focal','#'*50)
    focal_scores = _cross_validate(y_hard, 'BinaryFocalCrossentropy', 0.005)
    res['Focal'] = focal_scores
    print(focal_scores)

    for b in B:
        print('#'*50,'SLS',b,'#'*50)
        # Easy samples get the smoothed target (b for class 0, 1 - b for
        # class 1); hard samples keep their original 0/1 label.
        smoothed = np.where(is_negative, b, 1 - b)
        y_sls = pd.DataFrame(np.where(is_easy, smoothed, hard_labels))

        sls_scores = _cross_validate(y_sls, 'BinaryCrossentropy', 0.005)
        res['SLS({})'.format(b)] = sls_scores
        print(sls_scores)
    # Appended on purpose: the file accumulates one result table per repeat.
    res.to_csv("RNN_IMDB_5CV(SLS_c0.5)_alphaimproved.csv", mode = 'a', float_format='%.4g')