In [None]:

import ast
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd
from tqdm import tqdm

import tensorflow as tf

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, concatenate
from keras.models import Model
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras import initializers, regularizers, constraints, optimizers, layers

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics

from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Klassifisering av tekster med et LSTM-nettverk


### Hente fritekstsvarene fra Toppoppgaver og de annoterte tekstene

In [None]:
def convert_to_matrix(df):
    """Expand the list-valued ``label`` column into one 0/1 column per label.

    The frame is modified in place: one integer indicator column is added for
    every distinct label found in ``df["label"]``, and the raw survey columns
    listed at the bottom of the function (including ``label`` itself) are
    dropped.

    Args:
        df: DataFrame with a ``label`` column whose cells are iterables of
            label names, plus the survey columns that are dropped below.

    Returns:
        The same DataFrame object, now holding the indicator columns.

    Raises:
        KeyError: If ``label`` or one of the columns to drop is missing.
    """
    label_lists = df["label"].tolist()

    # Sort the label names so the column order (and with it the order of the
    # model outputs built from these columns) is identical in every session;
    # plain set iteration order changes between interpreter runs.
    label_names = sorted({name for names in label_lists for name in names}, key=str)

    # One indicator column per label: 1 where the row carries the label.
    for name in label_names:
        df[name] = [int(name in names) for names in label_lists]

    df.drop(["Date Submitted","Hva kom du hit for å gjøre", "raw_text","label","Hvor lang tid brukte du?", "Fikk du gjort det du kom hit for å gjøre"], axis = 1, inplace = True)
    return df

In [None]:
data_path = "../hjemmesnekk/labeled_data/"

# Access to these CSV files requires access to our Kubeflow server.
csv_path = "toppoppgaver.csv"
labeled_csv_path = "toppoppgaver_NYESTE.csv"

# Annotated answers: parse the stringified label lists, expand them into one
# indicator column per label and key the rows by the original row number.
labels = pd.read_csv(data_path + labeled_csv_path)
labels['label'] = labels['label'].apply(ast.literal_eval)
labels = convert_to_matrix(labels)
labels = labels.set_index("index")

# Raw answers: keep only the annotated rows and order both frames the same
# way so that texts and label rows line up.
df = pd.read_csv(data_path + csv_path)
df = df.loc[labels.index.values, :]
df.sort_index(axis = 0, inplace = True)
labels.sort_index(axis = 0, inplace = True)

# Lemmatise and naively correct spelling mistakes.
# NOTE(review): `lemmatisering` is not defined or imported in any visible
# cell - confirm where it comes from before running on a fresh kernel.
df["raw_text"] = df["raw_text"].apply(lemmatisering)

In [None]:
# Targets: every column of the label frame is one class.
classes = labels.columns.values
y = labels.loc[:, classes].values

# Inputs: the preprocessed free-text answers.
texts = df["raw_text"]

# Split into training and test sets (80/20, fixed seed).
texts_train, texts_test, y_train, y_test = train_test_split(
    texts,
    y,
    test_size=0.2,
    random_state=10,
)

### Tokeniserer og lager embeddingmatrise

GloVe-modellen er trent opp på tekster fra skriv-til-oss og fra Hotjar, og er tilgjengelig på Kubeflow.

In [None]:
# Tokenizer / sequence settings
vocab_size = 500  # passed to the tokenizer as num_words
maxlen = 100      # every token sequence is padded/truncated to this length

# Embedding settings
embed_dim = 100   # width of each embedding vector

In [None]:
# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(texts_train))

# Texts -> integer sequences, for both splits.
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(split) for split in (texts_train, texts_test)
)

# Integer sequences -> fixed-length matrices of width `maxlen`.
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_train, tokenized_test)
)


In [None]:
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: The token.
        *arr: The vector components (anything ``np.asarray`` can cast to
            float32, e.g. numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the pre-trained vectors: each non-empty line is "<word> <v1> <v2> ...".
# NOTE(review): `glove_path` is not defined in any visible cell - it must be
# set before this cell can run on a fresh kernel.
with open(glove_path+'vectors_100d.txt') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split())
        for line in glove_file
        if line.strip()  # skip blank lines, which would leave get_coefs without a word
    )

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
# Mean and standard deviation over all vector components (scalars).
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [None]:
word_index = tokenizer.word_index

# Tokenizer indices start at 1 (0 is the padding value) and only indices
# below `vocab_size` are emitted, so the largest index that can occur is
# min(vocab_size - 1, len(word_index)). The matrix needs one more row than
# that; without the "+ 1" a vocabulary smaller than `vocab_size` would
# produce an index outside the embedding layer.
nb_words = min(vocab_size, len(word_index) + 1)

# Start from random vectors drawn with the same mean/std as the pre-trained
# embeddings, so words missing from the pre-trained set get a plausible vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Bygge LSTM-nettverket og trene det

In [None]:
def create_LSTM_text_model(maxlen = maxlen, embed_dim = embed_dim, nb_word = nb_words, linear_layers = 3, linear_nodes = 200, linear_dropout = 0.1):
    """Build and compile the bidirectional-LSTM multi-label text classifier.

    Architecture: embedding (initialised from the global ``embedding_matrix``)
    -> bidirectional LSTM -> global max pooling -> ``linear_layers`` blocks of
    Dense(relu) + Dropout -> sigmoid output with one unit per entry in the
    global ``classes``.

    Args:
        maxlen: Length of the input token sequences.
        embed_dim: Width of each embedding vector.
        nb_word: Number of rows in the embedding layer. Together with
            ``embed_dim`` it has to match the shape of ``embedding_matrix``,
            which supplies the initial weights.
        linear_layers: Number of Dense + Dropout blocks after pooling.
        linear_nodes: Units in each of those Dense layers.
        linear_dropout: Dropout rate applied after each of those Dense layers.

    Returns:
        The compiled Keras ``Model`` (named ``"lstm"``).
    """
    inp = Input(shape = (maxlen,))
    # Use the `nb_word` argument here; previously the global `nb_words` was
    # read instead, so the parameter had no effect.
    x = Embedding(nb_word, embed_dim, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    # range() is empty for linear_layers <= 0, so no extra guard is needed.
    for _ in range(linear_layers):
        x = Dense(linear_nodes, activation="relu")(x)
        x = Dropout(linear_dropout)(x)
    # Sigmoid + binary cross-entropy: each class is predicted independently.
    x = Dense(len(classes), activation = "sigmoid")(x)
    model = Model(inputs=inp, outputs=x, name = "lstm")
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=[tf.keras.metrics.TopKCategoricalAccuracy(k=3)])
    return model

model = create_LSTM_text_model()
model.summary()

In [None]:
# Early stopping: end training once "loss" (the training loss) has gone
# 5 epochs without improving.
# NOTE(review): 10 % of the training data is held out via validation_split,
# yet the callback watches the training loss - confirm whether "val_loss"
# was intended.
cbs = [tf.keras.callbacks.EarlyStopping(patience=5, monitor="loss")]
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=16,
    validation_split=0.1,
    callbacks=cbs,
    verbose=2,
);


In [None]:
# Save the model, then reload it and evaluate the reloaded copy as a round-trip check.
from keras.models import model_from_json

model_path = '../models/NN/'
architecture_file = model_path + "model_80f.json"
weights_file = model_path + "model_80f.h5"

# Make sure the target directory exists so saving works on a fresh checkout.
os.makedirs(model_path, exist_ok=True)

# Architecture goes to JSON ...
model_json = model.to_json()
with open(architecture_file, "w") as json_file:
    json_file.write(model_json)
# ... and the weights are serialized to HDF5.
model.save_weights(weights_file)
print("Saved model to disk")

# Rebuild the model from the two files just written.
with open(architecture_file, "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(weights_file)
print("Loaded model from disk")

# Evaluate the loaded model on the test data.
# NOTE(review): the metric uses k=4 here, whereas create_LSTM_text_model
# compiles with k=3 - the two accuracies are not directly comparable; confirm
# which k is intended.
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.TopKCategoricalAccuracy(k=4)])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

### Metrics

In [None]:
# Predicted probabilities for the test set, one column per class.
y_predict = model.predict(X_test, batch_size=1024, verbose=1)

# classification_report needs hard 0/1 labels, so binarise at 0.5 here;
# without this line `y_pred` does not exist yet when the cell runs top to bottom.
y_pred = (y_predict > 0.5).astype(int)
print(metrics.classification_report(y_test, y_pred))
# Classes with a score of zero are included in this report, which lowers the
# average considerably; therefore we compute it ourselves with those classes removed.

In [None]:
def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, giving 0.0 wherever the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(numerator, denominator, out=np.zeros_like(numerator), where=denominator != 0)


# Binarise the predicted probabilities at 0.5.
y_pred = (y_predict > 0.5).astype(int)

# One 2x2 confusion matrix per class, laid out as [[tn, fp], [fn, tp]].
mcm = multilabel_confusion_matrix(y_test, y_pred, samplewise=False)
tn = mcm[:, 0, 0]
tp = mcm[:, 1, 1]
fn = mcm[:, 1, 0]
fp = mcm[:, 0, 1]

recall = _safe_ratio(tp, tp + fn)
# Precision is tp / (tp + fp); tn / (tn + fp) would be specificity instead.
precision = _safe_ratio(tp, tp + fp)
f1_score = _safe_ratio(2 * precision * recall, precision + recall)

# Number of positive test examples per class (0 for classes absent from the test set).
antall = (y_test == 1).sum(axis=0)

df_f1score = pd.DataFrame(
    {
        "Recall": np.round(recall, 2),
        "Precision": np.round(precision, 2),
        "Antall": antall,
        "f1_score": f1_score,
    },
    index=classes,
)

# Mean F1 over the classes that scored above zero.
s = [score for score in df_f1score.f1_score if score > 0]
print(sum(s) / len(s) if s else 0.0)
df_f1score

### Klassifisere nye tekster

In [None]:

# Reload the full answer file and run it through the same preprocessing,
# tokenizer and padding as the training data.
# NOTE(review): this rebinds `df`, which earlier held only the annotated rows.
df = pd.read_csv(data_path + csv_path)
# NOTE(review): `lemmatisering` is not defined or imported in any visible cell.
df["raw_text"] = df["raw_text"].apply(lemmatisering)
texts_to_predict = df["raw_text"].values
# Show a short preview only: the full array of free-text answers floods the
# output and may contain personal information.
print(texts_to_predict[:5])
tokenized_texts = tokenizer.texts_to_sequences(texts_to_predict)
X = pad_sequences(tokenized_texts, maxlen=maxlen)
predicted = model.predict(X)

In [None]:
threshold = 0.7

# Compare into a separate mask instead of overwriting `predicted`, so the
# probabilities stay available and the cell gives the right answer when it is
# re-run with a different threshold.
above_threshold = predicted > threshold

# For every text, collect the names of the classes whose probability exceeds the threshold.
predicted_labels = [
    [classes[j] for j in np.flatnonzero(row)]
    for row in above_threshold
]
df["Prediction"] = predicted_labels
df.to_pickle("../data/pickle/predicted_toppoppgaver/pred.pkl")