In [1]:
import numpy as np
import pandas as pd
import spacy

from statistics import mean

from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.data import Sentence

from sklearn import preprocessing

from keras.layers import Input, Dense, GRU, LSTM, Bidirectional, Flatten
from keras.optimizers import Adam
from keras.models import Model

In [2]:
# Load the two corpora and discard the 'Unnamed: 0' column each CSV carries.
df1 = pd.read_csv('./data/Data_Modified.csv').drop(columns='Unnamed: 0')
df2 = pd.read_csv('./data/Data2_Modified.csv').drop(columns='Unnamed: 0')

# Merge, shuffle reproducibly (fixed seed) and renumber the rows 0..n-1 so
# that positional slices and index labels coincide further down.
frames = [df1, df2]
df = (
    pd.concat(frames)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Integer-encode the labels: sp -> 0, ss -> 1, sx -> 2.
le = preprocessing.LabelEncoder()
df['Coding_Modified'] = le.fit_transform(df['Coding'])

# First 90 shuffled rows train the model; the next 10 are held out.
df_train = df.iloc[:90]
df_test = df.iloc[90:100]
df_test

Unnamed: 0,Text,Coding,Coding_Modified
90,緊張 する と 呼吸 が 自然 に 早く なっ て たくさん な 考え が 頭 の 中 に ...,sp,0
91,目 を 閉じる と すぐ 眠く なっ て 色んな こと を み て なん か 疲れ が 感じ...,sp,0
92,メタ 認知 的 な 感じ です ね 自分 が 考え て いる ん だ と 分かっ て いる ...,sx,2
93,確か に 食べる の が 早 すぎ まし た 胃 が 重く て しんどかっ た です,sp,0
94,私 は データ 系 の 会社 に 通っ て ます,ss,1
95,怒り と 悲し み が 伴っ て いる こと に 気づき まし た,sx,2
96,戻っ て き たら 先 妄想 を し て い た と 意識 し まし た また か と いう...,sp,0
97,今 気づき まし た,sp,0
98,人々 が 私 の 目 の 前 に 行き かっ て 何 の 関与 も せ ず に 距離 を と...,sx,2
99,私 は 自分 の 感情 に 小さな 個性 を 持た せ たり と か し た こと が あり ます,sx,2


In [3]:
# This part should never be run again!!!
#delete_pos = ["PUNCT", "SPACE", "SYM"]

#nlp = spacy.load("ja_core_news_sm")

#df2 = pd.read_csv('./data/Data2.csv')

#def clean_text(sentence):
#    '''Clean all irrelevant tokens in the input sentence'''
#    doc = nlp(sentence)
#    word_list = [str(token) for token in doc if token.pos_ not in delete_pos]
#    return ' '.join(word_list)

#df2["Text"] = df2["Text"].apply(clean_text)
#del df2['Unnamed: 2']
#df2.to_csv("./data/Data2_Modified.csv")


In [9]:
# Pretrained Japanese embeddings: one word-level model plus a forward and a
# backward Flair model, combined into a single stacked embedding that the
# batch generators below use to embed each sentence.
ja_embedding = WordEmbeddings('ja-crawl')
ja_forward_embedding, ja_backward_embedding = (
    FlairEmbeddings(model_name) for model_name in ('ja-forward', 'ja-backward')
)

stacked_embedding = StackedEmbeddings(
    [ja_embedding, ja_forward_embedding, ja_backward_embedding]
)

In [5]:
def generateTrainingData(dataset, batch_size, max_length, num_classes, emb_size):
    """Endlessly yield ``(x, y)`` training batches of embedded sentences.

    Every pass reshuffles ``dataset`` and embeds each row's ``Text`` with the
    module-level ``stacked_embedding``. Each sentence is truncated or
    zero-padded to exactly ``max_length`` token vectors. The batch buffers are
    not reset between passes, so a batch may straddle two passes when
    ``len(dataset)`` is not a multiple of ``batch_size``.

    Args:
        dataset: DataFrame with a ``Text`` column (space-separated tokens) and
            an integer ``Coding_Modified`` column holding the class index.
        batch_size: Number of sentences per yielded batch (>= 1).
        max_length: Number of token positions per sentence (>= 1).
        num_classes: Width of the one-hot label vector.
        emb_size: Length of a single token embedding vector.

    Yields:
        Tuple ``(x, y)`` where ``x`` is a float32 array of shape
        ``(batch_size, max_length, emb_size)`` and ``y`` is a one-hot array of
        shape ``(batch_size, num_classes)``.

    Raises:
        ValueError: If ``dataset`` has no rows or ``batch_size`` /
            ``max_length`` is smaller than 1. Without this check the loop
            below would spin forever without ever yielding a batch.
    """
    if batch_size < 1 or max_length < 1:
        raise ValueError("batch_size and max_length must both be >= 1")
    if len(dataset) == 0:
        raise ValueError("dataset is empty; there is nothing to train on")

    x_batch = []
    y_batch = []
    while True:
        shuffled = dataset.sample(frac=1)
        for _, row in shuffled.iterrows():
            sentence = Sentence(row['Text'])
            stacked_embedding.embed(sentence)

            # Preallocating a float32 buffer keeps the dtype identical for
            # padded and unpadded sentences, and makes a wrong emb_size fail
            # loudly on assignment instead of producing a ragged array.
            x = np.zeros((max_length, emb_size), dtype=np.float32)
            for position, token in enumerate(sentence):
                if position == max_length:
                    break  # truncate sentences longer than max_length
                x[position] = token.embedding.cpu().detach().numpy()

            y = np.zeros(num_classes)
            y[int(row["Coding_Modified"])] = 1

            x_batch.append(x)
            y_batch.append(y)

            if len(y_batch) == batch_size:
                yield np.array(x_batch), np.array(y_batch)
                x_batch = []
                y_batch = []

In [6]:
def declare_model(dataset, batch_size, max_len, emb_size, gru_size, num_classes):
    """Build and compile a bidirectional-GRU sentence classifier.

    The network reads a fixed-size batch of embedded sentences, runs a
    bidirectional GRU over the token axis, flattens the per-token outputs and
    maps them to ``num_classes`` class probabilities.

    Args:
        dataset: Unused; kept so existing callers that pass it keep working.
        batch_size: Fixed batch dimension baked into the input layer.
        max_len: Number of token positions per sentence.
        emb_size: Length of a single token embedding vector.
        gru_size: Number of units in each GRU direction.
        num_classes: Number of output classes.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    sample = Input(batch_shape=(batch_size, max_len, emb_size))
    gru_out = Bidirectional(GRU(gru_size, return_sequences=True))(sample)
    gru_out = Flatten()(gru_out)
    # Each sentence has exactly one label (one-hot targets), so the output
    # must be a probability distribution over classes: softmax, not
    # independent per-class sigmoids, is what categorical cross-entropy expects.
    predictions = Dense(num_classes, activation='softmax')(gru_out)

    model = Model(inputs=sample, outputs=[predictions])
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=["acc"])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only adds a stray "None" line to the output.
    model.summary()

    return model

In [None]:
# Settings that the model and the batch generator must agree on.
# emb_size=4396 is presumably the width of one stacked token embedding -- TODO confirm.
shared_kwargs = dict(dataset=df_train, batch_size=1, num_classes=3, emb_size=4396)

m = declare_model(max_len=17, gru_size=20, **shared_kwargs)
gen = generateTrainingData(max_length=17, **shared_kwargs)

# With batch_size=1, 90 steps per epoch draws 90 sentences from the generator.
m.fit(gen, steps_per_epoch=90, epochs=10, max_queue_size=10, workers=1)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(1, 17, 4396)]           0         
_________________________________________________________________
bidirectional (Bidirectional (1, 17, 40)               530160    
_________________________________________________________________
flatten (Flatten)            (1, 680)                  0         
_________________________________________________________________
dense (Dense)                (1, 3)                    2043      
Total params: 532,203
Trainable params: 532,203
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10

In [None]:
def generatePredictionData(dataset, batch_size, max_length, num_classes, emb_size):
    """Endlessly yield batches of embedded sentences for prediction.

    The texts are embedded in their given order with the module-level
    ``stacked_embedding`` and cycled indefinitely. Each sentence is truncated
    or zero-padded to exactly ``max_length`` token vectors.

    Args:
        dataset: Iterable of sentence strings (space-separated tokens). It is
            materialised into a list once so that it can be cycled.
        batch_size: Number of sentences per yielded batch (>= 1).
        max_length: Number of token positions per sentence (>= 1).
        num_classes: Unused; kept so the signature mirrors the training
            generator and existing callers keep working.
        emb_size: Length of a single token embedding vector.

    Yields:
        float32 array of shape ``(batch_size, max_length, emb_size)``.

    Raises:
        ValueError: If ``dataset`` is empty or ``batch_size`` / ``max_length``
            is smaller than 1. Without this check the loop below would spin
            forever without ever yielding a batch.
    """
    if batch_size < 1 or max_length < 1:
        raise ValueError("batch_size and max_length must both be >= 1")
    texts = list(dataset)
    if not texts:
        raise ValueError("dataset is empty; there is nothing to embed")

    x_batch = []
    while True:
        for text in texts:
            sentence = Sentence(text)
            stacked_embedding.embed(sentence)

            # Preallocating a float32 buffer keeps the dtype identical for
            # padded and unpadded sentences, and makes a wrong emb_size fail
            # loudly on assignment instead of producing a ragged array.
            x = np.zeros((max_length, emb_size), dtype=np.float32)
            for position, token in enumerate(sentence):
                if position == max_length:
                    break  # truncate sentences longer than max_length
                x[position] = token.embedding.cpu().detach().numpy()

            x_batch.append(x)
            if len(x_batch) == batch_size:
                yield np.array(x_batch)
                x_batch = []

In [None]:
#test_sentences = [
#    '私深く考えるタイプじゃないです',
#    '今へたへたです',
#    '言葉遣いは丁寧であるように厳しく言われてきました'
#]
#nlp = spacy.load("ja_core_news_sm")
#test_sentences_modified = []
##stacked_embedding.embed(sentence)
#for i in range(3):
#    stuff = []
#    sentence = nlp(test_sentences[i])
#    for token in sentence:
#        stuff.append(str(token))
#    test_sentences_modified.append(' '.join(stuff))   

In [None]:
# Take the texts and their encoded labels straight from df_test instead of
# looking rows up by the hard-coded index labels 90..99, which duplicated the
# split boundaries and raised KeyError as soon as the split changed.
test_sentences = df_test['Text'].tolist()
test_results = df_test['Coding_Modified'].tolist()
test_sentences

In [None]:
# Size the prediction run and the accuracy denominator from the test set
# itself rather than a hard-coded 10, so the reported accuracy stays correct
# if the held-out split changes.
n_test = len(test_sentences)

gen = generatePredictionData(dataset=test_sentences, batch_size=1, max_length=17, num_classes=3, emb_size=4396)
# batch_size is 1, so one prediction step is needed per test sentence.
predict = np.argmax(m.predict(gen, steps=n_test), axis=1)

# acc holds the number of correct predictions; the ratio is printed below.
acc = sum(1 for a, b in zip(predict, test_results) if a == b)
print(predict)
print(acc / n_test)