In [1]:
import os, time, random
import tensorflow as tf
import pandas as pd
import numpy as np
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Directory with the input files; the functions below read them with pd.read_hdf.
data_path = "../feature_and_vector_seq"
# Snapshot of the directory listing taken when this cell runs.
file_list = os.listdir(data_path)
# Bare expression: the notebook displays the number of files as the cell output.
len(file_list)

Using TensorFlow backend.


13157

In [2]:
def substitute_cyrillic(data_path="../feature_and_vector_seq"):
    """Transliterate Cyrillic surname fragments in the names of unreadable files.

    Every file in ``data_path`` is opened with ``pd.read_hdf``. When the read
    (or the access to the 'Intent analysis' column) fails, each known Cyrillic
    fragment in the *file name* is replaced by its Latin spelling and the file
    is renamed on disk. A renamed file is not re-read during the same pass;
    call the function again to load it.

    Args:
        data_path: Directory to scan. Defaults to the notebook's data folder.

    Returns:
        Tuple ``(files read successfully, files listed)``.
    """
    # (Cyrillic fragment, Latin replacement). Both capitalisations of the
    # first surname map to the same Latin spelling.
    substitutions = (
        ('-Шарова', '-Sharova'),
        ('-шарова', '-Sharova'),
        ('-Набокова', '-Nabokova'),
        ('-Ким', '-Kim'),
        ('-Плотникова', '-Plotnikova'),
        ('-Тимофеева', '-Timofeeva'),
    )
    dialogs_list = []
    file_list = os.listdir(data_path)
    for file_name in file_list:
        file_and_path = os.path.join(data_path, file_name)
        try:
            dialogs_list.append(pd.read_hdf(file_and_path)['Intent analysis'].values)
        except Exception as e:
            # Deliberately broad: any failure to load is treated as a hint that
            # the file name may need transliteration. The error is reported so
            # unrelated problems stay visible.
            print('file_and_path', file_and_path, repr(e))

            # Apply every substitution to the file name only (never to the
            # directory part) and rename once, so a name containing several
            # fragments is handled in a single os.rename call.
            new_file_name = file_name
            for cyrillic, latin in substitutions:
                new_file_name = new_file_name.replace(cyrillic, latin)
            if new_file_name == file_name:
                continue

            new_file_and_path = os.path.join(data_path, new_file_name)
            if os.path.exists(new_file_and_path):
                # Never overwrite an existing file with the renamed one.
                print('rename skipped, target exists', new_file_and_path)
                continue
            os.rename(file_and_path, new_file_and_path)

    return len(dialogs_list), len(file_list)

In [3]:
class SequenceGenerator():
    """Yield (inputs, one-hot labels) batches built from per-file intent sequences.

    The files under ``data_path`` are split once into a training and a
    validation list. Each file is converted to a list of integer intent ids;
    sequences longer than ``max_sequence_length`` are cut into overlapping
    windows. After padding, the last id of every window is the label and the
    preceding ids are the model input.
    """

    def __init__(self, data_path, intent_index, max_sequence_length, validation_split):
        """Store the configuration and split the directory listing.

        Args:
            data_path: Directory containing the HDF5 files.
            intent_index: Mapping from intent character to integer id.
            max_sequence_length: Window length, label position included.
            validation_split: Passed to ``train_test_split`` as ``test_size``.
        """
        self.data_path = data_path
        self.max_sequence_length = max_sequence_length
        self.intent_index = intent_index
        # Ids are used as class indices, so the class count is the largest id + 1.
        self.num_intents = max(intent_index.values()) + 1
        self.file_list_train, self.file_list_test = self._split(validation_split)

    def _file2sequence(self, file_and_path):
        """Read one file and map its 'Intent analysis' column to integer ids.

        Only the first character of each value (lower-cased) is looked up in
        ``intent_index``. Empty or non-string values (e.g. NaN/None for a
        missing label) are looked up as ``""``.

        Raises:
            KeyError: If a character is not present in ``intent_index``.
        """
        sequence = []
        # NOTE(review): engine/encoding are forwarded unchanged from the
        # original call; confirm they are still needed by the pandas version in use.
        for intent in pd.read_hdf(file_and_path, engine="python", encoding='cp1251')['Intent analysis']:
            if isinstance(intent, str) and intent:
                intent_char = intent[0].lower()
            else:
                intent_char = ""
            sequence.append(self.intent_index[intent_char])
        return sequence

    def _split(self, validation_split, random_state = None):
        """Split the directory listing into (train files, validation files)."""
        # os.listdir returns entries in arbitrary order; sorting first makes the
        # split reproducible whenever a random_state is supplied.
        file_list = sorted(os.listdir(self.data_path))
        file_list_train, file_list_test = train_test_split(
            file_list, test_size=validation_split, random_state=random_state)
        return file_list_train, file_list_test

    def __build_intent_sequence(self, dialogs_list):
        """Convert already-loaded dialogs to a padded id matrix.

        Not called anywhere in this class. Unlike ``_file2sequence`` it looks
        up the raw column value, not its lower-cased first character.
        """
        sequence_list = []
        for dialog in dialogs_list:
            sequence = []
            for intent in dialog['Intent analysis'].values:
                sequence.append(self.intent_index[intent])
            sequence_list.append(sequence)
        padded_sequences = pad_sequences(sequence_list, maxlen=self.max_sequence_length)
        return padded_sequences

    def _windows(self, sequence):
        """Return every ``max_sequence_length``-long sliding window of ``sequence``.

        A sequence that is not longer than the window is returned unchanged as
        the only element (it is padded later).
        """
        extra = len(sequence) - self.max_sequence_length
        if extra <= 0:
            return [sequence]
        return [sequence[start:start + self.max_sequence_length]
                for start in range(extra + 1)]

    def _iter_batches(self, file_list, batch_size):
        """Endless generator behind ``generate_batch``; arguments are pre-validated."""
        f_i = 0
        # Windows produced by a long dialog that did not fit into the current
        # batch are carried over, so every batch has exactly batch_size rows.
        pending = []
        while True:
            while len(pending) < batch_size:
                if f_i == len(file_list):
                    # Full pass finished: start over in a new order. The shuffle
                    # is in place, i.e. it reorders the list stored on self.
                    f_i = 0
                    random.shuffle(file_list)
                file_and_path = os.path.join(self.data_path, file_list[f_i])
                pending.extend(self._windows(self._file2sequence(file_and_path)))
                f_i += 1
            sequence_batch, pending = pending[:batch_size], pending[batch_size:]
            padded_sequences = pad_sequences(sequence_batch, maxlen=self.max_sequence_length)
            # Last column is the target; everything before it is the input.
            matrix = padded_sequences[:, :-1]
            labels = to_categorical(padded_sequences[:, -1:], num_classes=self.num_intents)
            yield matrix, labels

    def generate_batch(self, batch_size, subset='training'):
        """Return an endless generator of ``(matrix, labels)`` batches.

        Args:
            batch_size: Number of rows in every yielded batch (must be >= 1).
            subset: ``'training'`` or ``'validation'``.

        Returns:
            Generator yielding ``(matrix, labels)`` where ``matrix`` holds the
            first ``max_sequence_length - 1`` ids of each window and ``labels``
            is the one-hot encoding of the last id.

        Raises:
            ValueError: For an unknown ``subset``, a non-positive
                ``batch_size`` or an empty file list. Raised immediately, not
                on the first ``next()``.
        """
        if subset == 'training':
            file_list = self.file_list_train
        elif subset == 'validation':
            file_list = self.file_list_test
        else:
            raise ValueError(
                "subset must be 'training' or 'validation', got {!r}".format(subset))
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1, got {!r}".format(batch_size))
        if not file_list:
            raise ValueError("no files available for subset {!r}".format(subset))
        return self._iter_batches(file_list, batch_size)

class PredictIntent():
    """Small 1-D CNN that predicts the next intent id from the preceding ones.

    The model input is ``max_sequence_length - 1`` intent ids; the output is a
    softmax over ``num_intents`` classes.
    """

    # Intent character -> class id. Besides the Cyrillic letters the table
    # contains look-alike aliases: Latin 'a' and "e" and the digit "3" share
    # the id of the Cyrillic letter they resemble. Empty and blank map to 0.
    intent_index = {"":0, " ":0, "а":1, 'a':1, "б":2, "в":3, "г":4, "д":5,
                "е":6, "e":6,  "ж":7,  "з":8,  "3":8,  "и":9,  "к":10,
                "л":11, "м":12, "н":13, "о":14, "п":15,
                "р":16, "с":17, "т":18, "у":19, "ф":20,
                "х":21, "ц":22, "ч":23, "ш":24, "щ":25}
    batch_size = 131
    # Window length including the label position, so the model sees one id fewer.
    max_sequence_length = 5
    num_intents = max(intent_index.values()) + 1
    intent_embedding_dim = 10
    data_path = "../feature_and_vector_seq"


    def build_CNN_model(self):
        """Build and compile the network, store it on ``self.model`` and return it."""
        embedding_layer = Embedding(self.num_intents,
                            self.intent_embedding_dim,
                            input_length=self.max_sequence_length - 1,
                            trainable=True)

        sequence_input = Input(shape=(self.max_sequence_length - 1,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Conv1D(128, 2, activation='relu')(embedded_sequences)
        # Pool size 1: kept from the original architecture.
        x = MaxPooling1D(1)(x)
        x = Conv1D(128, 2, activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(self.num_intents, activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
        self.model = model
        return self.model

    @staticmethod
    def _steps(num_files, batch_size):
        """Whole number of batches for ``num_files`` files, never less than one."""
        return max(1, int(num_files // batch_size))

    def fit_generator(self, validation_split, epochs=30):
        """Train the model on batches streamed by ``SequenceGenerator``.

        Args:
            validation_split: Fraction of the files held out for validation.
            epochs: Number of training epochs (default 30, the previously
                hard-coded value).

        Returns:
            Whatever ``self.model.fit_generator`` returns.
        """
        # Build on demand so calling this method first does not fail with a
        # missing ``model`` attribute.
        if getattr(self, 'model', None) is None:
            self.build_CNN_model()

        sg = SequenceGenerator(self.data_path, self.intent_index, self.max_sequence_length, validation_split)

        # Step counts come from the generator's own file lists (not a second
        # directory listing) and are integers >= 1.
        return self.model.fit_generator(
            generator = sg.generate_batch(self.batch_size, subset='training'),
            steps_per_epoch = self._steps(len(sg.file_list_train), self.batch_size),
            epochs = epochs,
            validation_data = sg.generate_batch(self.batch_size, subset='validation'),
            validation_steps = self._steps(len(sg.file_list_test), self.batch_size)
        )

# Module-level predictor instance; no model exists until build_CNN_model() is called.
pi = PredictIntent()

In [4]:
# Build the network, then train it with 20% of the files held out for validation.
pi.build_CNN_model()
pi.fit_generator(validation_split = 0.2)
# Results noted from earlier runs.
# NOTE(review): the two leading numbers are presumably batch_size and epochs
# (the first line lists only one number) - TODO confirm.
# 1 - loss: 2.5108 - acc: 0.2834
# 13100 30 - loss: 2.9677 - acc: 0.1152
# 1310  30 - loss: 2.6313 - acc: 0.2464
# 131   30 - loss: 1.8456 - acc: 0.4678
# 13    30 - loss: 2.1301 - acc: 0.4263

In [125]:
# Ad-hoc inspection of a single file; the returned frame is the cell output.
# NOTE(review): machine-specific absolute path - this cell will not run elsewhere.
# The r'' prefix already keeps backslashes literal, so each `\\` here is two
# backslash characters in the path (presumably tolerated by Windows - confirm).
pd.read_hdf(r'C:\\Users\\Nikolay\\Desktop\\coll\\8-Nabokova-comm_rosbalt_39_79701_output_10.h5')

Unnamed: 0,ID of comment,ID of post,Likes,Intent analysis,Content analysis,Distance to parent,Distance to post,Doc2Vec value
0,79701,79701,0,,,0.0,1.1102230246251563e-16,[-0.033691 -0.037016 0.029007 0.008522 -0.0166...
1,79702,79701,2,м,20.0,1.017779691860642,1.017779691860642,[-0.020174 0.017773 0.023126 0.017003 0.041832...
2,79710,79701,0,у,30.0,1.036869972583668,1.0201519054057373,[0.024097 0.024659 -0.037547 0.032659 0.078938...
3,79711,79701,0,и,40.0,0.8629165321940944,0.9791444162880744,[0.027835 -0.014688 -0.047072 0.005525 0.02313...


In [127]:
# Ad-hoc inspection of a single file; the returned frame is the cell output.
# NOTE(review): machine-specific absolute path - this cell will not run elsewhere.
# The r'' prefix already keeps backslashes literal, so each `\\` here is two
# backslash characters in the path (presumably tolerated by Windows - confirm).
pd.read_hdf(r'C:\\Users\\Nikolay\\Desktop\\coll\\8-comm_rosbalt_39_79701_output_10.h5')

Unnamed: 0,ID of comment,ID of post,Likes,Intent analysis,Content analysis,Distance to parent,Distance to post,Doc2Vec value
0,79701,79701,0,,,0.0,1.1102230246251563e-16,[-0.033691 -0.037016 0.029007 0.008522 -0.0166...
1,79744,79701,0,и,4.0,0.7546659771982847,0.7546659771982847,[-0.028849 -0.061239 -0.010560 0.016685 0.0027...
2,79751,79701,1,ш,4.0,0.0398518796587953,0.7871806515179766,[-0.030819 -0.058584 -0.000901 -0.006251 -0.00...
3,79752,79701,1,ж,4.0,0.0747844151326491,0.7804029784202833,[-0.026400 -0.050600 -0.030745 0.025348 0.0184...
4,79753,79701,1,ш,4.0,0.0270951991169325,0.7546659771982847,[-0.028849 -0.061239 -0.010560 0.016685 0.0027...
5,79776,79701,3,ч,4.0,0.8295794789631287,0.2508403434496574,[-0.031889 -0.025769 0.032461 -0.006925 0.0022...
6,79778,79701,0,п,4.0,0.2966077603882585,0.3147115942277629,[-0.018071 -0.009378 0.044637 0.006586 -0.0093...
7,79783,79701,0,и,4.0,0.4713635431564371,0.4983164208788222,[-0.022603 0.006730 0.019023 -0.011535 -0.0062...
8,79784,79701,0,щ,4.0,0.4844836144391587,0.2562199197554267,[-0.004581 -0.021733 0.019696 0.011457 0.00209...


In [5]:
# Coarse intent type -> the five single-letter intent codes that belong to it.
# Each letter string below is expanded into a list of one-character strings.
generalization_rule = {
    group: list(letters)
    for group, letters in (
        ("I", "абвгд"),  # Informative-reproductive
        ("E", "ежзик"),  # Emotive-consolidating
        ("M", "лмноп"),  # Manipulative type, dominance
        ("D", "рстуф"),  # Volitional-directive
        ("R", "хцчшщ"),  # Control-reactive
    )
}