In [0]:
from google.colab import drive
import xmltodict
import pandas as pd
import re
import numpy as np
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer


In [2]:
drive.mount('/content/drive/')
#!pwd
!ls /content/drive/My\ Drive/NLP/Task1/
#!ls drive/
%cd /content/drive/My\ Drive/NLP/Task1/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
banks_test_2016.xml    cc.ru.300.bin	  tkk_test_etalon.xml
banks_test_etalon.xml  db.sql		  tkk_train_2016.xml
bank_train_2016.xml    tkk_test_2016.xml
/content/drive/My Drive/NLP/Task1


## Prepare data

In [0]:
def get_sample_text(sample):
    assert sample['column'][3]['@name'] == 'text'
    return sample['column'][3]['#text']


def get_sample_answers_bank(sample):
    answers = {}
    companies ={}
    for i in range(4, 12):
        companies[sample['column'][i]['@name']] = i
        answers[sample['column'][i]['@name']] = None if sample['column'][i]['#text'] == 'NULL'\
            else int(sample['column'][i]['#text'])
    return answers

def get_sample_answers_tkk(sample):
    answers = {}
    companies ={}
    for i in range(4, 11):
        companies[sample['column'][i]['@name']] = i
        answers[sample['column'][i]['@name']] = None if sample['column'][i]['#text'] == 'NULL'\
            else int(sample['column'][i]['#text'])
    return answers

def get_sample_id(sample):
    assert sample['column'][0]['@name'] == 'id'
    return int(sample['column'][0]['#text'])


def get_data(filename):
    df = pd.DataFrame()
    with open(filename, "r", encoding='utf-8') as f:
        d = xmltodict.parse(f.read(), process_namespaces=True)
        clean_samples = []
        for sample in d['pma_xml_export']['database']['table']:
            sample_id = get_sample_id(sample)
            text = get_sample_text(sample)
            answers = get_sample_answers_bank(sample)
            for company, answer in answers.items():
                if answer is not None:
                    clean_samples.append((sample_id, text, company, answer))
        df['text'] = [sample[1] for sample in clean_samples]
        df['answer'] = [sample[3] for sample in clean_samples]
        df['company'] = [sample[2] for sample in clean_samples]
        df['sample_id'] = [sample[0] for sample in clean_samples]
    return df

In [0]:
def char2vec(char, c2v):
  if char in c2v:
    return c2v[char]
  else:
    return c2v['unknown']

def str2vec(string,c2v,char_len =70):
  data = np.zeros((char_len,len(c2v)))
  str_len=len(string)
  if str_len<char_len:
    string =string.join([' ' for i in range(char_len-str_len+1)])
  data[:,:]=np.array([char2vec(ch,c2v) for ch in string[:char_len]])
  return data

def clean_str(str_):
  alphabet = r'[^йцукенгшщзхъфывапролджэячсмитьбю\(\)1234567890<>]'
  str_=re.sub(alphabet,' ',str_)
  #str_=re.sub(r'[…\+\-\'’»«._\n%/=|~\xa0©*#?=&$·<>?!\"@^:,.;\[\]]',' ',str_)
  return str_

In [0]:
#root_dir = '/content/drive/My\ Drive/NLP/Task1/'
train_filename = "bank_train_2016.xml"
test_filename = "banks_test_2016.xml"

train = get_data(train_filename)
test = get_data(test_filename)

url_replacement = lambda x: re.sub(r'(?:http[^\s]+)($|\s)', '', x)
user_replacement = lambda x: re.sub(r'(?:@[^\s]+)($|\s)', '', x)
#punctuation= lambda x:re.sub(r'',r'',x)

train['text'] = train['text'].apply(url_replacement)
train['text'] = train['text'].apply(user_replacement)
test = test.dropna()

test['text'] = test['text'].apply(url_replacement)
test['text'] = test['text'].apply(user_replacement)

train['text']=train.text.str.lower()
test['text'] =test.text.str.lower()
train['text']=train.text.apply(clean_str)
test['text']=test.text.apply(clean_str)

char_len = 70
voc =set([char for char in train.text.str.cat(sep = '')])
voc_size = len(voc)
matrix =np.eye(voc_size+1)
char_d = {key:matrix[i,:] for i,key in enumerate(list(voc))}
char_d['unknown'] = matrix[-1,:]

to_vec = lambda x:str2vec(x,char_d)
train['vector'] = train.text.apply(to_vec)
test = test.dropna()
test['vector'] =test.text.apply(to_vec)

### Prepare dataset

In [0]:
size = train.vector.values.shape[0]
y_dict= {-1:[1,0,0],0:[0,1,0],1:[0,0,1]}
X = np.zeros((size,char_len,voc_size+1))
Y = np.zeros((size,3))
for i in range(size):
  X[i,:,:] = train.vector.values[i]
  Y[i,:] = np.array(y_dict[train.answer[i]])
size=test.vector.values.shape[0]
X_test = np.zeros((size,char_len,voc_size+1))
Y_test = np.zeros((size,3))
for i in range(size):
  X_test[i,:,:] = test.vector.values[i]
  Y_test[i,:] = np.array(y_dict[test.answer[i]])

### Train/Val samples

In [0]:
part = 0.8
index =int( X.shape[0]*part)
X_train,y_train,X_val,y_val = X[:index,:,:],Y[:index],X[index:],Y[index:]
#X_val,y_val = X_val[train.answer[index:]!=0],y_val[train.answer[index:]!=0]

In [0]:
input_shape1=X.shape[1:]

In [0]:
from tensorflow.python.keras.layers import Dense, Input, Embedding, Dropout, Conv1D, MaxPooling1D,Flatten
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Sequential
from sklearn.metrics import f1_score

In [0]:
def f1(y_true, y_pred): ###calculating the f1 scores
  y_pred = K.round(y_pred)
  tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    # tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
  fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
  fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

  p = tp / (tp + fp + K.epsilon())
  r = tp / (tp + fn + K.epsilon())

  f1 = 2*p*r / (p+r+K.epsilon())
  f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
  return K.mean(f1)
#def f1(y_true, y_pred):
#  return f1_score(y_true,y_pred)

In [0]:
model = Sequential()
model.add(Conv1D(500,(7),activation ='relu',input_shape = input_shape1))
model.add(MaxPooling1D(2))
model.add(Conv1D(500,(7),activation ='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(500,(5),activation ='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(500,(3),activation ='relu',input_shape = input_shape1))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(1024,activation ='relu'))
model.add(Dropout(rate = 0.5))
model.add(Dense(3,activation ='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc',f1])

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 64, 500)           161500    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 32, 500)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 26, 500)           1750500   
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 13, 500)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 9, 500)            1250500   
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 4, 500)            0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 2, 500)            750500    
__________

In [16]:
history = model.fit(X_test,Y_test,
         batch_size = 128,
         validation_data = (X_val,y_val),
         epochs =4)   #### epochs averaging 5-6 minutes, number of epochs reduced

Train on 19811 samples, validate on 2145 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [17]:
#np.argmax(model.predict(np.expand_dims(test.vector[:],axis =0)))
model.evaluate(x=X_test,y=Y_test,batch_size = 100)



[1.1920928955078125e-07, 1.0, 0.3333333432674408]