## Lab 1 Part 2 - 3

In [0]:
from gensim.models.fasttext import FastText
import xmltodict
import pandas as pd
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from tensorflow.python.keras.layers import Dense, Input, Embedding, Dropout, Conv1D, MaxPooling1D,Flatten,BatchNormalization,Activation
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Sequential

In [0]:
def get_sample_text(sample):
    """Return the '#text' value of the fourth column of a parsed sample.

    Asserts that the column at index 3 is named 'text' before reading it.
    """
    text_column = sample['column'][3]
    assert text_column['@name'] == 'text'
    return text_column['#text']


def get_sample_answers_bank(sample):
    """Collect the per-company labels of a bank-dataset sample.

    Reads columns 4..11 of ``sample['column']``. Each column's ``'@name'``
    becomes a key; its ``'#text'`` becomes ``None`` when it equals ``'NULL'``
    and is converted with ``int`` otherwise.

    Returns:
        dict mapping column name -> int label or None.
    """
    answers = {}
    for i in range(4, 12):
        column = sample['column'][i]
        raw_value = column['#text']
        # 'NULL' marks "no label for this company" in the export.
        answers[column['@name']] = None if raw_value == 'NULL' else int(raw_value)
    return answers

def get_sample_answers_tkk(sample):
    """Collect the per-company labels of a sample that has seven label columns.

    Reads columns 4..10 of ``sample['column']``. Each column's ``'@name'``
    becomes a key; its ``'#text'`` becomes ``None`` when it equals ``'NULL'``
    and is converted with ``int`` otherwise.

    Returns:
        dict mapping column name -> int label or None.
    """
    answers = {}
    for i in range(4, 11):
        column = sample['column'][i]
        raw_value = column['#text']
        # 'NULL' marks "no label for this company" in the export.
        answers[column['@name']] = None if raw_value == 'NULL' else int(raw_value)
    return answers

def get_sample_id(sample):
    """Return the integer id stored in the first column of a parsed sample.

    Asserts that the column at index 0 is named 'id' before reading it.
    """
    id_column = sample['column'][0]
    assert id_column['@name'] == 'id'
    return int(id_column['#text'])


def get_data(filename):
    """Parse an XML export into a flat DataFrame of labelled (text, company) rows.

    Every entry under ``pma_xml_export/database/table`` is expanded into one
    row per company whose answer is not ``None``.

    Returns:
        DataFrame with columns ``text``, ``answer``, ``company``, ``sample_id``.
    """
    df = pd.DataFrame()

    with open(filename, "r", encoding='utf-8') as xml_file:
        parsed = xmltodict.parse(xml_file.read(), process_namespaces=True)
        rows = []
        for sample in parsed['pma_xml_export']['database']['table']:
            sample_id = get_sample_id(sample)
            text = get_sample_text(sample)
            labelled = get_sample_answers_bank(sample)
            for company, answer in labelled.items():
                # Companies without a label contribute no row.
                if answer is None:
                    continue
                rows.append((sample_id, text, company, answer))
        # Row tuple layout: (sample_id, text, company, answer).
        df['text'] = [row[1] for row in rows]
        df['answer'] = [row[3] for row in rows]
        df['company'] = [row[2] for row in rows]
        df['sample_id'] = [row[0] for row in rows]
    return df

def clean_str(str_):
    """Replace every character outside the allowed set with a single space.

    The allowed set is the lowercase Cyrillic letters listed in the pattern,
    digits, parentheses and angle brackets.
    """
    not_allowed = r'[^йцукенгшщзхъфывапролджэячсмитьбю\(\)1234567890<>]'
    return re.sub(not_allowed, ' ', str_)

In [4]:
# Change the working directory so the relative data file names used below resolve.
# NOTE(review): hard-coded absolute path (presumably a mounted Google Drive folder
# in Colab) — adjust or parameterize when running elsewhere.
%cd /content/drive/My\ Drive/NLP/Task1/

/content/drive/My Drive/NLP/Task1


### Load and clean

In [0]:
train_filename = "bank_train_2016.xml"
test_filename = "banks_test_2016.xml"

train = get_data(train_filename)
# Only the test frame has rows with missing values dropped in this cell.
test = get_data(test_filename).dropna()


def url_replacement(x):
    """Strip every http... token together with the whitespace that follows it."""
    return re.sub(r'(?:http[^\s]+)($|\s)', '', x)


def user_replacement(x):
    """Strip every @mention together with the whitespace that follows it."""
    return re.sub(r'(?:@[^\s]+)($|\s)', '', x)


# Same pipeline for both frames: drop URLs, drop mentions, lowercase,
# then blank out every character outside the allowed alphabet.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .apply(url_replacement)
        .apply(user_replacement)
        .str.lower()
        .apply(clean_str)
    )

#test.vector.apply(lambda x:x.shape).max()

### Vectorizing

In [0]:
def convert_to_vectors(model,list_text):
    """Turn tokenized sentences into per-sentence matrices of word vectors.

    Args:
        model: object exposing ``wv.vocab`` (membership test), ``wv[word]``
            (vector lookup) and ``vector_size``.
        list_text: iterable of token lists.

    Returns:
        list of arrays, one per sentence, each of shape
        ``(len(sentence), model.vector_size)``. Tokens whose lowercased form
        is not in the vocabulary are represented by a zero vector.
    """
    list_vect = []
    for sentence in list_text:
        sentence_vect = []
        for word in sentence:
            # Look up the same (lowercased) key that the membership test used.
            key = word.lower()
            if key in model.wv.vocab:
                sentence_vect.append(model.wv[key])
            else:
                sentence_vect.append(np.zeros(model.vector_size))
        # reshape keeps an empty sentence 2-D, i.e. (0, vector_size), so that
        # downstream padding can still read the embedding width.
        list_vect.append(np.array(sentence_vect).reshape(-1, model.vector_size))
    return list_vect
def pad_seq(sent,max_len = 32):
    """Force a (tokens, dim) matrix to exactly ``max_len`` rows.

    Shorter inputs are padded at the end with zero rows; longer inputs are
    truncated to their first ``max_len`` rows so every result has the same
    shape and can be stacked into a single tensor.

    Args:
        sent: 2-D array of shape (n_tokens, dim).
        max_len: number of rows in the result.

    Returns:
        Array of shape (max_len, dim).
    """
    pad_len = max_len - sent.shape[0]
    if pad_len > 0:
        padding = np.zeros((pad_len, sent.shape[1]))
        return np.concatenate((sent, padding), axis=0)
    # Exactly max_len rows pass through unchanged; longer inputs are cut.
    return sent[:max_len]

In [0]:
# Load a pre-trained FastText model from the binary file `cc.ru.300.bin`
# (expected in the current working directory).
# NOTE(review): the file name suggests 300-dimensional Russian vectors — confirm.
fasttext_model = FastText.load_fasttext_format('cc.ru.300.bin')

### Tweet tokenizing and vectorizing with the FastText model

In [8]:
tknzr = TweetTokenizer()

# Per frame: tokenize the text, map tokens to FastText vectors, then pad each
# sentence matrix with pad_seq (default max_len).
for frame in (train, test):
    frame['sent_split'] = frame.text.apply(tknzr.tokenize)
    frame['vector'] = convert_to_vectors(fasttext_model, frame.sent_split.values)
    frame['vector'] = frame.vector.apply(pad_seq)

train.head()

Unnamed: 0,text,answer,company,sample_id,sent_split,vector
0,взять кредит тюмень альфа банк,0,alfabank,1,"[взять, кредит, тюмень, альфа, банк]","[[0.020303398370742798, -0.010869510471820831,..."
1,мнение о кредитной карте втб 24,0,vtb,2,"[мнение, о, кредитной, карте, втб, 24]","[[-0.008030472323298454, 0.04402994364500046, ..."
2,райффайзенбанк снижение ключевой ставки цб ...,0,raiffeisen,3,"[райффайзенбанк, снижение, ключевой, ставки, ц...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,современное состояние кредитного поведения в р...,0,sberbank,4,"[современное, состояние, кредитного, поведения...","[[0.00026070530293509364, 0.043691426515579224..."
4,главное чтоб банки сбер и втб,1,sberbank,5,"[главное, чтоб, банки, сбер, и, втб]","[[0.041497353464365005, 0.02926076017320156, 0..."


### Prepare to train and test

In [0]:
def panda2matrix(df):
    """Convert a frame with ``vector`` and ``answer`` columns into dense arrays.

    Args:
        df: DataFrame whose ``vector`` column holds equally shaped 2-D arrays
            and whose ``answer`` column holds labels in {-1, 0, 1}.

    Returns:
        (X, Y): X has shape (len(df), rows, cols) with the stacked vectors;
        Y has shape (len(df), 3) with the one-hot encoded labels
        (-1 -> [1,0,0], 0 -> [0,1,0], 1 -> [0,0,1]).
    """
    y_dict = {-1: [1, 0, 0], 0: [0, 1, 0], 1: [0, 0, 1]}
    # Use positional arrays so the result depends only on `df` (not on any
    # global frame) and works for any index, e.g. after rows were dropped.
    vectors = df.vector.values
    answers = df.answer.values
    size = len(vectors)
    num_vect, vec_size = vectors[0].shape
    X = np.zeros((size, num_vect, vec_size))
    Y = np.zeros((size, 3))
    for i in range(size):
        X[i, :, :] = vectors[i]
        Y[i, :] = np.array(y_dict[int(answers[i])])
    return X, Y

In [0]:
# Dense features and one-hot labels for the training frame.
X,Y = panda2matrix(train)
# NOTE(review): the "test" arrays are built from `train`, not from `test`, so any
# evaluation on X_test/Y_test scores data the model was fitted on — TODO confirm
# whether `test` was intended here.
X_test,Y_test = panda2matrix(train)

# Unshuffled split: the first 80% of the rows are used for fitting, the rest for validation.
part = 0.8
index =int( X.shape[0]*part)
X_train,y_train,X_val,y_val = X[:index,:,:],Y[:index],X[index:],Y[index:]

In [0]:
def f1(y_true, y_pred):
    """Keras metric: F1 computed per output column, then averaged over columns.

    Predictions are rounded to 0/1 first. True positives, false positives and
    false negatives are summed along axis 0, and ``K.epsilon()`` is added to
    every denominator.
    """
    predicted = K.round(y_pred)
    true_pos = K.sum(K.cast(y_true * predicted, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * predicted, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - predicted), 'float'), axis=0)

    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Any NaN entry is replaced by zero before averaging.
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

### CNN Model

In [12]:
input_shape1 = X.shape[1:]

# Three Conv1D + ReLU stages (max pooling after the first two), followed by a
# dense head with dropout and a 3-way softmax output.
model = Sequential([
    Conv1D(100, 4, input_shape=input_shape1),
    Activation('relu'),
    MaxPooling1D(2),
    Conv1D(100, 4),
    Activation('relu'),
    MaxPooling1D(2),
    Conv1D(100, 4),
    Activation('relu'),
    Flatten(),
    Dense(200),
    Activation('relu'),
    Dropout(rate=0.5),
    Dense(3, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc', f1],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 29, 100)           120100    
_________________________________________________________________
activation_1 (Activation)    (None, 29, 100)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 14, 100)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 11, 100)           40100     
_________________________________________________________________
activation_2 (Activation)    (None, 11, 100)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 5, 100)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2, 100)            40100     
__________

### Train

In [13]:
# Fit for 4 epochs in mini-batches of 100, validating on the held-out slice.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=100,
    epochs=4,
    validation_data=(X_val, y_val),
)

Train on 8580 samples, validate on 2145 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


### Test

In [14]:
# Returns [loss, acc, f1] in the order given to model.compile.
# NOTE(review): X_test/Y_test are produced by panda2matrix(train) in the
# data-preparation cell, so these numbers are not held-out test metrics.
model.evaluate(x=X_test,y=Y_test,batch_size = 100)



[0.2153184226321139, 0.9254079317990994, 0.7212931859048652]

In [0]:
#np.save('train_X.npy', X) 
#np.save('train_Y.npy', Y)
#np.save('test_X.npy', X_test) 
#np.save('test_Y.npy', Y_test) 

In [15]:
#!wget https://www.dropbox.com/s/9egqjszeicki4ho/db.sql

--2019-06-12 17:03:23--  https://www.dropbox.com/s/9egqjszeicki4ho/db.sql
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.1, 2620:100:6030:1::a27d:5001
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/9egqjszeicki4ho/db.sql [following]
--2019-06-12 17:03:23--  https://www.dropbox.com/s/raw/9egqjszeicki4ho/db.sql
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc2886c4f1bd4c016096f128f5a4.dl.dropboxusercontent.com/cd/0/inline/AitI5cY8WAgmLQ996rPMKQLs55ZzF3JKRI7tkiHqwGoG68pTITDxPsaXf5KfrDYM8T4DRLdvcHyR0U_-qjqft2ZF2YwVVU5SRTciKbElL9fMMA/file# [following]
--2019-06-12 17:03:23--  https://uc2886c4f1bd4c016096f128f5a4.dl.dropboxusercontent.com/cd/0/inline/AitI5cY8WAgmLQ996rPMKQLs55ZzF3JKRI7tkiHqwGoG68pTITDxPsaXf5KfrDYM8T4DRLdvcHyR0U_-qjqft2ZF2YwVVU5SRTciKbElL9fMMA/file
Resolving uc2886c4f1bd4c016096f128

### Part 3

In [0]:
def clean_str2(str_):
    """Normalize a raw tweet for embedding training and classification.

    Steps, in order:
      1. lowercase;
      2. remove ``@mentions``;
      3. collapse runs of ``)`` into a single ``)``;
      4. delete every character that is not whitespace, a listed lowercase
         Cyrillic letter, a digit, or one of ``- + ( ) < >``;
      5. turn non-breaking spaces and newlines into plain spaces;
      6. collapse runs of the same Cyrillic letter into one letter.

    Returns:
        The normalized string.
    """
    str_ = str_.lower()
    alphabet = r'[^\sйцукенгшщзхъфывапролдж\-\+эячсмитьбю\(\)1234567890<>]'
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    str_ = re.sub(r'@\w+', '', str_)
    str_ = re.sub(r'\)+', ')', str_)
    str_ = re.sub(alphabet, '', str_)
    str_ = re.sub(r'\xa0', ' ', str_)
    str_ = str_.replace('\n', ' ')
    # A letter followed by repeats of itself is reduced to a single occurrence.
    str_ = re.sub(r'([а-я])\1*', r'\1', str_)
    return str_

In [0]:
# Both CSV files are headerless and ';'-separated; columns are named c0..c11.
column_names = ["c{0}".format(i) for i in range(12)]

data_pos = pd.read_csv('positive.csv', sep=';', header=None)
data_neg = pd.read_csv('negative.csv', sep=';', header=None)

data_pos.columns = column_names
data_neg.columns = column_names

In [0]:
# Column c3 is the text column used here; clean both corpora.
texts1 = [clean_str2(raw) for raw in data_pos.c3.values]
texts2 = [clean_str2(raw) for raw in data_neg.c3.values]

# texts1 becomes the combined corpus: positive texts followed by negative ones.
texts1.extend(texts2)

# Whitespace-tokenized version of the combined corpus.
text1 = [cleaned.split() for cleaned in texts1]

# The raw frames are no longer needed.
del data_pos
del data_neg

In [0]:
# Train a 300-dimensional FastText model on the tweet corpus in 10 consecutive
# chunks: the vocabulary is built on the first chunk and extended with
# `update=True` on each later one; every chunk is trained for 4 epochs.
model2 = FastText(size=300)  # instantiate
n_batches = 10
batch_size = int(len(texts1)//n_batches)
vocab_built = False
for i in range(n_batches):
    start = i*batch_size
    if i == n_batches - 1:
        # The LAST chunk absorbs the remainder left by the integer division.
        # (Previously this branch fired at i == 4, so the tail of the corpus
        # was trained once as part of chunk 4 and again in chunks 5-9.)
        texts = text1[start:]
    else:
        texts = text1[start:start + batch_size]
    if not texts:
        # A corpus with fewer than n_batches sentences yields empty chunks.
        continue
    if not vocab_built:
        model2.build_vocab(sentences = texts)
        vocab_built = True
    else:
        model2.build_vocab(sentences = texts, update = True)
    model2.train(sentences=texts, total_examples=len(texts), epochs=4)
model2.save('w2v_model')

### Classification with general word2vec embeddings

In [0]:
train_filename = 'bank_train_2016.xml'
test_filename ='banks_test_2016.xml'

# Reload both splits and drop rows with missing values.
train = get_data(train_filename).dropna()
test = get_data(test_filename).dropna()


def url_replacement(x):
    """Strip every http... token together with the whitespace that follows it."""
    return re.sub(r'(?:http[^\s]+)($|\s)', '', x)


def user_replacement(x):
    """Strip every @mention together with the whitespace that follows it."""
    return re.sub(r'(?:@[^\s]+)($|\s)', '', x)


# Same pipeline for both frames: drop URLs, drop mentions, then normalize
# with clean_str2.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .apply(url_replacement)
        .apply(user_replacement)
        .apply(clean_str2)
    )

In [0]:
tknzr = TweetTokenizer()

# Per frame: tokenize the text, map tokens to vectors of the locally trained
# model2, then pad each sentence matrix with pad_seq (default max_len).
for frame in (train, test):
    frame['sent_split'] = frame.text.apply(tknzr.tokenize)
    frame['vector'] = convert_to_vectors(model2, frame.sent_split.values)
    frame['vector'] = frame.vector.apply(pad_seq)

train.head()

NameError: ignored

In [0]:
# Dense features and one-hot labels for the training frame.
X,Y = panda2matrix(train)
# NOTE(review): the "test" arrays are built from `train`, not from `test`, so any
# evaluation on X_test/Y_test scores data the model was fitted on — TODO confirm
# whether `test` was intended here.
X_test,Y_test = panda2matrix(train)

# Unshuffled split: the first 80% of the rows are used for fitting, the rest for validation.
part = 0.8
index =int( X.shape[0]*part)
X_train,y_train,X_val,y_val = X[:index,:,:],Y[:index],X[index:],Y[index:]

In [0]:
input_shape1 = X.shape[1:]

# Three Conv1D + ReLU stages (kernel size 3 for the first, 4 for the others;
# max pooling after the first two), followed by a dense head with dropout and
# a 3-way softmax output.
model = Sequential([
    Conv1D(100, 3, input_shape=input_shape1),
    Activation('relu'),
    MaxPooling1D(2),
    Conv1D(100, 4),
    Activation('relu'),
    MaxPooling1D(2),
    Conv1D(100, 4),
    Activation('relu'),
    Flatten(),
    Dense(200),
    Activation('relu'),
    Dropout(rate=0.5),
    Dense(3, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc', f1],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 30, 100)           90100     
_________________________________________________________________
activation_12 (Activation)   (None, 30, 100)           0         
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 15, 100)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 12, 100)           40100     
_________________________________________________________________
activation_13 (Activation)   (None, 12, 100)           0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 6, 100)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 3, 100)            40100     
__________

In [0]:
# Fit for 4 epochs in mini-batches of 100, validating on the held-out slice.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=100,
    epochs=4,
    validation_data=(X_val, y_val),
)

Train on 8580 samples, validate on 2145 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [0]:
# Returns [loss, acc, f1] in the order given to model.compile.
# NOTE(review): X_test/Y_test are produced by panda2matrix(train) in the
# data-preparation cell, so these numbers are not held-out test metrics.
model.evaluate(x=X_test,y=Y_test,batch_size = 100)



[0.2420518649286115, 0.92960376, 0.7936848]