In [59]:
from model.model_fn import ModelWrapper
import tensorflow as tf
import numpy as np
from nltk import ngrams
import pandas as pd
from collections import Counter
from model.utils import clean

# concepts

In [56]:
data = pd.read_csv('data/concept_search.csv')

In [61]:
# Overwrites the 'text' column in place, so re-running this cell applies
# `clean` to text that has already been cleaned.
data['text'] = data['text'].apply(clean)

In [86]:
def _get_ngrams(text, n):
    """Return every word-level n-gram of ``text`` as a list of tuples.

    ``text`` is split on whitespace and the resulting words are passed to
    ``nltk.ngrams``; the lazy result is materialised into a list.
    """
    return list(ngrams(text.split(), n))


def get_bigrams(text):
    """Return the consecutive word pairs (2-tuples) found in ``text``."""
    return _get_ngrams(text, 2)


def get_quadgrams(text):
    """Return the consecutive word 4-tuples found in ``text``."""
    return _get_ngrams(text, 4)

In [87]:
# Count how often each word 4-gram occurs across all texts.
counter = Counter()

for t in data['text']:
    counter.update(get_quadgrams(t))


In [94]:
# Join each n-gram tuple back into a space-separated string, most frequent first.
concepts = [' '.join(gram) for gram, _count in counter.most_common()]

In [99]:
# Keep only the candidate phrases that contain 'путин' ("putin").
rus_concepts = [phrase for phrase in concepts if 'путин' in phrase]

In [101]:
# rus_concepts

# Ideas

* bigrams as concepts

In [9]:
# Endpoint names handed to ModelWrapper, keyed by role
# (presumably TensorFlow graph node names -- confirm against ModelWrapper).
endpoints = {
    'input_': 'model/dim_reduction',
    'bottleneck': 'model/bottleneck/BiasAdd',
    'probs': 'model/Softmax',
    'output': 'model/output_logits/BiasAdd',
}

mw = ModelWrapper('experiments/test/', endpoints)

# Bind a session to TensorFlow's default graph.
graph = tf.get_default_graph()
sess = tf.Session(graph=graph)

# NOTE(review): this initializer runs after ModelWrapper has been constructed.
# If the wrapper keeps restored weights as tf.Variables in this graph, they
# would be re-initialized here -- confirm against ModelWrapper.
sess.run(tf.global_variables_initializer())

INFO:tensorflow:Restoring parameters from experiments/test/model.ckpt-955


In [6]:
text = 'обама прибыл в оон'  # "obama arrived at the UN"
text2 = 'обама пошел в магазин'  # "obama went to the store"
labels = np.array([0])  # some pseudo-label

In [7]:
preds = mw.calculate_output(sess, text)

In [8]:
preds.shape  # 48 classes

(1, 48)

In [9]:
grads = mw.calculate_grad(sess, text2, labels)

In [10]:
grads.shape  # 256 is the size of the hidden layer (bottleneck)

(1, 256)

# Concepts

In [11]:
import pandas as pd
from model.utils import clean
import numpy as np

In [12]:
# data = pd.read_csv('data/concept_search.csv', nrows=None)

# concepts = data[data['text'].str.contains('Путин')].sample(500)['text'].apply(clean).apply(lambda x: x.replace('\xa0', ' '))
# random_data = data.sample(500)['text'].apply(clean).apply(lambda x: x.replace('\xa0', ' '))
# np.save('data/concepts.npy', concepts.values)
# np.save('data/random_data.npy', random_data.values)

In [13]:
# These arrays were presumably saved from pandas string Series (object dtype).
# NumPy >= 1.16.3 refuses to load object arrays unless allow_pickle=True is
# passed explicitly. Only load such files from a trusted source: unpickling
# can execute arbitrary code.
concepts = np.load('data/concepts.npy', allow_pickle=True)
random_data = np.load('data/random_data.npy', allow_pickle=True)

In [14]:
%%time

X_conc = mw.calculate_bottleneck(sess, concepts)

X_rand = mw.calculate_bottleneck(sess, random_data)

CPU times: user 984 ms, sys: 199 ms, total: 1.18 s
Wall time: 413 ms


In [15]:
# Stack the concept bottleneck outputs on top of the random ones (row-wise)
# and build the matching labels: 1 for concept rows, 0 for random rows.
X = np.concatenate((X_conc, X_rand), axis=0)
y = np.array(len(X_conc) * [1] + len(X_rand) * [0])

In [16]:
np.save('data/X.npy', X)
np.save('data/y.npy', y)

# CAVs

In [1]:
import numpy as np
from cav import CAV

In [2]:
X = np.load('data/X.npy')
y = np.load('data/y.npy')

In [3]:
cav = CAV()

In [4]:
v = cav.fit(X, y)

In [5]:
np.save('data/cav.npy', v)

In [6]:
v.shape

(256,)

# TCAV

In [24]:
import pandas as pd
from model.model_fn import ModelWrapper
import tensorflow as tf
import numpy as np
import pickle

In [8]:
# Endpoint names handed to ModelWrapper, keyed by role
# (presumably TensorFlow graph node names -- confirm against ModelWrapper).
endpoints = {
    'input_': 'model/dim_reduction',
    'bottleneck': 'model/bottleneck/BiasAdd',
    'probs': 'model/Softmax',
    'output': 'model/output_logits/BiasAdd',
}

mw = ModelWrapper('experiments/test/', endpoints)

# Bind a session to TensorFlow's default graph.
graph = tf.get_default_graph()
sess = tf.Session(graph=graph)

# NOTE(review): this initializer runs after ModelWrapper has been constructed.
# If the wrapper keeps restored weights as tf.Variables in this graph, they
# would be re-initialized here -- confirm against ModelWrapper.
sess.run(tf.global_variables_initializer())

INFO:tensorflow:Restoring parameters from experiments/test/model.ckpt-955


In [21]:
train = pd.read_csv('data/48labs/train.csv')

labs = train['labels'].values
texts = train['text'].tolist()

In [5]:
%%time

grads = mw.calculate_grad(sess, texts, labs)

CPU times: user 4min 23s, sys: 44.7 s, total: 5min 8s
Wall time: 1min 38s


In [7]:
#  np.save('data/train_grads.npy', grads)

In [28]:
# Use a context manager so the file handle is closed as soon as loading ends.
# NOTE: pickle can execute arbitrary code on load -- only use trusted files.
with open('data/labs_mapping.pkl', 'rb') as mapping_file:
    labs_mapping = pickle.load(mapping_file)

# Reverse lookup: mapped value -> original key.
labs_mapping_inverse = {val: key for key, val in labs_mapping.items()}

In [9]:
grads = np.load('data/train_grads.npy')
v = np.load('data/cav.npy')

In [14]:
dot_prod = grads.dot(v)

In [33]:
# Per-label score: the fraction of that label's rows whose dot product with
# `v` is positive.
tcav_scores = {}

# Iterate over the distinct labels only. Looping over `labs` itself recomputes
# the same masked mean once per training example. pd.unique keeps first-seen
# order, so the resulting dict has the same keys in the same order as before.
for lab in pd.unique(labs):
    tcav_score = (dot_prod[labs == lab] > 0).mean()
    tcav_scores[lab] = tcav_score

In [35]:
# Re-key the scores through the inverse label mapping (label names, per the output below).
tcav_scores_names = {
    labs_mapping_inverse[lab_id]: score
    for lab_id, score in tcav_scores.items()
}

In [36]:
tcav_scores_names

{'Coцсети': 0.0,
 'Бизнес': 0.7894371091035441,
 'Бокс и ММА': 0.0911214953271028,
 'Все': 0.9888172476324804,
 'Гаджеты': 0.8403141361256544,
 'Город': 0.01272264631043257,
 'Госэкономика': 0.9835701598579041,
 'Дача': 0.7473684210526316,
 'Деловой климат': 0.9849462365591398,
 'Деньги': 1.0,
 'Закавказье': 0.9523809523809523,
 'Звери': 0.03759398496240601,
 'Зимние виды': 0.6791808873720137,
 'Игры': 0.046831955922865015,
 'Интернет': 0.539568345323741,
 'Искусство': 0.9235668789808917,
 'Квартира': 0.7507598784194529,
 'Кино': 0.3343789209535759,
 'Книги': 0.3459715639810427,
 'Конфликты': 0.0036429872495446266,
 'Космос': 0.9921135646687698,
 'Криминал': 0.5592841163310962,
 'Летние виды': 0.8774703557312253,
 'Люди': 0.0,
 'Мир': 0.9919137466307277,
 'Москва': 0.9869565217391304,
 'Музыка': 0.30814524043179586,
 'Наука': 1.0,
 'Общество': 0.15446817333609786,
 'Оружие': 0.7746478873239436,
 'Политика': 0.0,
 'Полиция и спецслужбы': 0.3181818181818182,
 'Пресса': 0.0,
 'Преступност