## Рекомендательная система с помощью логистической регрессии и топикал чата

In [2]:
import json
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
import pickle

In [3]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [399]:
import nltk

In [138]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


### Предобработка

In [6]:
def prep(s):
    words = s.split(" ")
    if len(words) == 1:
        result = lemmatizer.lemmatize(words[0].lower())
    elif len(words) > 1:
        result = " ".join([lemmatizer.lemmatize(w.lower()) for w in words])
    else:
        result = ""
    return result

### Чтение данных и подготовка классов

In [4]:
data = json.load(open("Downloads/train_ner_annotated_topical_chat_4152.json", "r"))

In [47]:
len(data) * 20

83040

In [45]:
count = 0

for d in data:
    for utt in d:
        count += 1

In [46]:
count

157776

In [32]:
tp_topics = {}
tp_labels_ = {}

for sample in list(data.values()):
    for topics in sample:
        for topic in topics["cobot_ner"]["response"]:
            if topic["label"] not in ["misc", "anaphor", "number", "duration", "year", "date"]:
                clear_topic = prep(topic["text"])
                if clear_topic in tp_topics.keys():
                    tp_topics[clear_topic] += 1
                    tp_labels_[clear_topic].append(topic["label"])
                else:
                    tp_topics[clear_topic] = 1
                    tp_labels_[clear_topic] = [topic["label"]]
                    
tp_labels = {k: list(set(v)) for k, v in tp_labels_.items()}

Классы по cobot labels

In [40]:
wip_labels = [el[1] for el in list(tp_labels.items())]
popular_topics = list(set([e for el in wip_labels for e in el]))

In [43]:
popular_topics

['songname',
 'sportrole',
 'person',
 'bookname',
 'ordinal',
 'venue',
 'organization',
 'wear',
 'channelname',
 'party',
 'gamename',
 'albumname',
 'sportteam',
 'videoname',
 'device',
 'genre',
 'sport',
 'position',
 'softwareapplication',
 'location',
 'event',
 'vehicle']

Или по cobot ner

In [8]:
#здесь запрещаем определенные entities

forbidden = ['of', 'the', 'The', 'us', 'it']

In [9]:
popular_topics = [k for k, v in sorted(tp_topics.items(), key=lambda item: item[1]) if k not in forbidden]

In [10]:
len(popular_topics)

10061

In [33]:
popular_topics = [k for k, v in sorted(tp_topics.items(), key=lambda item: item[1]) if k not in forbidden][-200:]

In [381]:
popular_topics

['usa',
 'space shuttle',
 'the shining',
 'the office',
 'camera',
 'c lewis',
 'uranus',
 'rugrats',
 'fiction',
 'demetri martin',
 'governor',
 'detroit',
 'library of alexandria',
 'pakistan',
 'california',
 'brazil',
 'fox news',
 'george lucas',
 'kanye',
 'mlb',
 'hybrid theory',
 'alaska',
 'pop',
 'ronald reagan',
 'wonder woman',
 'stan lee',
 'library',
 'frozen',
 'carol burnett',
 'space jam',
 'air jordan',
 'south park',
 'golden state warrior',
 'jon hamm',
 'south africa',
 'military',
 'norway',
 'tolkien',
 'bible',
 'spiderman',
 'yankee',
 'istanbul',
 'queen',
 'house',
 'chicago',
 'france',
 'africa',
 'solar system',
 'sesame street',
 'taylor swift',
 'alabama',
 'pokemon',
 'michigan',
 'australia',
 'croatia',
 'walmart',
 'cell phone',
 'unicef',
 'michael jordan',
 'cadillac',
 'united state',
 'rap music',
 'horror',
 'imdb',
 'busta rhyme',
 'kickball',
 'allen iverson',
 'mcdonalds',
 'rock',
 'aladdin',
 '3rd rock from the sun',
 'burger king',
 'tim

### Составление последовательности тем диалога в зависимости от выбранного набора топиков

Темы составленные по Entities

In [34]:
dialogues = []

for sample in list(data.values()):
    dialogue = []
    for topics in sample:
        rep = []
        for topic in topics["cobot_ner"]["response"]:
            rep.append(prep(topic["text"]))
        dialogue.append(rep)
    dialogues.append(dialogue)

Темы составленные по Labels

In [22]:
dialogues = []

for sample in list(data.values()):
    dialogue = []
    for topics in sample:
        rep = []
        for topic in topics["cobot_ner"]["response"]:
            rep.append(prep(topic["label"]))
        dialogue.append(rep)
    dialogues.append(dialogue)

In [430]:
dialogues[1]

[['misc'],
 ['person', 'misc'],
 ['anaphor', 'misc', 'year'],
 ['person', 'misc'],
 ['anaphor', 'videoname'],
 ['misc', 'number', 'wear', 'duration'],
 ['misc', 'misc'],
 ['misc', 'misc'],
 [],
 ['anaphor', 'anaphor'],
 ['person', 'misc'],
 ['anaphor', 'anaphor', 'misc'],
 ['person'],
 ['anaphor'],
 ['videoname'],
 ['anaphor'],
 ['anaphor', 'misc', 'misc'],
 ['misc', 'misc'],
 ['anaphor', 'person'],
 [],
 []]

### Проверка на дублирование диалогов 

################################

In [29]:
dups = []

for i, elem1 in enumerate(list(data.values())):
    for j, elem2 in enumerate(list(data.values())[i+1:]):
        if (elem1[0]['text'] == elem2[0]['text']) and (elem1[-1]['text'] == elem2[-1]['text']) and (elem1[1]['text'] == elem2[1]['text']):
            dups.append((elem1, elem2))

In [16]:
len(list(data.values()))

4152

##############################

# Первый способ 

### Векторы признаков 

In [35]:
pos_features = []
neg_features = []
triples = []
true_pairs = []
false_pairs = []

for d in dialogues:
    for i, _ in enumerate(d[:-1]):
        d_topics = list(set([e for el in d[:i+1] for e in el]))  
        res_ = [1 if r in d_topics else 0 for r in popular_topics]
        res = list(np.array(res_)/sum(res_)) if sum(res_) > 0 else [0] * len(res_)
        curr = d[i]
        foll = d[i+1]
        for c in curr:
            for f in foll:
                if c in popular_topics and f in popular_topics and c != f:
                    curr_v = [1 if c == w else 0 for w in popular_topics]
                    foll_v = [1 if f == w else 0 for w in popular_topics]
                    true_pairs.append((c, f))
                    final_v = curr_v + res + foll_v
                    pos_features.append((final_v, 1))
                    neg_idx = random.randint(0, len(popular_topics)-1)
                    neg_pair = popular_topics[neg_idx]                    
                    while neg_pair == f:
                        neg_idx = random.randint(0, len(popular_topics)-1)
                        neg_pair = popular_topics[neg_idx]
                    false_pairs.append((c, neg_pair))
                    triples.append([c, f, neg_pair])
                    neg_v = [1 if neg_pair == w else 0 for w in popular_topics]
                    final_neg_v = curr_v + res + neg_v
                    neg_features.append((final_neg_v, 0))

In [36]:
features = pos_features + neg_features

In [37]:
random.shuffle(features)

In [38]:
len(features)

44214

### Разбиение на трейн и тест

In [27]:
X = [el[0] for el in features]
y = [el[1] for el in features]
idx = 95000
train_X = X[:idx]
train_y = y[:idx]
test_X = X[idx:]
test_y = y[idx:]
show = X[idx:idx]
show_y = y[idx:idx]

In [31]:
len(test_X)

10286

In [419]:
len(features)

105286

### Обучение и результаты

In [29]:
clf = LogisticRegression(random_state=42, class_weight='balanced', multi_class='ovr', warm_start=True).fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [428]:
MODEL_NAME = "logreg_recommendation_model.pkl"
with open(MODEL_NAME, 'wb') as file:
    pickle.dump(clf, file)

In [30]:
clf.score(test_X, test_y)

0.7634649037526735

# Второй способ: через фильмы

### Сбор векторов для поиска похожести 1) по контексту и следующей теме (пользователь) 2) по контексту и предыдущей теме (фильм)

In [383]:
topic_tmp_representation = {topic: [] for topic in popular_topics}
topic_tmp_representation2 = {topic: [] for topic in popular_topics}
true_pairs = []
false_pairs = []

for d in dialogues:
    for i, _ in enumerate(d[:-1]):
        d_topics = list(set([e for el in d[:i+1] for e in el]))  
        res_ = [1 if r in d_topics else 0 for r in popular_topics]
        res = list(np.array(res_)/sum(res_)) if sum(res_) > 0 else [0] * len(res_)
        curr = d[i]
        foll = d[i+1]
        for c in curr:
            for f in foll:
                if c in popular_topics and f in popular_topics and c != f:
                    foll_v = [1 if f == w else 0 for w in popular_topics]
                    curr_v = [1 if c == w else 0 for w in popular_topics]
                    topic_tmp_representation[c].append((res, foll_v))
                    topic_tmp_representation2[f].append((res, curr_v))
                    true_pairs.append((c, f))
                    neg_idx = random.randint(0, len(popular_topics)-1)
                    neg_pair = popular_topics[neg_idx]                    
                    while neg_pair == f:
                        neg_idx = random.randint(0, len(popular_topics)-1)
                        neg_pair = popular_topics[neg_idx]
                    false_pairs.append((c, neg_pair))

### Построение промежуточных матриц для поиска ближайших пользователей и фильмов

In [384]:
topic_representation = {}

for key, value in topic_tmp_representation.items():
    topic_len = len(value)
    avg_foll = list(sum(np.array([el[1] for el in value]))/topic_len)
    avg_res = list(sum(np.array([el[0] for el in value]))/topic_len)
    repres = avg_foll + avg_res
    topic_representation[key] = repres
    
repres_matrix = np.array([value for key, value in topic_representation.items()], dtype=object)

In [385]:
topic_representation2 = {}

for key, value in topic_tmp_representation2.items():
    topic_len = len(value)
    avg_foll = list(sum(np.array([el[1] for el in value]))/topic_len)
    avg_res = list(sum(np.array([el[0] for el in value]))/topic_len)
    repres = avg_foll + avg_res
    topic_representation2[key] = repres
    
repres_matrix2 = np.array([value for key, value in topic_representation2.items()], dtype=object)

### Построение матриц  с репрезентациями по ближайшим фильмам и пользователям  (параметры: сколько пользователей и фильмов брать, что делать с двумя векторами)

In [389]:
top_topics = {}    
top_k = 50
    
for i, key in enumerate(list(topic_tmp_representation.keys())):
    current_matrix = np.concatenate((repres_matrix[:i], repres_matrix[i+1:]))
    #print(current_matrix.shape)
    #print(repres_matrix[i].shape)
    closest_topics_idx = list(np.argsort(current_matrix.dot(repres_matrix[i]), -1)[:top_k]) + [i]
    closest_topics = [popular_topics[idx] for idx in closest_topics_idx]
    top_topics[key] = [1 if j in closest_topics_idx else 0 for j in range(len(popular_topics))]

In [390]:
top_topics2 = {}    
top_k = 5
    
for i, key in enumerate(list(topic_tmp_representation2.keys())):
    current_matrix = np.concatenate((repres_matrix2[:i], repres_matrix2[i+1:]))
    #print(current_matrix.shape)
    #print(repres_matrix[i].shape)
    closest_topics_idx = list(np.argsort(current_matrix.dot(repres_matrix2[i]), -1)[:top_k]) + [i]
    closest_topics = [popular_topics[idx] for idx in closest_topics_idx]
    top_topics2[key] = [1 if j in closest_topics_idx else 0 for j in range(len(popular_topics))]

### Построение выборки  

In [391]:
feature_pairs = []

for feat in true_pairs:
    feature_pairs.append((top_topics[feat[0]] + top_topics2[feat[1]], 1))
    
for feat in false_pairs:
    feature_pairs.append((top_topics[feat[0]] + top_topics2[feat[1]], 0))

In [392]:
random.shuffle(feature_pairs)

In [393]:
len(feature_pairs[0][0])

400

In [394]:
X = [el[0] for el in feature_pairs]
y = [el[1] for el in feature_pairs]
idx = (len(X) * 9) // 10
train_X = X[:idx]
train_y = y[:idx]
test_X = X[idx:]
test_y = y[idx:]
show = X[idx:idx]
show_y = y[idx:idx]

In [367]:
clf = LogisticRegression(random_state=42, class_weight='balanced', multi_class='ovr', warm_start=True).fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [373]:
clf = LogisticRegression().fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [374]:
clf.score(test_X, test_y)

0.7611359103428625

In [377]:
import xgboost as xgb

In [396]:
clf = xgb.XGBRegressor(n_estimators=100, silent=False, n_jobs=-1)
clf.fit(train_X, train_y)

TypeError: Input data can not be a list.

# Пример работы

In [None]:
show_pred = clf.predict(show)
sh_idx = 200

for s, sy, sp, p in zip(show, show_y, show_pred, popular_topics):
    curr_word = popular_topics[s[:sh_idx].index(1)]
    foll_word = popular_topics[s[-sh_idx:].index(1)]
    previous_idx = [j for j, _ in enumerate(s[sh_idx:-sh_idx]) if _ > 0]
    previous_topic = [popular_topics[j] for j in previous_idx]
    print("Current topic:", curr_word)
    print("Сandidate topic:", foll_word)
    print("Gold label", sy)
    print("Predicted label", sp)
    print("Previous topics", previous_topic)
    print("_________")

###########################################################

###########################################################

###########################################################

In [None]:
clf = xgb.XGBRegressor(n_estimators = 100, silent = False, n_jobs  = 10)
clf.fit(x_train, y_train)

In [None]:
dups = []

for i, elem1 in enumerate(list(data.values())):
    for j, elem2 in enumerate(list(data.values())[i+1:]):
        if (elem1[0]['text'] == elem2[0]['text']) and (elem1[-1]['text'] == elem2[-1]['text']) and (elem1[1]['text'] == elem2[1]['text']):
            dups.append((elem1, elem2))

In [43]:
tuple([1, 2])

(1, 2)

In [48]:
import collections

c = collections.Counter()

pos_features_с = [tuple(el[0]) for el in pos_features]

for word in pos_features_с:
    c[word] += 1

In [251]:
triples

[['google', 'youtube', 'alabama'],
 ['michael jackson', 'shoe', 'seinfeld'],
 ['shoe', 'michael jackson', 'rugrats'],
 ['stan lee', 'dc', 'lebron'],
 ['phone', 'dc', 'detroit'],
 ['dc', 'japan', 'rap music'],
 ['japan', 'america', 'walmart'],
 ['u', 'japan', 'fox'],
 ['u', 'america', 'cell phone'],
 ['u', 'japan', 'disney'],
 ['u', 'america', 'pakistan'],
 ['canada', 'japan', 'burger king'],
 ['canada', 'america', 'house'],
 ['japan', 'marvel', 'starship trooper'],
 ['japan', 'earth', 'wonder woman'],
 ['america', 'marvel', 'fiction'],
 ['america', 'earth', 'ronald reagan'],
 ['earth', 'marvel', 'lebron james'],
 ['venus', 'uranus', 'unicef'],
 ['venus', 'sun', 'shakespeare'],
 ['uranus', 'jupiter', 'bambi'],
 ['uranus', 'moon', 'jazz'],
 ['uranus', 'sun', 'the simpson'],
 ['sun', 'jupiter', 'canada'],
 ['sun', 'moon', 'disney'],
 ['uk', 'japan', 'iphone'],
 ['library', 'tv', 'the office'],
 ['earth', 'germany', 'myspace'],
 ['japan', 'earth', 'chicago'],
 ['sun', 'earth', 'judge judy'

In [None]:
vectorizer()

In [228]:
pos_features = []
neg_features = []
triples = []

for d in dialogues:
    for i, _ in enumerate(d[:-1]):
        d_topics = list(set([e for el in d[:i+1] for e in el]))  
        res_ = [1 if r in d_topics else 0 for r in labels]
        res = list(np.array(res_)/sum(res_)) if sum(res_) > 0 else [0] * len(res_)
        curr = d[i]
        foll = d[i+1]
        for c in curr:
            for f in foll:
                if c in labels and f in labels and c != f:
                    curr_v = [1 if c == w else 0 for w in labels]
                    foll_v = [1 if f == w else 0 for w in labels]
                    final_v = curr_v + res + foll_v
                    pos_features.append((final_v, 1))
                    neg_idx = random.randint(0, len(labels)-1)
                    neg_pair = labels[neg_idx]
                    while neg_pair == f:
                        neg_idx = random.randint(0, len(labels)-1)
                        neg_pair = labels[neg_idx]
                    triples.append([c, f, neg_pair])
                    neg_v = [1 if neg_pair == w else 0 for w in labels]
                    final_neg_v = curr_v + res + neg_v
                    neg_features.append((final_neg_v, 0))

In [133]:
len(pos_features[10][0])

600

In [252]:
features = pos_features + neg_features

In [253]:
len(features)

44214

In [254]:
random.shuffle(features)

In [183]:
X = [el[0] for el in feature_pairs]
y = [el[1] for el in feature_pairs]
idx = 40000
train_X = X[:idx]
train_y = y[:idx]
test_X = X[idx:]
test_y = y[idx:]
show = X[idx:idx]
show_y = y[idx:idx]

In [401]:
clf = LogisticRegression(random_state=42, class_weight='balanced', multi_class='ovr', warm_start=True).fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [402]:
clf.score(test_X, test_y)

0.6653098145635459

In [245]:
show_pred = clf.predict(show)
sh_idx = 22

for s, sy, sp, p in zip(show, show_y, show_pred, labels):
    curr_word = labels[s[:sh_idx].index(1)]
    foll_word = labels[s[-sh_idx:].index(1)]
    previous_idx = [j for j, _ in enumerate(s[sh_idx:-sh_idx]) if _ > 0]
    previous_topic = [labels[j] for j in previous_idx]
    print("Current topic:", curr_word)
    print("Сandidate topic:", foll_word)
    print("Gold label", sy)
    print("Predicted label", sp)
    print("Previous topics", previous_topic)
    print("_________")

Current topic: person
Сandidate topic: videoname
Gold label 0
Predicted label 1
Previous topics ['softwareapplication', 'device', 'genre', 'videoname', 'person', 'location', 'organization', 'vehicle']
_________
Current topic: videoname
Сandidate topic: softwareapplication
Gold label 0
Predicted label 0
Previous topics ['device', 'videoname', 'person', 'organization']
_________
Current topic: device
Сandidate topic: person
Gold label 1
Predicted label 1
Previous topics ['softwareapplication', 'device', 'videoname', 'location', 'organization']
_________
Current topic: person
Сandidate topic: vehicle
Gold label 0
Predicted label 0
Previous topics ['genre', 'videoname', 'person']
_________
Current topic: person
Сandidate topic: videoname
Gold label 1
Predicted label 1
Previous topics ['genre', 'person']
_________
Current topic: event
Сandidate topic: videoname
Gold label 1
Predicted label 1
Previous topics ['event', 'person', 'songname']
_________
Current topic: videoname
Сandidate topic: 

In [258]:
show_pred = clf.predict(show)
sh_idx = 200

for s, sy, sp, p in zip(show, show_y, show_pred, popular_topics):
    curr_word = popular_topics[s[:sh_idx].index(1)]
    foll_word = popular_topics[s[-sh_idx:].index(1)]
    previous_idx = [j for j, _ in enumerate(s[sh_idx:-sh_idx]) if _ > 0]
    previous_topic = [popular_topics[j] for j in previous_idx]
    print("Current topic:", curr_word)
    print("Сandidate topic:", foll_word)
    print("Gold label", sy)
    print("Predicted label", sp)
    print("Previous topics", previous_topic)
    print("_________")

Current topic: basketball
Сandidate topic: kanye
Gold label 0
Predicted label 0
Previous topics ['basketball']
_________
Current topic: linkin park
Сandidate topic: country music
Gold label 1
Predicted label 0
Previous topics ['walmart', 'rock', 'country', 'country music', 'linkin park', 'amazon']
_________
Current topic: library
Сandidate topic: library of alexandria
Gold label 1
Predicted label 0
Previous topics ['library', 'mcdonalds', 'u']
_________
Current topic: netflix
Сandidate topic: dc
Gold label 0
Predicted label 0
Previous topics ['blockbuster', 'netflix']
_________
Current topic: world series
Сandidate topic: new zealand
Gold label 0
Predicted label 0
Previous topics ['yankee', 'kickball', 'softball', 'world cup', 'america', 'world series', 'canada', 'japan', 'baseball']
_________
Current topic: youtube
Сandidate topic: republican
Gold label 0
Predicted label 0
Previous topics ['tv', 'youtube']
_________
Current topic: netflix
Сandidate topic: world
Gold label 1
Predicted 

In [128]:
clf = LogisticRegression(random_state=42, class_weight='balanced', multi_class='ovr', warm_start=True).fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [190]:
test_X

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0

In [129]:
clf.score(test_X, test_y)

0.6542477456098719

In [95]:
dialogues[100]

[['internets'],
 ['Walt Disney', 'Company', 'Minnie Mouse'],
 ['disney', 'disney', 'film', 'minnie', 'she'],
 ['Mickey', 'mouse', 'Disney'],
 [],
 ['Bill Nye', 'He', 'consultant', 'Flubber'],
 ['he', 'he'],
 ['he', 'supercomputer'],
 ['7', 'movies'],
 ['55,000', 'Wall', '-', 'E', 'Walter Elias Disney'],
 ['robot', 'he', 'mustache'],
 ['he', 'Kanye West', 'Kanye', 'he', 'aids', 'man made'],
 ['Kanye', 'science'],
 ['true', 'he', 'he', 'fan', 'wheelchair', 'concerts', 'him'],
 ['he', 'entertainment', 'he', 'science'],
 ['he',
  'SNL',
  'skit',
  'he',
  'award shows',
  'two',
  'Taylor Swift',
  'award show'],
 ['he', 'his', 'depravity', 'him'],
 ['rap', 'Tupac', 'his', 'music', 'library', 'of', 'Congress', 'He'],
 ['Tupac'],
 ['he', 'Eminem', 'he', 'rapper', 'comic book artist', 'he'],
 ['he', 'the man', 'tales of the street'],
 ['Eminem',
  'judge',
  'suit',
  'Eminem',
  '2001',
  'rap',
  'judges',
  'sense of humor'],
 [],
 ['Rugrats', 'Busta Rhymes', 'it']]

In [None]:
data["t_2007cef7-7cd3-4cf6-8332-b0f4893398b0"]

In [8]:
data["t_2007cef7-7cd3-4cf6-8332-b0f4893398b0"]

[{'text': "Good evening!  Do you know much about Maryland's governor?",
  'cobot_ner': {'response': [{'text': 'Maryland', 'label': 'location'},
    {'text': 'governor', 'label': 'position'}],
   'model_version': 'v1.1'}},
 {'text': 'Not too much,  so you?  Did you know that ralph lawrence carr was the only governor to oppose the internement of japanese americans during ww2?',
  'cobot_ner': {'response': [{'text': 'ralph lawrence carr',
     'label': 'person'},
    {'text': 'governor', 'label': 'position'},
    {'text': 'internement', 'label': 'misc'},
    {'text': 'japanese americans', 'label': 'misc'},
    {'text': 'ww2', 'label': 'event'}],
   'model_version': 'v1.1'}},
 {'text': "I didn't know that.  Maryland's governor, Larry Hogan, is a republican who was under some hot water for policies on climate change.",
  'cobot_ner': {'response': [{'text': 'Maryland', 'label': 'location'},
    {'text': 'governor', 'label': 'position'},
    {'text': 'Larry Hogan', 'label': 'person'},
    {'t

In [5]:
data.keys()

dict_keys(['t_bde29ce2-4153-4056-9eb7-f4ad710505fe', 't_1abc9c37-387d-4013-8691-88ef8c010e58', 't_1a600621-5ad4-409c-a812-bc0b2bb03aa6', 't_01269680-99c3-4ab4-9df3-23901e0623c9', 't_c4f84350-a9e8-4928-bde8-5193b62388e0', 't_222ac48a-a52e-401a-a1c9-b2436edd8096', 't_38454270-bfca-4263-8513-3fb4a05fd376', 't_23942b1a-e379-41bd-826d-3e500f4f0e1a', 't_f289d280-afcd-46f5-97e1-64a9e3aedc70', 't_d61bd2b1-d554-406f-afb9-5a745a7a9ad1', 't_16c24f06-0def-4fc5-9e47-7789d93a84d4', 't_627b1554-1831-4fd3-999e-31852c1e9e3b', 't_5b13346f-0d4b-4e90-a722-a9928b88f277', 't_8af7f149-75d1-46f3-88ae-89d9a99620f2', 't_d529bdce-9d9a-429b-9e7f-7c51fdd9d1b4', 't_a767af2e-018a-4c06-beb3-1ddff22f2048', 't_ffa72db2-8ba9-4469-940c-739abf2b54bb', 't_3e6d55ea-4004-4beb-b8c7-0d03f13c8acb', 't_74482268-77d3-4163-a15e-eeff030f9a26', 't_e64823ee-4285-4296-b7c6-fa9065e0e3bb', 't_a8ed3d81-bb90-4dcc-a2ff-bd28331cc6fe', 't_a3697ee8-1a49-46e6-99c1-4b518edc3851', 't_81182dc9-142d-4047-91e1-477d59c2a25c', 't_7d121559-d936-4046-8