# TED talk transcripts

The data comes from this [Kaggle dataset](https://www.kaggle.com/rounakbanik/ted-talks/data).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import mechanicalsoup
import requests
import re
import sys
import numpy as npii
import seaborn as sns
import time
import os
from glob import glob
import scipy
import collections
import helper
import numpy as np
import copy
from itertools import chain

# Notebook-wide plotting defaults: seaborn "notebook" context with the
# whitegrid style, wide figures and a larger axes-title font.
sns.set(context='notebook', style='whitegrid')
plt.rcParams.update({
    'figure.figsize': (13, 5),
    'axes.titlesize': 16,
})


In [None]:
# Load the talks table through the project-local `helper` module. Later cells
# read its `url_clean`, `transcript`, `word_count`, `num_tags` and `tags`
# columns.
df = helper.get_transcripts_with_tags_df()

## Save transcripts as a dataset, using each talk's URL as its class

In [None]:
import pickle

# Build the two parallel lists straight from the columns. The previous
# row-by-row `iterrows()` loop produced the same lists but materialises one
# Series per row, which is far slower on a large frame.
X = df['transcript'].tolist()  # documents
Y = df['url_clean'].tolist()   # class of each document: the talk's cleaned URL

# NOTE: despite the `.npy` extension the file is written with `pickle.dump`
# and holds the `(X, Y)` tuple; read it back with `pickle.load`.
with open('data/dataset_ted_talks.npy', 'wb') as f:
    pickle.dump((X, Y), f)

## Histogram of # words per document

In [None]:
# Distribution of transcript lengths; the red line marks the median.
fig, ax = plt.subplots()
df.word_count.plot(kind='hist', bins=120, ax=ax, title='Histogram of # words per document')
ax.axvline(df.word_count.median(), c='red')
ax.set_xlabel('Word count per document')
# `ax.grid('off')` hands matplotlib a non-empty (truthy) string, which current
# versions treat as "grid on"; pass a real boolean like the later cells do.
ax.grid(False)
fig.tight_layout()

## Histogram of # tags per document

In [None]:
# Distribution of the number of tags attached to each talk.
fig, ax = plt.subplots()
df.num_tags.plot(kind='hist', bins=70, title='Histogram of # tags per document', ax=ax)
# `ax.grid('off')` hands matplotlib a non-empty (truthy) string, which current
# versions treat as "grid on"; pass a real boolean like the later cells do.
ax.grid(False)
ax.set_xlabel('# tags of document')
fig.tight_layout()

## Merge tags

This tries to merge correlated tags. Two tags are considered correlated if they co-occur in the tags of the same talk.

In [None]:
def get_all_tags_as_list(all_tags):
    """Flatten an iterable of per-talk tag collections into one list (duplicates kept)."""
    return [tag for tags in all_tags for tag in tags]

def get_tags_as_set(all_tags):
    """Return a list holding one set of tags per entry of `all_tags`."""
    return list(map(set, all_tags))

def get_tag_mappings(tags):
    """Build the tag -> index and index -> tag lookups.

    Indices follow the sorted order of `tags`. Returns the pair
    `(tag_2_idx, idx_2_tag)`; the second dict is the inverse of the first.
    """
    tag_2_idx = {}
    for position, name in enumerate(sorted(tags)):
        tag_2_idx[name] = position
    idx_2_tag = dict(zip(tag_2_idx.values(), tag_2_idx.keys()))
    return tag_2_idx, idx_2_tag

def get_correlation_matrix(all_tags, tag_2_idx, symmetric = True):
    """Count how often each pair of tags appears on the same talk.

    Parameters:
        all_tags: iterable of per-talk tag collections (lists or sets).
        tag_2_idx: dict mapping every tag to its row/column index.
        symmetric: when True (default) entry [a, b] equals entry [b, a] and
            holds the total number of co-occurrences of the two tags. When
            False the raw directed counts are returned: each pair is counted
            only at [first, second] in the order the tags were iterated.

    Returns:
        A dense `numpy.matrix` of shape (len(tag_2_idx), len(tag_2_idx)).
    """
    # `import scipy` alone does not guarantee the `sparse` submodule is loaded.
    import scipy.sparse
    from itertools import combinations

    num_unique_tags = len(tag_2_idx)
    cooccurrence_mat = scipy.sparse.lil_matrix((num_unique_tags, num_unique_tags))
    for tags in all_tags:
        # combinations() yields every unordered pair once, in iteration order,
        # and (unlike slicing) also works when `tags` is a set.
        for tag, tag2 in combinations(tags, 2):
            cooccurrence_mat[tag_2_idx[tag], tag_2_idx[tag2]] += 1
    cooccurrence_mat = cooccurrence_mat.todense()
    if symmetric:
        # A pair can be recorded as [a, b] for one talk and [b, a] for another,
        # so the two directions have to be added; taking their maximum (as the
        # previous version did) undercounts. The diagonal is subtracted once so
        # that it is not doubled by the addition.
        diagonal = np.diagflat(np.asarray(cooccurrence_mat).diagonal())
        cooccurrence_mat = cooccurrence_mat + cooccurrence_mat.T - diagonal
    return cooccurrence_mat

def get_number_of_occurrences(tag, all_tags_list):
    """Return how many times `tag` appears in `all_tags_list`.

    Asserts that the tag occurs at least once.
    """
    tally = collections.Counter(all_tags_list)
    is_known = tag in tally
    assert is_known, 'Tag "{}" not in all_tags_list'.format(tag)
    return tally[tag]

def merge_with_most_correlated(tag, cooccurrence_mat, tag_2_idx, idx_2_tag, correlation_treshold, all_tags_list):
    """Pick the tag that `tag` should be merged into, if any.

    The candidate is the tag with the highest entry in `tag`'s row of
    `cooccurrence_mat`. Returns None when that entry is below
    `correlation_treshold`, or when `tag` occurs more often in
    `all_tags_list` than the candidate does; otherwise returns the candidate.
    """
    row_idx = tag_2_idx[tag]
    best_idx = np.argmax(cooccurrence_mat[row_idx])
    if cooccurrence_mat[row_idx, best_idx] < correlation_treshold:
        return None

    candidate = idx_2_tag[best_idx]
    own_count = get_number_of_occurrences(tag, all_tags_list)
    candidate_count = get_number_of_occurrences(candidate, all_tags_list)
    # Only ever merge into a tag that is at least as frequent.
    return candidate if own_count <= candidate_count else None


def get_merge_map(tags, all_tags_list, _cooccurrence_map, tag_2_idx, idx_2_tag, correlation_threshold = 3):
    """Map every tag in `tags` to its merge target (None when it has none)."""
    return {
        tag: merge_with_most_correlated(
            tag,
            _cooccurrence_map,
            tag_2_idx,
            idx_2_tag,
            correlation_treshold=correlation_threshold,
            all_tags_list=all_tags_list,
        )
        for tag in tags
    }

def invert_merge_map(merge_map):
    """Group the keys of `merge_map` by their target: target -> list of sources."""
    sources_by_target = collections.defaultdict(list)
    for source, target in merge_map.items():
        sources_by_target[target].append(source)
    return sources_by_target

def get_labels_after_merge(labels, merge_map):
    """Apply `merge_map` to every tag collection in `labels`.

    For each (source, target) entry, in the map's iteration order, a source
    tag present in the set is removed and, when the target is not None,
    replaced by the target. Returns a list of new sets; the input is untouched.
    """
    def _merge_one(tags):
        merged = set(tags)
        for source, target in merge_map.items():
            if source not in merged:
                continue
            merged.remove(source)
            # A None target means "no merge partner": the tag is only dropped.
            if target is not None:
                merged.add(target)
        return merged

    return [_merge_one(tags) for tags in labels]

def get_unique_tags(all_tags):
    """Return the set of all distinct tags found in any entry of `all_tags`."""
    return set().union(*all_tags)


In [None]:
# Derive the tag views used by the following cells from `df.tags`.
all_tags = df.tags
all_tags_list = get_all_tags_as_list(all_tags)  # flat list, duplicates kept
all_tags_set = get_tags_as_set(all_tags)        # one set per talk
sorted_tags = sorted(set(all_tags_list))
tag_2_idx, idx_2_tag = get_tag_mappings(sorted_tags)
cooccurrence_mat = get_correlation_matrix(all_tags=all_tags, tag_2_idx=tag_2_idx)

In [None]:
def get_merges(num_iterations, correlation_threshold, _all_tags_set, return_labels=False):
    """Repeatedly merge tags into their most co-occurring, more frequent tag.

    Parameters:
        num_iterations: number of merge rounds to run.
        correlation_threshold: minimum co-occurrence count for a merge,
            forwarded to `get_merge_map`.
        _all_tags_set: iterable with one collection of tags per talk. It is
            not modified.
        return_labels: when True, additionally return the per-talk tag sets
            after merging.

    Returns:
        `(tags, flat_tags)`, or `(tags, flat_tags, labels)` when
        `return_labels` is True. `tags` is the list of distinct remaining
        tags, `flat_tags` the flat list of all remaining tag occurrences and
        `labels` a list with one set of tags per talk.

    Fixes over the previous version: the flat occurrence list handed to
    `get_merge_map` is refreshed after every round (it used to stay at its
    initial value, so later rounds compared stale frequencies), and the flat
    list is returned for every `num_iterations`, including 0.
    """
    labels = [set(tags) for tags in _all_tags_set]
    flat_tags = get_all_tags_as_list(labels)
    unique_tags = list(get_unique_tags(labels))
    tag_to_index, index_to_tag = get_tag_mappings(unique_tags)
    cooccurrence = get_correlation_matrix(all_tags=[list(tags) for tags in labels], tag_2_idx=tag_to_index)
    for _ in range(num_iterations):
        merge_map = get_merge_map(unique_tags, flat_tags, cooccurrence, tag_to_index, index_to_tag, correlation_threshold=correlation_threshold)
        labels = get_labels_after_merge(labels, merge_map)
        # Rebuild every derived structure from the merged labels so the next
        # round works on current tags, counts and co-occurrences.
        unique_tags = list(get_unique_tags(labels))
        flat_tags = get_all_tags_as_list(labels)
        tag_to_index, index_to_tag = get_tag_mappings(unique_tags)
        cooccurrence = get_correlation_matrix(all_tags=[list(tags) for tags in labels], tag_2_idx=tag_to_index)
    if return_labels:
        return unique_tags, flat_tags, labels
    return unique_tags, flat_tags

_sorted_tags_, _all_tags_list, _labels_per_doc = get_merges(
    _all_tags_set=copy.copy(all_tags_set),
    num_iterations=2,
    correlation_threshold=10,
    return_labels=True,
)

# Number of documents per number of remaining tags. This has to be counted on
# the per-document tag sets: iterating the flat tag list (as before) takes
# `len()` of each tag *string* and so counts tag-name lengths instead.
counter = collections.Counter(len(tags) for tags in _labels_per_doc)

fig, ax = plt.subplots()
# Occurrences of every remaining tag. A plain Series is used because the name
# of the column produced by `value_counts().to_frame()` depends on the pandas
# version.
pd.Series(_all_tags_list, name='tag').value_counts().plot(kind='bar', ax=ax)
fig.tight_layout()
df__ = pd.DataFrame(list(counter.items()), columns=['num_tags', 'num_docs']).set_index('num_tags').sort_index()
print(df__.num_docs.sum())
df__

In [None]:
# Heat map of the full tag co-occurrence matrix.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(cooccurrence_mat, cmap=plt.get_cmap('cubehelix'))
# `ax.grid('off')` hands matplotlib a non-empty (truthy) string, which current
# versions treat as "grid on"; pass a real boolean like the later cells do.
ax.grid(False)
fig.tight_layout()

In [None]:
# NOTE(review): the original cell called `np.argsort(correlations)` before any
# cell had assigned `correlations`, and only squeezed it to 1-D afterwards.
# It is reconstructed here as the total co-occurrence count per tag (row sums
# of `cooccurrence_mat`), by analogy with how the later "most often tags" cell
# derives its own `correlations` - confirm this is the intended quantity.
correlations = np.squeeze(np.asarray(cooccurrence_mat.sum(axis=1)))
# Tag indices ordered from the smallest to the largest total.
correlations_idxmax = np.argsort(correlations)

In [None]:
def get_correlated_tags(tag):
    """List the tags co-occurring with `tag` as (tag, count) pairs.

    Reads `tag`'s row of the notebook-level `cooccurrence_mat`; only positive
    counts are kept, ordered from the highest count to the lowest.
    """
    counts = np.asarray(cooccurrence_mat[tag_2_idx[tag]])[0]
    descending = np.argsort(counts)[::-1]
    return [(idx_2_tag[i], counts[i]) for i in descending if counts[i] > 0]

# Dump every tag, in reverse `correlations_idxmax` order, followed by up to
# ten of its co-occurring tags (tab-indented, one per line).
with open('data/correlated_tags.txt', 'w') as f:
    for tag_idx in correlations_idxmax[::-1]:
        tag = idx_2_tag[tag_idx]
        f.write('{}\n'.format(tag))
        for t in get_correlated_tags(tag)[:10]:
            f.write('\t{}\n'.format(t))
    # A single blank line closes the file.
    f.write('\n')

## Remove infrequent tags

In [None]:
# How often does each tag occur over all talks?
all_tags = [tag for tags in df.tags.values for tag in tags]
tag_counts = collections.Counter(all_tags)
df_tag_counts = pd.DataFrame(list(tag_counts.items()), columns=['label', 'occurrences'])

# Shared x-range so that both histograms below are directly comparable.
lim = (0, df_tag_counts.occurrences.max())

ax = df_tag_counts.occurrences.plot(kind='hist', bins=300, title='Histogram of tag occurrences')
ax.set_xlabel('number of occurrences of single tag')
ax.set_xlim(*lim)

# Fixed cut-offs; only tags with a count strictly between them are kept.
# (Quantile-based cut-offs, 0.999 and 0.01, were tried and then replaced by
# these constants.)
too_frequent = 1000
too_unfrequent = 10

occurrences = df_tag_counts.occurrences
mask_clipped = (occurrences < too_frequent) & (occurrences > too_unfrequent)
df_tag_counts_clipped = df_tag_counts[mask_clipped]

# Labels of all tags that fall outside the accepted range.
tags_unwanted = set(df_tag_counts[~mask_clipped].label.values)

fig, ax = plt.subplots()
df_tag_counts_clipped.occurrences.plot(kind='hist', bins=300, ax=ax)
ax.set_xlim(*lim)

num_tags = len(df_tag_counts)
num_tags_unwanted = len(tags_unwanted)
print('# tags:\t\t\t{}'.format(num_tags))
print('# unwanted tags:\t{}'.format(num_tags_unwanted))
print('# after tags:\t\t{}'.format(num_tags - num_tags_unwanted))

In [None]:
# Strip the unwanted tags from every talk and record how many tags remain.
df['tags_clean'] = df.tags.apply(lambda tags: set(tags).difference(tags_unwanted))
df['num_tags_clean'] = df.tags_clean.apply(len)

# No talk may end up without any tag.
assert not (df.num_tags_clean == 0).any()

## Most frequent tags

In [None]:
n_top = 30
# Keep the n_top rows with the highest occurrence count, then index and order
# them by label.
most_often = df_tag_counts.sort_values('occurrences')[-n_top:].set_index('label').sort_index()
display(most_often.T)
most_often_tags = most_often.index.values
# Row/column positions of these tags in the full co-occurrence matrix.
most_often_ids = [tag_2_idx[label] for label in most_often_tags]
most_often_occs = most_often.occurrences.values
# Select the sub-matrix spanned by these tags (rows and columns alike).
indices = np.ix_(most_often_ids, most_often_ids)
most_often_coo = cooccurrence_mat[indices]

# Put each tag's own occurrence count on the diagonal ...
ind = list(range(n_top))
most_often_coo[ind, ind] = most_often_occs

# ... and divide by the occurrence counts. The counts broadcast along the last
# axis, so entry [i, j] is divided by the count of tag j and the diagonal
# becomes 1.
most_often_coo /= most_often_occs

In [None]:
def plot_confusion_matrix(cm,
                          classes=None,
                          normalize=True,
                          title='Confusion matrix',
                          round_confusion=2,
                          x_rotation=90,
                          show_non_horizontal_percent=True):
    """
    Plot a confusion matrix as an annotated heat map on a new figure.

    Adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py

    Parameters:
        cm: 2-D array-like; rows are drawn as true labels, columns as
            predicted labels.
        classes: tick labels, one per row/column. Defaults to the indices
            0..n-1 when omitted.
        normalize: divide every row by its sum before plotting. Rows that sum
            to zero are left at zero instead of producing NaN.
        title: axes title.
        round_confusion: number of decimals the cell value is rounded to
            before it is turned into a whole-number percentage; a falsy value
            prints the raw cell value instead.
        x_rotation: rotation of the x tick labels in degrees.
        show_non_horizontal_percent: when False only the diagonal cells are
            annotated.

    Returns None.
    """
    import itertools
    import matplotlib.pyplot as plt

    cm = np.asarray(cm)
    if classes is None:
        classes = list(range(cm.shape[0]))
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # An all-zero row would give 0/0 = NaN, and the annotation below cannot
        # convert NaN to an int; dividing by 1 keeps such rows at zero.
        row_sums[row_sums == 0] = 1
        cm = cm.astype('float') / row_sums

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    fig.colorbar(im)
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=x_rotation)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if not show_non_horizontal_percent and i != j:
            continue
        if round_confusion:
            # Round (not truncate) after scaling: 0.57 * 100 is
            # 56.99999999999999 in floating point, which int() would turn
            # into 56.
            val = int(round(round(cm[i, j], round_confusion) * 100))
        else:
            val = cm[i, j]
        val = '{}%'.format(val)
        ax.text(j, i, val,
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.grid(False)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Lay out last so that the axis labels are taken into account.
    fig.tight_layout()


In [None]:
# Row sums of the normalised sub-matrix, flattened to a 1-D array.
correlations = np.squeeze(np.asarray(most_often_coo.sum(axis = 1)))

# Pair every tag with its row sum and sort ascending by that sum ...
correlations = sorted(list(zip(most_often_tags, correlations)), key=lambda x: x[1])
# ... then keep the ten tags with the smallest sums.
used_tags, _ = zip(*correlations[:10])

# Rebind the "most often" names to this reduced selection.
most_often_tags = used_tags
most_often_ids = [tag_2_idx[label] for label in most_often_tags]
most_often_occs = [most_often.loc[x].occurrences for x in most_often_tags]

# Re-slice the full co-occurrence matrix for the selected tags and divide by
# their occurrence counts. Unlike the earlier cell, the diagonal is not
# overwritten with the counts first.
indices = np.ix_(most_often_ids, most_often_ids)
most_often_coo = cooccurrence_mat[indices]
most_often_coo /= most_often_occs

In [None]:
# Shared text alignment for all annotations of the heat map.
text_dict = dict(horizontalalignment='center', verticalalignment='center')
# The original chained assignment (`text_dict = fontdict = ...`) also bound
# this name at notebook level; kept so nothing relying on it breaks.
fontdict = text_dict
fig, ax = plt.subplots(figsize=(20, 20))
im = ax.imshow(most_often_coo, cmap=plt.get_cmap('hot_r'))
fig.colorbar(im)
ax.grid(False)
for row_idx, row in enumerate(np.asarray(most_often_coo)):
    for cell_idx, cell in enumerate(row):
        if row_idx == cell_idx:
            # Diagonal: show the tag's occurrence count.
            text = most_often_occs[row_idx]
        else:
            text = '{:.0f}%'.format(cell * 100)
        # imshow draws matrix entry [row, column] at x=column, y=row. The
        # previous call passed (row, column) as (x, y), so every label sat on
        # the transposed cell of this non-symmetric matrix.
        ax.text(cell_idx, row_idx, text, fontdict=text_dict, color='blue')

# Replace the numeric ticks by tag names along the top and the left edge.
ax.set_xticks([])
ax.set_yticks([])
for idx, tag in enumerate(most_often_tags):
    ax.text(idx, -0.8, tag, fontdict=text_dict)
    ax.text(-0.6, idx, tag, fontdict=dict(text_dict, **dict(horizontalalignment='right')))

In [None]:
# Tags used as class labels. Earlier selections tried in this notebook were
# ['entertainment', 'health', 'innovation'] and ['entertainment', 'tedx'].
allowed_tags = ['economics', 'environment', 'brain', 'entertainment']

# Check that every selected tag is known and print its occurrence count.
for tag in allowed_tags:
    assert tag in df_tag_counts.values
    occurrences = most_often[most_often.index == tag].occurrences.values[0]
    print('{:22} {}'.format(tag, occurrences))

def get_common_label(labels, allowed_labels = allowed_tags):
    """Return the single label shared by `labels` and `allowed_labels`.

    Asserts that exactly one label is shared.
    """
    shared = set(labels).intersection(allowed_labels)
    assert len(shared) == 1
    return next(iter(shared))
    
def filter_tags(tags):
    """Return True when exactly one of `tags` is in the notebook-level `allowed_tags`."""
    overlap = set(tags).intersection(allowed_tags)
    return len(overlap) == 1
    
# Keep only the talks carrying exactly one of the allowed tags. The explicit
# copy makes `df_filtered` an independent frame, so adding the `label` column
# below neither triggers pandas' SettingWithCopyWarning nor depends on whether
# the selection is a view of `df`.
df_filtered = df[df.tags.apply(filter_tags)].copy()
df_filtered['label'] = df_filtered.tags.apply(get_common_label)

print('\n\nElements after filter: {}'.format(len(df_filtered)))

# Frame indexed by talk URL with the `label` and `transcript` columns.
vals = df_filtered[['url_clean', 'label', 'transcript']].rename(columns={'url_clean': 'url'}).set_index('url')

# NOTE: despite the `.npy` extension the file is written with `pickle.dump`.
with open('data/df_dataset.npy', 'wb') as f:
    pickle.dump(vals, f)

In [None]:
# Import every scikit-learn submodule used in this cell explicitly:
# `import sklearn` on its own does not make `sklearn.pipeline`, `sklearn.svm`,
# `sklearn.dummy`, `sklearn.metrics` or `sklearn.model_selection` available.
import sklearn
import sklearn.dummy
import sklearn.feature_extraction
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.svm

X = df_filtered.transcript.values
Y = df_filtered.label.values

# Both steps are placeholders that the grid search fills in.
pipeline = sklearn.pipeline.Pipeline([
    ('vectorizer', None),
    ('classifier', None)
])

param_grid = dict(
    #vectorizer=[sklearn.feature_extraction.text.CountVectorizer(), sklearn.feature_extraction.text.TfidfVectorizer()],
    vectorizer=[sklearn.feature_extraction.text.TfidfVectorizer()],
    # One classifier entry is enough: `classifier__C` below overrides whatever
    # C the instance was constructed with, so listing LinearSVC(C=1) and
    # LinearSVC(C=0.1) separately only evaluated every setting twice.
    classifier=[sklearn.svm.LinearSVC(class_weight='balanced')],
    classifier__C=[1e-1, 1]
)

# Baseline: a dummy classifier fitted and scored on the same labels.
dummy_clf = sklearn.dummy.DummyClassifier()
dummy_clf.fit([[0]] * len(Y), Y)
Y_pred_dummy = dummy_clf.predict([[0]] * len(Y))
dummy_score = sklearn.metrics.f1_score(Y, Y_pred_dummy, average='macro')

# `return_train_score=True` is required for the `mean_train_score` column
# that the results cell reads from `cv_results_`.
gscv = sklearn.model_selection.GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='f1_macro', verbose=0, return_train_score=True)

gscv_result = gscv.fit(X, Y)
# NOTE(review): predictions are made on the same data the search was fitted
# on, so anything derived from `Y_pred` reflects training performance.
Y_pred = gscv.predict(X)

In [None]:
# Summary: selected tags, sample count, dummy baseline and the CV table.
display(allowed_tags)
print('#elements {}'.format(len(df_filtered)))
display(dummy_score)
cv_table = pd.DataFrame(gscv_result.cv_results_)
shown_columns = ['param_classifier', 'param_vectorizer', 'mean_test_score', 'mean_train_score']
cv_table[shown_columns]

In [None]:
# Confusion matrix of the grid-search predictions, classes in sorted order.
classes = list(allowed_tags)
classes.sort()
cm = sklearn.metrics.confusion_matrix(Y, Y_pred, labels=classes)
plot_confusion_matrix(cm, classes=classes)

## Clustering

In [None]:
# Per-talk cleaned tag sets, and the same tags flattened into one list.
all_tags_ = df.tags_clean.values
all_tags_clean = [tag for tags in df.tags_clean.values for tag in tags]

def get_tag_vector(tags, idx, mat, mapping=tag_2_idx):
    """Set, in row `idx` of `mat`, the column of every tag in `tags` to 1.

    `mapping` translates a tag into its column. `mat` is modified in place
    and nothing is returned.
    """
    columns = list(map(mapping.__getitem__, tags))
    mat[idx, columns] = 1
    
# `import scipy` alone does not guarantee the `sparse` submodule is loaded.
import scipy.sparse

mapping = tag_2_idx
num_unique_labels = len(mapping)
# One row per talk, one boolean column per tag.
tag_mat = scipy.sparse.lil_matrix((len(all_tags_), num_unique_labels), dtype=bool)
# `get_tag_vector` fills `tag_mat` in place and returns None, so a plain loop
# replaces the former list comprehension that only collected those Nones
# (into a `tag_vectors` list that carried no information).
for idx, t in enumerate(all_tags_):
    get_tag_vector(t, idx, tag_mat)
tag_mat = tag_mat.tocsr()
# Sanity checks: one non-zero entry per cleaned tag, one row per talk.
assert len(tag_mat.nonzero()[0]) == len(all_tags_clean)
assert len(df) == tag_mat.shape[0]

In [None]:
# Image of the dense talk x tag indicator matrix.
fig, ax = plt.subplots()
dense_tag_mat = tag_mat.todense()
ax.imshow(dense_tag_mat)
fig.tight_layout()
ax.grid(False)

In [None]:
import sklearn
import sklearn.feature_extraction

# Grid-search results of the clustering cell, keyed by number of clusters.
results = {}

# Bag-of-words and TF-IDF representations of all transcripts.
X = df.transcript.values
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
X_vec = count_vectorizer.fit_transform(X)
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
X_vec_tfidf = tfidf_vectorizer.fit_transform(X)

In [None]:
import sklearn.svm, sklearn.pipeline, sklearn.feature_extraction, sklearn.model_selection, sklearn.cluster, sklearn.dummy

# One KMeans instance is reused; only its number of clusters changes per round.
cluster_clf = sklearn.cluster.KMeans(n_init=300, max_iter=1000)

for n_clusters in [3]:
    print('Starting:', n_clusters)
    cluster_clf.set_params(n_clusters=n_clusters)
    # Cluster the talks on their tag indicator matrix; the cluster ids become
    # the labels the classifiers below are trained on.
    Y = cluster_clf.fit_predict(tag_mat)
    
    # Bar chart of the cluster sizes.
    pd.DataFrame(list(collections.Counter(Y).items()), columns=['label', 'occurrences']).set_index('label').sort_index().occurrences.plot(kind='barh')
    plt.show()
    print('Fitted clusters')
    # Single placeholder step that the grid search replaces with each candidate.
    pipeline = sklearn.pipeline.Pipeline([
        ('classifier', None)
    ])

    # Compare a linear SVM against a dummy baseline.
    param_grid = dict(
        classifier=[sklearn.svm.LinearSVC(class_weight='balanced', C=1), sklearn.dummy.DummyClassifier()]
    )
    
    gscv = sklearn.model_selection.GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='f1_macro', verbose=0)

    # Predict the cluster ids from the bag-of-words transcript vectors.
    gscv_result = gscv.fit(X_vec, Y)
    display(pd.DataFrame(gscv_result.cv_results_))
    # Predictions on the data the search was fitted on.
    Y_pred = gscv.predict(X_vec)
    results[n_clusters] = gscv_result
    print(n_clusters, gscv_result.best_score_)