# TED talk transcripts

Data source: the [TED Talks dataset on Kaggle](https://www.kaggle.com/rounakbanik/ted-talks/data).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import mechanicalsoup
import requests
import re
import sys
import numpy as np
import seaborn as sns
import time
import os
from glob import glob
import scipy
import collections
import helper
import numpy as np
import copy

# Seaborn theme: "notebook" context with the "whitegrid" style.
sns.set('notebook', style="whitegrid")
# Default figure size and title font size for the plots in this notebook.
plt.rcParams.update({
    'figure.figsize': (16, 5),
    'axes.titlesize': 16,
})

# Folder that is scanned for the saved *.html pages.
HTML_FOLDER = 'data/html'

In [None]:
# Load the talk transcripts; the `transcript` and `url` columns are used below.
df = pd.read_csv('data/transcripts.csv')
def get_word_count(t):
    """Return the number of whitespace-separated words in ``t``.

    Splitting on runs of any whitespace (instead of on a single space)
    keeps repeated spaces, tabs and newlines from being counted as extra
    "words", and makes an empty or blank string count as 0.

    Args:
        t: Text of one document.

    Returns:
        Number of words as an int.
    """
    return len(t.split())
# Number of words per transcript.
df['word_count'] = df.transcript.apply(get_word_count)
# URLs are the join key for the merge with the HTML data further down,
# so surrounding whitespace is removed first.
df['url'] = df.url.str.strip()
# Keep one row per URL. Dropping duplicates on the `url` column Series
# alone never removes rows from the frame, so it is done on the frame.
df = df.drop_duplicates(subset='url')

## Retrieve tags and HTML for the transcripts

In [None]:
def get_html_data(folder = HTML_FOLDER):
    """Load the saved HTML pages found in ``folder`` into a DataFrame.

    Each ``*.html`` file is read as: the talk URL, a blank line, then the
    HTML of the page. Both parts are stripped of surrounding whitespace.

    Args:
        folder: Directory that is searched (non-recursively) for
            ``*.html`` files.

    Returns:
        DataFrame with the columns ``url`` and ``html`` and one row per
        distinct URL. When several files carry the same URL, the first
        one in sorted file-name order is kept. The columns exist even
        when no file was found.

    Raises:
        ValueError: If a file does not contain the blank-line separator.
    """
    html_data = {'url': [], 'html': []}
    # Sorted so that "first file wins" is reproducible between runs.
    # NOTE(review): files are opened with the platform default encoding.
    for file in sorted(glob('{}/*.html'.format(folder))):
        with open(file) as f:
            content = f.read()
        url, html = [x.strip() for x in content.split('\n\n', 1)]
        html_data['url'].append(url)
        html_data['html'].append(html)
    df_html = pd.DataFrame(html_data, columns=['url', 'html'])
    # De-duplicate on the frame itself; doing it on the `url` column Series
    # would leave the duplicate rows in place.
    return df_html.drop_duplicates(subset='url').reset_index(drop=True)

# Load every saved page as (url, html) rows.
df_html = get_html_data()

# Join transcripts with their HTML on the URL; validate='one_to_one' makes
# pandas raise if a URL occurs more than once on either side.
df_ = df.merge(
    right=df_html,
    on='url',
    validate='one_to_one',
)
# Tags are extracted from the page HTML by the project's helper module.
df_['tags'] = df_['html'].apply(helper.get_tags_from_html)
df_['num_tags'] = df_['tags'].apply(len)

In [None]:
# Distribution of transcript lengths, with the median marked in red.
fig, ax = plt.subplots()
df.word_count.plot(kind='hist', bins=120, ax=ax, title='Histogram of #words per document')
ax.axvline(df.word_count.median(), c='red')
ax.set_xlabel('Word count per document')
# The string 'off' is truthy, so current matplotlib would switch the grid
# ON; False hides it.
ax.grid(False)
fig.tight_layout()

In [None]:
# Distribution of the number of tags attached to each document.
fig, ax = plt.subplots()
df_.num_tags.plot(kind='hist', bins=70, title='Histogram of # tags per document', ax=ax)
# The string 'off' is truthy, so current matplotlib would switch the grid
# ON; False hides it.
ax.grid(False)
ax.set_xlabel('# tags of document')
fig.tight_layout()

## Merge tags

In [None]:
def get_all_tags_as_list(all_tags):
    """Flatten per-document tag collections into one list.

    Args:
        all_tags: Iterable of iterables of tags (one per document).

    Returns:
        List with every tag of every document, in iteration order;
        repeated tags are kept.
    """
    return [tag for doc_tags in all_tags for tag in doc_tags]

def get_tags_as_set(all_tags):
    """Return a list holding one ``set`` of tags per entry of ``all_tags``."""
    return list(map(set, all_tags))

def get_tag_mappings(tags):
    """Build the tag <-> index lookup tables.

    Indices follow the sorted order of ``tags``.

    Args:
        tags: Iterable of sortable tags.

    Returns:
        Tuple ``(tag_2_idx, idx_2_tag)`` of two dicts that are inverse to
        each other.
    """
    ordered = sorted(tags)
    tag_2_idx = dict(zip(ordered, range(len(ordered))))
    idx_2_tag = {position: name for name, position in tag_2_idx.items()}
    return tag_2_idx, idx_2_tag

def get_correlation_matrix(all_tags, tag_2_idx, symmetric = True):
    """Count how often each pair of tags occurs in the same document.

    Args:
        all_tags: Iterable with one collection of tags per document. Any
            iterable works (list, tuple, set, ...).
        tag_2_idx: Mapping from tag to its row/column index.
        symmetric: If True (default) the count of a pair is stored at both
            ``[a, b]`` and ``[b, a]``, independent of the order in which
            the two tags appear inside a document. If False, ``[a, b]``
            only counts documents in which ``a`` is iterated before ``b``.

    Returns:
        Dense square matrix (as produced by ``todense()``) with one
        row/column per entry of ``tag_2_idx``.

    Raises:
        KeyError: If a document contains a tag missing from ``tag_2_idx``.
    """
    # Make sure the sparse submodule is loaded; a bare `import scipy` does
    # not guarantee that on every SciPy version.
    import scipy.sparse

    num_unique_tags = len(tag_2_idx)
    cooccurrence_mat = scipy.sparse.lil_matrix((num_unique_tags, num_unique_tags))
    for tags in all_tags:
        tags = list(tags)  # allow sets and other non-sliceable iterables
        for i, tag in enumerate(tags[:-1]):
            row = tag_2_idx[tag]
            for tag2 in tags[i + 1:]:
                cooccurrence_mat[row, tag_2_idx[tag2]] += 1
    cooccurrence_mat = cooccurrence_mat.todense()
    if symmetric:
        # A pair may have been counted as (a, b) in one document and as
        # (b, a) in another. Adding the transpose sums both directions;
        # an element-wise maximum would silently drop the smaller one.
        # The diagonal is subtracted once so it is not doubled.
        diagonal = np.diag(np.diag(np.asarray(cooccurrence_mat)))
        cooccurrence_mat = cooccurrence_mat + cooccurrence_mat.T - diagonal
    return cooccurrence_mat

def get_number_of_occurrences(tag, all_tags_list):
    """Return how often ``tag`` occurs in ``all_tags_list``.

    Args:
        tag: Tag to look up.
        all_tags_list: Flat iterable of tags (repeats allowed), or an
            already built ``collections.Counter`` of tag counts, which is
            used as-is instead of being recounted.

    Returns:
        Number of occurrences of ``tag``.

    Raises:
        AssertionError: If ``tag`` does not occur at all. Raised
            explicitly so the check also runs under ``python -O``.
    """
    if isinstance(all_tags_list, collections.Counter):
        number_of_occurrences = all_tags_list
    else:
        number_of_occurrences = collections.Counter(all_tags_list)
    if tag not in number_of_occurrences:
        raise AssertionError('Tag "{}" not in all_tags_list'.format(tag))
    return number_of_occurrences[tag]

def merge_with_most_correlated(tag, cooccurrence_mat, tag_2_idx, idx_2_tag, correlation_treshold , all_tags_list):
    """Pick the tag that ``tag`` should be merged into, if any.

    The candidate is the column with the highest value in the row of
    ``tag`` in ``cooccurrence_mat``.

    Args:
        tag: Tag to find a merge target for.
        cooccurrence_mat: Square co-occurrence matrix.
        tag_2_idx: Mapping tag -> matrix index.
        idx_2_tag: Mapping matrix index -> tag.
        correlation_treshold: Minimum co-occurrence value required for a
            merge.
        all_tags_list: Tag occurrences, forwarded to
            get_number_of_occurrences().

    Returns:
        The candidate tag, or None when its co-occurrence value is below
        the threshold or when ``tag`` occurs more often than the candidate.
    """
    row_idx = tag_2_idx[tag]
    best_idx = np.argmax(cooccurrence_mat[row_idx])
    if cooccurrence_mat[row_idx, best_idx] < correlation_treshold:
        return None

    candidate = idx_2_tag[best_idx]
    own_count = get_number_of_occurrences(tag, all_tags_list)
    candidate_count = get_number_of_occurrences(candidate, all_tags_list)
    # Only merge into a tag that is at least as frequent as this one.
    return candidate if own_count <= candidate_count else None


def get_merge_map(tags, all_tags_list, _cooccurrence_map, tag_2_idx, idx_2_tag, correlation_threshold = 3):
    """Compute the merge target of every tag in ``tags``.

    Args:
        tags: Tags to find a merge target for.
        all_tags_list: Flat list of all tag occurrences.
        _cooccurrence_map: Square co-occurrence matrix.
        tag_2_idx: Mapping tag -> matrix index.
        idx_2_tag: Mapping matrix index -> tag.
        correlation_threshold: Minimum co-occurrence value for a merge.

    Returns:
        Dict mapping each tag to the result of
        merge_with_most_correlated() for it (a tag, or None).
    """
    # Count the occurrences once here rather than for every single lookup.
    # A Counter can stand in for the flat list downstream: the occurrence
    # lookup yields the same counts from either.
    occurrences = collections.Counter(all_tags_list)
    return {
        tag: merge_with_most_correlated(
            tag,
            _cooccurrence_map,
            tag_2_idx,
            idx_2_tag,
            correlation_treshold=correlation_threshold,
            all_tags_list=occurrences,
        )
        for tag in tags
    }

def invert_merge_map(merge_map):
    """Group the keys of ``merge_map`` by the value they map to.

    Args:
        merge_map: Dict mapping a source tag to its target (may be None).

    Returns:
        ``defaultdict`` mapping each target to the list of its sources,
        in the iteration order of ``merge_map``.
    """
    sources_by_target = collections.defaultdict(list)
    for source, target in merge_map.items():
        sources_by_target[target].append(source)
    return sources_by_target

def get_labels_after_merge(labels, merge_map):
    """Apply ``merge_map`` to the tags of every document.

    For each document the entries of ``merge_map`` are processed in the
    dict's iteration order: a source tag present in the document is
    removed and, unless its target is None, replaced by the target. A
    target added this way can be rewritten again by a later entry.

    Args:
        labels: Iterable with one collection of tags per document.
        merge_map: Dict mapping source tag -> target tag or None.

    Returns:
        List with one new ``set`` of tags per document; the inputs are
        not modified.
    """
    merged_labels = []
    for original in labels:
        current = set(original)
        for source, target in merge_map.items():
            if source not in current:
                continue
            current.remove(source)
            if target is not None:
                current.add(target)
        merged_labels.append(current)
    return merged_labels

def get_unique_tags(all_tags):
    """Return the set of all distinct tags over every document."""
    return {tag for doc_tags in all_tags for tag in doc_tags}


In [None]:
# Per-document tags from the merged frame.
all_tags = df_.tags
# One set of tags per document.
all_tags_set = get_tags_as_set(all_tags)
# Every tag occurrence in one flat list (repeats kept).
all_tags_list = get_all_tags_as_list(all_tags)
# Distinct tags in sorted order, and their index lookup tables.
sorted_tags = sorted(set(all_tags_list))
tag_2_idx, idx_2_tag = get_tag_mappings(sorted_tags)
# Tag-by-tag co-occurrence counts over all documents.
cooccurrence_mat = get_correlation_matrix(all_tags, tag_2_idx)

In [None]:
def get_merges(num_iterations, correlation_threshold, _all_tags_set):
    """Run several rounds of merging tags into their most co-occurring tag.

    Every round rebuilds the tag index, the co-occurrence matrix and the
    flat occurrence list from the *current* labels, so that the second and
    later rounds no longer work with occurrence counts from before the
    first merge. Tags are processed in sorted order, which makes the
    resulting merge map independent of set iteration order.

    Args:
        num_iterations: Number of merge rounds.
        correlation_threshold: Minimum co-occurrence value, forwarded to
            get_merge_map().
        _all_tags_set: Iterable with one collection of tags per document.
            It is not modified.

    Returns:
        Tuple ``(tags, flat_tags)``: the sorted distinct tags left after
        the last round, and a flat list with every remaining tag
        occurrence of every document. The shape is the same for
        ``num_iterations == 0``.
    """
    labels = [set(tags) for tags in _all_tags_set]
    for _ in range(num_iterations):
        tags_per_doc = [list(tags) for tags in labels]
        flattened = get_all_tags_as_list(tags_per_doc)
        unique_tags = sorted(get_unique_tags(labels))
        _tag_2_idx, _idx_2_tag = get_tag_mappings(unique_tags)
        _cooccurrence_map = get_correlation_matrix(all_tags=tags_per_doc, tag_2_idx=_tag_2_idx)
        merge_map = get_merge_map(
            unique_tags,
            flattened,
            _cooccurrence_map,
            _tag_2_idx,
            _idx_2_tag,
            correlation_threshold=correlation_threshold,
        )
        labels = get_labels_after_merge(labels, merge_map)
    return sorted(get_unique_tags(labels)), get_all_tags_as_list(labels)

# Two merge rounds; tags co-occurring fewer than 16 times are not merged.
_sorted_tags_, _all_tags_list = get_merges(
    _all_tags_set=copy.copy(all_tags_set),
    num_iterations=2,
    correlation_threshold=16
)

# NOTE(review): as far as this file shows, get_merges returns a FLAT list of
# tags here, so `tags` below is a single tag and len(tags) is its character
# count, not a number of tags per document; zip() also stops after
# len(df_) entries. The per-document labels are not available in this cell,
# so the loop is left unchanged - confirm the intent before relying on the
# num_tags/num_docs table.
counter = collections.Counter()
for tags, (idx, df__) in zip(_all_tags_list, df_.iterrows()):
    counter[len(tags)] += 1

fig, ax = plt.subplots()
# Plot the value_counts() Series directly: going through
# `.to_frame().tag` fails on pandas >= 2.0, where the counts column is
# named 'count' rather than 'tag'.
pd.DataFrame(_all_tags_list, columns = ['tag']).tag.value_counts().plot(kind = 'bar', ax = ax)
fig.tight_layout()
pd.DataFrame(list(counter.items()), columns = ('num_tags', 'num_docs')).set_index('num_tags').sort_index()

In [None]:
# Row sums of the co-occurrence matrix: total co-occurrences per tag.
correlations = np.sum(cooccurrence_mat, axis = 1)
# The tag list left after merging is bound to `_sorted_tags_` (trailing
# underscore); `_sorted_tags` is not defined at module level.
for i, tag in enumerate(_sorted_tags_[:-1]):
    row = tag_2_idx[tag]
    # Highest co-occurrence value of `tag` with any tag; constant for the
    # inner loop, so computed once per row.
    max_ = np.max(cooccurrence_mat[row])
    # The tags are distinct, so starting at i + 1 visits every unordered
    # pair exactly once.
    for tag2 in _sorted_tags_[i + 1:]:
        print(tag, tag2, max_, cooccurrence_mat[row, tag_2_idx[tag2]])

In [None]:
# Heat map of the tag co-occurrence matrix.
fig, ax = plt.subplots(figsize = (10, 10))
ax.imshow(cooccurrence_mat, cmap = plt.get_cmap('cubehelix'))
# The string 'off' is truthy, so current matplotlib would switch the grid
# ON; False hides it.
ax.grid(False)
fig.tight_layout()

In [None]:
# Flatten BEFORE sorting: the row sums come out as an (n, 1) matrix, and
# argsort along its last axis (length 1) would return only zeros.
correlations = np.asarray(correlations).ravel()
# Tag indices ordered by ascending total co-occurrence.
correlations_idxmax = np.argsort(correlations)
# The ten tags with the highest totals (largest last).
print([(idx_2_tag[idx], correlations[idx]) for idx in correlations_idxmax[-10:]])

In [None]:
def get_correlated_tags(tag):
    """Return the tags that co-occur with ``tag``, highest count first.

    Reads the module-level ``tag_2_idx``, ``idx_2_tag`` and
    ``cooccurrence_mat``.

    Args:
        tag: Tag whose matrix row is inspected.

    Returns:
        List of ``(tag, count)`` tuples for every positive entry in the
        row of ``tag``, in descending order of count.
    """
    counts = np.asarray(cooccurrence_mat[tag_2_idx[tag]])[0]
    descending = np.argsort(counts)[::-1]
    return [(idx_2_tag[col], counts[col]) for col in descending if counts[col] > 0]

# Dump, for every tag (highest total co-occurrence first), its up to ten
# most co-occurring tags as tab-indented lines below the tag name.
with open('data/correlated_tags.txt', 'w') as f:
    for tag_idx in reversed(correlations_idxmax):
        tag = idx_2_tag[tag_idx]
        f.write('{}\n'.format(tag))
        # Slicing clamps to the list length, so shorter lists are fine.
        for t in get_correlated_tags(tag)[:10]:
            f.write('\t{}\n'.format(t))
    # A single newline is written once, after all entries.
    f.write('\n')

In [None]:
# Total co-occurrence per tag (row sums of the matrix), flattened to 1-D.
correlations = np.squeeze(np.asarray(np.sum(cooccurrence_mat, axis = 1)))
def plot_correlations(correlations, log = False):
    """Bar-plot the given values in ascending order.

    Args:
        correlations: 1-D sequence of values to plot.
        log: If True, use a logarithmic y axis.

    Returns:
        The matplotlib Axes of the plot (grid hidden, no x ticks).
    """
    ax = pd.DataFrame(correlations, columns = ['correlation']).sort_values('correlation').plot(kind = 'bar', logy = log)
    # The string 'off' is truthy, so current matplotlib would switch the
    # grid ON; False hides it.
    ax.grid(False)
    ax.set_xticks([])
    return ax

# NOTE(review): the division by 2 presumably compensates for the symmetric
# matrix - confirm.
plot_correlations(correlations / 2)
# Same plot restricted to tags whose total lies strictly between 50 and 150.
plot_correlations(correlations[(correlations > 50) & (correlations < 150)] / 2)