# TED Talk Topic Extraction with [NMF](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline

In [2]:
# read data into pandas dataframes
transcript_df = pd.read_csv('data/transcripts.csv')
# ratings, tags and related_talks are decoded with literal_eval while reading,
# so each cell becomes a Python object instead of its string form.
# film_date / published_date are intentionally NOT passed to parse_dates: the
# merged frame shown further down still displays them as raw epoch integers and
# film_year is derived with pd.to_datetime(..., unit='s'), so the option did not
# convert them here. Convert explicitly with unit='s' wherever dates are needed.
main_df = pd.read_csv('data/ted_main.csv',
                      converters={'ratings': literal_eval, 'tags': literal_eval, 'related_talks': literal_eval})

In [3]:
# load NLTK's English stopword list; if the corpus has not been downloaded on
# this machine yet, fetch it once and retry so a fresh-kernel run does not fail
try:
    en_stop = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    en_stop = set(nltk.corpus.stopwords.words('english'))

In [4]:
# set model parameters
MIN_DF, MAX_DF = 0.1, 0.2  # min_df / max_df handed to the TF-IDF vectorizer
NUM_TOPICS = 10            # number of NMF components
# extra words filtered out on top of the NLTK stopword list
MORE_STOPWORDS = {'talk', 'applause', 'laughter', 'yeah', 'oh', 'guy'}

In [5]:
# compile stopwords and create TFIDF vectorizer for transcript data
en_stop = en_stop | MORE_STOPWORDS
# stop_words is passed as a (sorted) list rather than a set: scikit-learn
# documents the parameter as 'english', a list or None, and recent releases
# validate the type; sorting also keeps the vectorizer's parameters stable
# between runs
vectorizer = TfidfVectorizer(stop_words=sorted(en_stop),
                             use_idf=True,
                             ngram_range=(1, 1),
                             min_df=MIN_DF,
                             max_df=MAX_DF)

# learn the vocabulary / idf weights and build the document-term matrix
tfidf = vectorizer.fit_transform(transcript_df['transcript'])

# Fit NMF model

In [6]:
# create, fit nmf model and transform tfidf vectorizer
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
topics = nmf.fit_transform(tfidf)

# fetch the vocabulary once instead of once per word; newer scikit-learn only
# offers get_feature_names_out(), older releases only get_feature_names()
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# print top 5 words associated with each fitted topic
top_n_words = 5
t_words, word_strengths = {}, {}
for t_id, component in enumerate(nmf.components_):
    # indices of the top_n_words largest weights, strongest first
    top_idx = component.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [feature_names[i] for i in top_idx]
    word_strengths[t_id] = component[top_idx]
for k, v in t_words.items():
    print(k,v)

0 ['father', 'god', 'war', 'girl', 'parents']
1 ['music', 'sound', 'playing', 'sounds', 'audience']
2 ['species', 'animals', 'planet', 'sea', 'animal']
3 ['cancer', 'disease', 'medical', 'blood', 'hospital']
4 ['africa', 'economic', 'companies', 'economy', 'china']
5 ['computer', 'machine', 'internet', 'digital', 'computers']
6 ['cities', 'car', 'cars', 'street', 'driving']
7 ['universe', 'theory', 'sun', 'planet', 'black']
8 ['students', 'education', 'learning', 'language', 'schools']
9 ['cells', 'cell', 'blood', 'disease', 'lab']


# Testing out NMF fit on 5 randomly sampled talks

In [7]:
# create sklearn pipeline for simple fit+transform of talks
pipe = Pipeline([
    ('tfidf', vectorizer),
    ('nmf', nmf)
])

# test out on 5 randomly sampled talks; a seeded generator keeps the sample
# identical across re-runs, and replace=False avoids drawing a talk twice
sample_rng = np.random.RandomState(42)
n_samples = min(5, len(transcript_df))
rand_doc_ids = sample_rng.choice(len(transcript_df), size=n_samples, replace=False)
for doc_id in rand_doc_ids:
    sample_row = transcript_df.iloc[doc_id]
    topic_weights = pipe.transform([sample_row['transcript']])
    print('Predicted most likely topic: ', t_words[np.argmax(topic_weights)])
    # look up the talk's metadata through the url column both frames share
    talk_data = main_df[main_df.url == sample_row['url']]
    print('Talk Title:', talk_data.title.values)
    print('Actual Talk Tags:', talk_data.tags.values)
    print('----------------------------------------------------')

Predicted most likely topic:  ['father', 'god', 'war', 'girl', 'parents']
Talk Title: ['How to speak so that people want to listen']
Actual Talk Tags: [list(['culture', 'sound', 'speech'])]
----------------------------------------------------
Predicted most likely topic:  ['computer', 'machine', 'internet', 'digital', 'computers']
Talk Title: ['How to make a splash in social media']
Actual Talk Tags: [list(['Internet', 'animals', 'business', 'culture', 'entertainment', 'entrepreneur', 'oceans', 'web'])]
----------------------------------------------------
Predicted most likely topic:  ['cities', 'car', 'cars', 'street', 'driving']
Talk Title: ['The ghastly tragedy of the suburbs']
Actual Talk Tags: [list(['alternative energy', 'architecture', 'cars', 'cities', 'consumerism', 'culture', 'design', 'energy', 'transportation'])]
----------------------------------------------------
Predicted most likely topic:  ['father', 'god', 'war', 'girl', 'parents']
Talk Title: ['Drawing on humor for c


# Applying topic extraction to every talk

In [8]:
# define helper functions to retrieve topic and topic id for each talk in DF
def get_topic(transcript):
    """Return the top-word list of the highest-weighted topic for one transcript.

    The transcript is pushed through `pipe`; the position of the largest
    resulting weight is used as the key into `t_words`.
    """
    topic_weights = pipe.transform([transcript])
    best_topic_id = np.argmax(topic_weights)
    return t_words[best_topic_id]

def get_topic_id(transcript):
    """Return the position of the largest weight `pipe` assigns to one transcript."""
    return pipe.transform([transcript]).argmax()

In [9]:
# get topic weights for each transcript (one row per transcript, one column per topic)
weights_df = pd.DataFrame(pipe.transform(transcript_df.transcript))
# name the columns t1_weight .. tN_weight from the actual number of topic
# columns, so changing NUM_TOPICS does not require editing a hand-written list
weights_df.columns = ['t{}_weight'.format(topic_num)
                      for topic_num in range(1, weights_df.shape[1] + 1)]

In [10]:
# predict the topic id of every transcript once, then look the topic's words up
# from that id; calling both helpers per row ran the pipeline twice per talk
topic_ids = transcript_df.transcript.map(get_topic_id)
transcript_df['topic_pred'] = topic_ids.map(lambda topic_id: t_words[topic_id])
transcript_df['topic_pred_id'] = topic_ids

In [11]:
# attach the per-topic weight columns to the transcripts by matching row index
transcript_df = pd.merge(transcript_df, weights_df, left_index=True, right_index=True)

In [12]:
# combine talk metadata with the topic predictions via the shared url column,
# leave out the raw transcript text, and preview the first 5 rows
merged_df = main_df.merge(transcript_df, on='url').drop('transcript', axis=1)
merged_df.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,t1_weight,t2_weight,t3_weight,t4_weight,t5_weight,t6_weight,t7_weight,t8_weight,t9_weight,t10_weight
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,...,0.06342,0.034262,0.00216,0.0,0.0,0.0,0.0,0.0,0.170427,0.0
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,...,0.027531,0.002607,0.023396,0.0,0.056169,0.025449,0.120641,0.0,0.0,0.0
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,...,0.069503,0.059341,0.0,0.0,0.009043,0.133906,0.000362,0.0,0.0,0.004225
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,...,0.042297,0.00136,0.017086,0.0,0.089942,0.0,0.087356,0.007359,0.012474,0.0
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,...,0.0,0.0,1.4e-05,0.0,0.166522,0.0,0.0,0.0,0.062717,0.0


# Data cleaning

In [13]:
# get year for each talk (film_date is interpreted as seconds since the Unix epoch)
film_timestamps = pd.to_datetime(merged_df['film_date'], unit='s')
merged_df['film_year'] = film_timestamps.dt.year

In [14]:
# get the top (most-selected) rating for each talk
def get_top_rating(rating):
    """Return the 'name' of the entry in `rating` that has the largest 'count'.

    `rating` is a sequence of dicts with 'name' and 'count' keys. When several
    entries share the largest count, the earliest one is returned.
    """
    top_entry = max(rating, key=lambda entry: entry['count'])
    return top_entry['name']
# label every talk with its most-selected rating name
merged_df['top_rating'] = merged_df['ratings'].apply(get_top_rating)

In [15]:
# number of talks (non-null urls) per top rating value
merged_df.groupby('top_rating')[['url']].count()

Unnamed: 0_level_0,url
top_rating,Unnamed: 1_level_1
Beautiful,142
Confusing,2
Courageous,82
Fascinating,251
Funny,159
Informative,711
Ingenious,101
Inspiring,851
Jaw-dropping,49
Longwinded,8


In [16]:
# number of talks (non-null urls) per predicted topic id
merged_df.groupby('topic_pred_id')[['url']].count()

Unnamed: 0_level_0,url
topic_pred_id,Unnamed: 1_level_1
0,510
1,145
2,240
3,95
4,362
5,518
6,195
7,95
8,200
9,107


In [17]:
# one row per topic id, one column per top word (strongest word first)
pd.DataFrame.from_dict(t_words, orient='index')

Unnamed: 0,0,1,2,3,4
0,father,god,war,girl,parents
1,music,sound,playing,sounds,audience
2,species,animals,planet,sea,animal
3,cancer,disease,medical,blood,hospital
4,africa,economic,companies,economy,china
5,computer,machine,internet,digital,computers
6,cities,car,cars,street,driving
7,universe,theory,sun,planet,black
8,students,education,learning,language,schools
9,cells,cell,blood,disease,lab


In [18]:
# IGNORE: prototype kept for reference only. Everything below is wrapped in a
# triple-quoted string, so none of it executes; the cell just evaluates the
# string and echoes it as its output.
# NOTE(review): the snippet calls heapq.nlargest and operator.itemgetter, but
# neither heapq nor operator is imported in this notebook's import cell, so
# add those imports before re-enabling it.
"""
# Emulate this with d3 for lollipop chart
topic_dict = {}
for i in range(len(t_words)):
    tag_dict = {}
    df_filtered = merged_df.loc[merged_df.topic_pred_id == i]
    for tag_list in list(df_filtered.tags):
        for tag in tag_list:
            if tag in tag_dict:
                tag_dict[tag] += 1
            else:
                tag_dict[tag] = 1
    top_ten_tags = heapq.nlargest(10, tag_dict.items(), key=operator.itemgetter(1))
    topic_dict[i] = top_ten_tags
topic_dict

# create demo topic/tag data for lollipop chart prototype, then write out to csv
demo_topic_tag_df = pd.DataFrame(topic_dict[0])
demo_topic_tag_df.columns = ['tag', 'count']
demo_topic_tag_df.to_csv('data/demo_topic_tag.csv')
"""

"\n# Emulate this with d3 for lollipop chart\ntopic_dict = {}\nfor i in range(len(t_words)):\n    tag_dict = {}\n    df_filtered = merged_df.loc[merged_df.topic_pred_id == i]\n    for tag_list in list(df_filtered.tags):\n        for tag in tag_list:\n            if tag in tag_dict:\n                tag_dict[tag] += 1\n            else:\n                tag_dict[tag] = 1\n    top_ten_tags = heapq.nlargest(10, tag_dict.items(), key=operator.itemgetter(1))\n    topic_dict[i] = top_ten_tags\ntopic_dict\n\n# create demo topic/tag data for lollipop chart prototype, then write out to csv\ndemo_topic_tag_df = pd.DataFrame(topic_dict[0])\ndemo_topic_tag_df.columns = ['tag', 'count']\ndemo_topic_tag_df.to_csv('data/demo_topic_tag.csv')\n"

In [19]:
# write the labelled talks to disk; the row index is kept as the first column
merged_df.to_csv('data/labelled_data.csv', index=True)