# TED Talk Topic Extraction with NMF

In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import heapq
import heapq
import operator
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline

In [3]:
# Raw talk transcripts; later cells read the 'transcript' and 'url' columns.
transcript_df = pd.read_csv('data/transcripts.csv')

# These columns are run through literal_eval while the file is read, so each
# cell arrives as a Python object instead of its string representation.
literal_columns = ('ratings', 'tags', 'related_talks')
main_df = pd.read_csv(
    'data/ted_main.csv',
    parse_dates=['film_date', 'published_date'],
    converters={column: literal_eval for column in literal_columns},
)

In [4]:
# One-time setup: uncomment the next line to download the NLTK stop-word corpus
# if it is not already available locally.
#nltk.download('stopwords')
# English stop words as a set; a later cell unions MORE_STOPWORDS into it.
en_stop = set(nltk.corpus.stopwords.words('english'))

In [5]:
# Tunable settings for the TF-IDF + NMF topic model.
MIN_DF = 0.1     # passed to TfidfVectorizer as min_df
MAX_DF = 0.2     # passed to TfidfVectorizer as max_df
NUM_TOPICS = 10  # number of NMF components (topics)
# Extra words added to the NLTK English stop-word list.
MORE_STOPWORDS = {'talk', 'applause', 'laughter', 'yeah', 'oh', 'guy'}

In [6]:
# Candidate values for a parameter search: document-frequency bounds from 0.1
# in steps of 0.1 (stop value 1.0 excluded) and a few topic counts.
# NOTE(review): these grids are not used by any cell visible in this notebook.
min_grid = np.arange(0.1, 1.0, 0.1)
max_grid = min_grid.copy()  # same values, independent array
topics_grid = [5, 7, 9, 11]

In [7]:
# Extend the NLTK stop words with the extra words from MORE_STOPWORDS.
# (Set union, so re-running this cell leaves en_stop unchanged.)
en_stop = en_stop | MORE_STOPWORDS

# scikit-learn documents stop_words as 'english', a list, or None; releases
# that validate parameter types reject a set. Passing a sorted list removes
# exactly the same words and works across versions.
vectorizer = TfidfVectorizer(stop_words=sorted(en_stop),
                             use_idf=True,
                             ngram_range=(1, 1),
                             min_df=MIN_DF,
                             max_df=MAX_DF)

# Learn the vocabulary / IDF weights and build the document-term matrix
# from the talk transcripts.
tfidf = vectorizer.fit_transform(transcript_df['transcript'])

# Fit NMF model

In [8]:
# Factorise the TF-IDF matrix into NUM_TOPICS topics; fixed seed for repeatability.
nmf = NMF(n_components=NUM_TOPICS,
          random_state=42)

# Per-document topic weights returned by the fit.
topics = nmf.fit_transform(tfidf)
top_n_words = 5

# Fetch the vocabulary once instead of once per word lookup.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# t_words: topic id -> its top_n_words highest-weighted terms (strongest first)
# word_strengths: topic id -> the matching component weights
t_words, word_strengths = {}, {}
for t_id, t in enumerate(nmf.components_):
    # argsort is ascending; the reversed slice takes the last top_n_words
    # indices, i.e. the largest weights in descending order.
    top_idx = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [str(feature_names[i]) for i in top_idx]
    word_strengths[t_id] = t[top_idx]
for k, v in t_words.items():
    print(k,v)

0 ['father', 'god', 'war', 'girl', 'parents']
1 ['music', 'sound', 'playing', 'sounds', 'audience']
2 ['species', 'animals', 'planet', 'sea', 'animal']
3 ['cancer', 'disease', 'medical', 'blood', 'hospital']
4 ['africa', 'economic', 'companies', 'economy', 'china']
5 ['computer', 'machine', 'internet', 'digital', 'computers']
6 ['cities', 'car', 'cars', 'street', 'driving']
7 ['universe', 'theory', 'sun', 'planet', 'black']
8 ['students', 'education', 'learning', 'language', 'schools']
9 ['cells', 'cell', 'blood', 'disease', 'lab']


# Testing out NMF fit on 5 randomly sampled talks

In [9]:
# Chain the already-fitted vectorizer and NMF model so raw transcript text can
# be mapped straight to topic weights.
pipe = Pipeline([
    ('tfidf', vectorizer),
    ('nmf', nmf)
])

# Draw the sample from a seeded generator so the same talks are shown on every
# run; the original draw used the unseeded global RNG and was not reproducible.
# randint samples with replacement, so a talk can appear more than once.
sample_rng = np.random.RandomState(42)
rand_doc_ids = sample_rng.randint(0, len(transcript_df), 5)

for doc_id in rand_doc_ids:
    t = pipe.transform([transcript_df['transcript'].iloc[doc_id]])
    # The highest-weighted topic is reported as the prediction.
    print('Predicted most likely topic: ', t_words[np.argmax(t)])
    # Look up the talk's metadata through the URL shared by both tables.
    talk_data = main_df[main_df.url == transcript_df.iloc[doc_id]['url']]
    print('Talk Title:', talk_data.title.values)
    print('Actual Talk Tags:', talk_data.tags.values)
    print('----------------------------------------------------')

Predicted most likely topic:  ['universe', 'theory', 'sun', 'planet', 'black']
Talk Title: ['How I fell in love with quasars, blazars and our incredible universe']
Actual Talk Tags: [list(['NASA', 'TED Fellows', 'astronomy', 'dark matter', 'math', 'physics', 'science', 'telescopes', 'universe'])]
----------------------------------------------------
Predicted most likely topic:  ['students', 'education', 'learning', 'language', 'schools']
Talk Title: ['A teacher growing green in the South Bronx']
Actual Talk Tags: [list(['TEDx', 'business', 'education', 'garden', 'green'])]
----------------------------------------------------
Predicted most likely topic:  ['computer', 'machine', 'internet', 'digital', 'computers']
Talk Title: ['1,000 TED Talks in six words']
Actual Talk Tags: [list(['TEDx', 'statistics', 'visualizations'])]
----------------------------------------------------
Predicted most likely topic:  ['computer', 'machine', 'internet', 'digital', 'computers']
Talk Title: ["Governme

# Applying topic extraction to every talk

In [81]:
def get_topic(transcript):
    """Return the top words of the highest-weighted topic for one transcript.

    The transcript is passed through the module-level ``pipe`` as a
    single-document batch; the position of the largest weight is used as the
    key into the module-level ``t_words`` mapping.
    """
    topic_weights = pipe.transform([transcript])
    return t_words[np.argmax(topic_weights)]

def get_topic_id(transcript):
    """Return the index of the highest-weighted topic for one transcript.

    The transcript is passed through the module-level ``pipe`` as a
    single-document batch and the position of the largest weight is returned.
    """
    topic_weights = pipe.transform([transcript])
    strongest_topic = np.argmax(topic_weights)
    return strongest_topic

In [82]:
# Push each transcript through the pipeline once and derive both columns from
# that single pass; mapping get_topic and get_topic_id separately transformed
# every transcript twice.
topic_ids = transcript_df.transcript.map(get_topic_id)
# Columns are added in the original order: topic_pred first, then topic_pred_id.
transcript_df['topic_pred'] = topic_ids.map(lambda topic_id: t_words[topic_id])
transcript_df['topic_pred_id'] = topic_ids

In [84]:
# Join the talk metadata with the per-transcript predictions on the shared
# 'url' column, then leave out the transcript text column.
merged_df = (
    main_df
    .merge(transcript_df, on='url')
    .drop(columns=['transcript'])
)

In [85]:
# Preview the first ten talks alongside their predicted topic.
preview_columns = ['title', 'description', 'topic_pred', 'topic_pred_id']
merged_df[preview_columns].head(10)

Unnamed: 0,title,description,topic_pred,topic_pred_id
0,Do schools kill creativity?,Sir Ken Robinson makes an entertaining and pro...,"[students, education, learning, language, scho...",8
1,Averting the climate crisis,With the same humor and humanity he exuded in ...,"[cities, car, cars, street, driving]",6
2,Simplicity sells,New York Times columnist David Pogue takes aim...,"[computer, machine, internet, digital, computers]",5
3,Greening the ghetto,"In an emotionally charged talk, MacArthur-winn...","[africa, economic, companies, economy, china]",4
4,The best stats you've ever seen,You've never seen data presented like this. Wi...,"[africa, economic, companies, economy, china]",4
5,Why we do what we do,"Tony Robbins discusses the ""invisible forces"" ...","[computer, machine, internet, digital, computers]",5
6,Letting go of God,When two young Mormon missionaries knock on Ju...,"[father, god, war, girl, parents]",0
7,Behind the design of Seattle's library,Architect Joshua Prince-Ramus takes the audien...,"[computer, machine, internet, digital, computers]",5
8,Let's teach religion -- all religion -- in sch...,Philosopher Dan Dennett calls for religion -- ...,"[father, god, war, girl, parents]",0
9,A life of purpose,"Pastor Rick Warren, author of ""The Purpose-Dri...","[father, god, war, girl, parents]",0


# Data cleaning

In [86]:
# Derive the calendar year in which each talk was filmed.
# NOTE(review): film_date is also listed in parse_dates when ted_main.csv is
# read; unit='s' presumably only matters if the column is still numeric epoch
# seconds at this point — confirm the dtype.
merged_df['film_year'] = pd.to_datetime(merged_df.film_date, unit='s').dt.year

In [87]:
def get_top_rating(rating):
    """Return the 'name' of the rating entry with the largest 'count'.

    ``rating`` is a sequence of dicts that each have 'name' and 'count' keys.
    When several entries share the largest count, the first one wins.
    An empty sequence raises ValueError.
    """
    winner = max(rating, key=lambda entry: entry['count'])
    return winner['name']

In [88]:
# Label each talk with the name of its highest-count rating (see get_top_rating).
merged_df['top_rating'] = merged_df.ratings.map(get_top_rating)

In [99]:
# Number of talks per top rating (count of non-null 'url' values in each group).
merged_df.groupby(['top_rating'])[['url']].count()

Unnamed: 0_level_0,url
top_rating,Unnamed: 1_level_1
Beautiful,142
Confusing,2
Courageous,82
Fascinating,251
Funny,159
Informative,711
Ingenious,101
Inspiring,851
Jaw-dropping,49
Longwinded,8


In [100]:
# Number of talks assigned to each predicted topic id (count of non-null 'url' values).
merged_df.groupby(['topic_pred_id'])[['url']].count()

Unnamed: 0_level_0,url
topic_pred_id,Unnamed: 1_level_1
0,510
1,145
2,240
3,95
4,362
5,518
6,195
7,95
8,200
9,107


In [157]:
# Emulate this with d3
# For every topic id, tally the tags of the talks assigned to that topic and
# keep the ten most frequent (tag, count) pairs, largest count first.
topic_dict = {}
for topic_id in range(len(t_words)):
    topic_tags = merged_df.loc[merged_df.topic_pred_id == topic_id, 'tags']
    tag_counts = {}
    for talk_tags in topic_tags:
        for tag in talk_tags:
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    topic_dict[topic_id] = heapq.nlargest(
        10, tag_counts.items(), key=operator.itemgetter(1)
    )

In [194]:
# Show the ten most frequent (tag, count) pairs for each predicted topic id.
topic_dict

{0: [('culture', 147),
  ('global issues', 110),
  ('TEDx', 104),
  ('society', 85),
  ('entertainment', 81),
  ('humanity', 78),
  ('social change', 77),
  ('identity', 65),
  ('storytelling', 64),
  ('communication', 60)],
 1: [('music', 84),
  ('entertainment', 74),
  ('live music', 47),
  ('performance', 46),
  ('technology', 36),
  ('culture', 32),
  ('creativity', 27),
  ('art', 26),
  ('design', 18),
  ('TEDx', 18)],
 2: [('science', 131),
  ('environment', 80),
  ('animals', 74),
  ('technology', 64),
  ('biology', 62),
  ('oceans', 55),
  ('nature', 53),
  ('exploration', 43),
  ('biodiversity', 43),
  ('global issues', 42)],
 3: [('health', 50),
  ('science', 49),
  ('medicine', 43),
  ('health care', 39),
  ('cancer', 35),
  ('technology', 28),
  ('medical research', 28),
  ('disease', 19),
  ('culture', 17),
  ('illness', 17)],
 4: [('global issues', 190),
  ('business', 118),
  ('economics', 90),
  ('technology', 71),
  ('politics', 61),
  ('culture', 57),
  ('TEDx', 57),


In [94]:
# Show the top words of each NMF topic, keyed by topic id.
t_words

{0: ['father', 'god', 'war', 'girl', 'parents'],
 1: ['music', 'sound', 'playing', 'sounds', 'audience'],
 2: ['species', 'animals', 'planet', 'sea', 'animal'],
 3: ['cancer', 'disease', 'medical', 'blood', 'hospital'],
 4: ['africa', 'economic', 'companies', 'economy', 'china'],
 5: ['computer', 'machine', 'internet', 'digital', 'computers'],
 6: ['cities', 'car', 'cars', 'street', 'driving'],
 7: ['universe', 'theory', 'sun', 'planet', 'black'],
 8: ['students', 'education', 'learning', 'language', 'schools'],
 9: ['cells', 'cell', 'blood', 'disease', 'lab']}

In [195]:
# Build the frame in this cell so it displays on a fresh kernel: the cell that
# originally defined demo_topic_tag_df sits further down the notebook, so
# running the cells top to bottom raised a NameError here.
demo_topic_tag_df = pd.DataFrame(topic_dict[0], columns=['tag', 'count'])
demo_topic_tag_df

Unnamed: 0,tag,count
0,culture,147
1,global issues,110
2,TEDx,104
3,society,85
4,entertainment,81
5,humanity,78
6,social change,77
7,identity,65
8,storytelling,64
9,communication,60


In [193]:
# Tag counts for topic 0 as a two-column table, written out as CSV.
# The columns are named in the constructor rather than assigned afterwards:
# assigning .columns raised a length-mismatch error when topic 0 had no tags,
# because an empty frame has zero columns.
demo_topic_tag_df = pd.DataFrame(topic_dict[0], columns=['tag', 'count'])
# The DataFrame index is written as the first, unnamed CSV column.
demo_topic_tag_df.to_csv('data/demo_topic_tag.csv')

In [167]:
# Persist the labelled talks (metadata plus the columns added above); the
# DataFrame index is written as the first, unnamed CSV column.
merged_df.to_csv('data/labelled_data.csv')