# TED Talk Topic Extraction with NMF

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline

In [90]:
# Transcript text for each talk; later cells match these rows to the
# metadata through the `url` column.
transcript_df = pd.read_csv('data/transcripts.csv')

# Talk metadata. The two date columns are handed to `parse_dates`, and the
# three columns listed in `converters` are run through `literal_eval` so each
# cell is loaded as a Python object instead of its string representation.
main_df = pd.read_csv(
    'data/ted_main.csv',
    parse_dates=['film_date', 'published_date'],
    converters=dict.fromkeys(['ratings', 'tags', 'related_talks'], literal_eval),
)

In [96]:
# The NLTK stop-word corpus is a separate download. Fetch it only when the
# lookup fails, so this cell runs on a fresh environment without having to
# un-comment a download line by hand.
try:
    en_stop = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    en_stop = set(nltk.corpus.stopwords.words('english'))

In [92]:
# Tunable settings for the vectorizer and the topic model.
MIN_DF = 0.1       # forwarded to TfidfVectorizer(min_df=...)
MAX_DF = 0.2       # forwarded to TfidfVectorizer(max_df=...)
NUM_TOPICS = 10    # forwarded to NMF(n_components=...)
# Extra words merged into the NLTK English stop-word set before vectorizing.
MORE_STOPWORDS = {'talk', 'applause', 'laughter', 'yeah', 'oh', 'guy'}

In [93]:
# Candidate values for a parameter search over min_df, max_df and the number
# of topics. NOTE(review): no cell visible in this notebook reads these grids.
min_grid = np.arange(start=0.1, stop=1.0, step=0.1)
max_grid = min_grid.copy()  # same values, independent array
topics_grid = list(range(5, 12, 2))

In [94]:
# Merge the notebook-specific stop words into the NLTK set. A set union is
# idempotent, so re-running this cell leaves `en_stop` unchanged.
en_stop = en_stop | MORE_STOPWORDS

# scikit-learn releases that validate constructor parameters accept only
# 'english', a list, or None for `stop_words`; a set is rejected there.
# Passing a sorted list keeps the same words and works on older releases too.
vectorizer = TfidfVectorizer(stop_words=sorted(en_stop),
                             use_idf=True,
                             ngram_range=(1, 1),
                             min_df=MIN_DF,
                             max_df=MAX_DF)

# Learn the vocabulary and build the TF-IDF matrix from the transcripts.
tfidf = vectorizer.fit_transform(transcript_df['transcript'])

# Fit NMF model

In [95]:
nmf = NMF(n_components=NUM_TOPICS,
          random_state=42)

# Fit the factorization on the TF-IDF matrix and keep the per-document
# topic weights it returns.
topics = nmf.fit_transform(tfidf)

# Look the vocabulary up once, outside the loop. `get_feature_names` was
# removed from recent scikit-learn releases in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out(), dtype=object)
else:
    feature_names = np.asarray(vectorizer.get_feature_names(), dtype=object)

top_n_words = 5
# topic id -> its strongest words / the matching component weights
t_words, word_strengths = {}, {}
for t_id, t in enumerate(nmf.components_):
    # Indices of the `top_n_words` largest weights, strongest first.
    top_idx = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = feature_names[top_idx].tolist()
    word_strengths[t_id] = t[top_idx]
for k, v in t_words.items():
    print(k, v)

0 ['father', 'god', 'war', 'girl', 'parents']
1 ['music', 'sound', 'playing', 'sounds', 'audience']
2 ['species', 'animals', 'planet', 'sea', 'animal']
3 ['cancer', 'disease', 'medical', 'blood', 'hospital']
4 ['africa', 'economic', 'companies', 'economy', 'china']
5 ['computer', 'machine', 'internet', 'digital', 'computers']
6 ['cities', 'car', 'cars', 'street', 'driving']
7 ['universe', 'theory', 'sun', 'planet', 'black']
8 ['students', 'education', 'learning', 'language', 'schools']
9 ['cells', 'cell', 'blood', 'disease', 'lab']


# Testing out NMF fit on 5 randomly sampled talks

In [97]:
# Chain the already-fitted vectorizer and NMF model so raw text can be turned
# into topic weights with a single `transform` call.
pipe = Pipeline([
    ('tfidf', vectorizer),
    ('nmf', nmf)
])

# Seeded draw without replacement: the spot check is reproducible across
# kernel restarts and never shows the same talk twice.
n_samples = min(5, len(transcript_df))
rand_doc_ids = np.random.RandomState(42).choice(len(transcript_df),
                                                size=n_samples,
                                                replace=False)

for doc_id in rand_doc_ids:
    # `transform` expects an iterable of documents, hence the one-element list.
    doc_topics = pipe.transform([transcript_df['transcript'].iloc[doc_id]])
    print('Predicted most likely topic: ', t_words[np.argmax(doc_topics)])
    # Look up the metadata row(s) that share this transcript's URL.
    talk_data = main_df[main_df.url == transcript_df.iloc[doc_id]['url']]
    print('Talk Title:', talk_data.title.values)
    print('Actual Talk Tags:', talk_data.tags.values)
    print('----------------------------------------------------')

Predicted most likely topic:  ['computer', 'machine', 'internet', 'digital', 'computers']
Talk Title: ['Meet Rezero, the dancing ballbot']
Actual Talk Tags: [list(['creativity', 'design', 'engineering', 'robots', 'technology'])]
----------------------------------------------------
Predicted most likely topic:  ['father', 'god', 'war', 'girl', 'parents']
Talk Title: ['Military robots and the future of war']
Actual Talk Tags: [list(['design', 'drones', 'future', 'global issues', 'robots', 'technology', 'violence', 'war'])]
----------------------------------------------------
Predicted most likely topic:  ['universe', 'theory', 'sun', 'planet', 'black']
Talk Title: ['How to air-condition outdoor spaces']
Actual Talk Tags: [list(['TEDx', 'alternative energy', 'architecture', 'design', 'ecology', 'energy', 'engineering', 'entertainment', 'environment', 'global issues', 'industrial design', 'infrastructure', 'innovation', 'invention', 'public spaces', 'science', 'solar energy', 'sports', 'su

# Applying topic extraction to every talk

In [98]:
def get_topic(transcript):
    """Return the top-word list of the strongest topic for one transcript.

    The transcript is wrapped in a one-element list and passed through the
    module-level `pipe`; the position of the largest value in the result is
    used as the key into the module-level `t_words` mapping.
    """
    topic_weights = pipe.transform([transcript])
    return t_words[np.argmax(topic_weights)]

In [99]:
# Label every transcript with the word list returned by `get_topic`.
transcript_df['topic_pred'] = transcript_df['transcript'].map(get_topic)

In [107]:
# Join metadata and transcripts on `url`, then leave the transcript column
# out of the merged frame.
merged_df = (
    main_df
    .merge(transcript_df, on='url')
    .drop(columns='transcript')
)

In [112]:
# Preview the first ten labelled talks.
merged_df[['title', 'description', 'topic_pred']].head(10)

Unnamed: 0,title,description,topic_pred
0,Do schools kill creativity?,Sir Ken Robinson makes an entertaining and pro...,"[students, education, learning, language, scho..."
1,Averting the climate crisis,With the same humor and humanity he exuded in ...,"[cities, car, cars, street, driving]"
2,Simplicity sells,New York Times columnist David Pogue takes aim...,"[computer, machine, internet, digital, computers]"
3,Greening the ghetto,"In an emotionally charged talk, MacArthur-winn...","[africa, economic, companies, economy, china]"
4,The best stats you've ever seen,You've never seen data presented like this. Wi...,"[africa, economic, companies, economy, china]"
5,Why we do what we do,"Tony Robbins discusses the ""invisible forces"" ...","[computer, machine, internet, digital, computers]"
6,Letting go of God,When two young Mormon missionaries knock on Ju...,"[father, god, war, girl, parents]"
7,Behind the design of Seattle's library,Architect Joshua Prince-Ramus takes the audien...,"[computer, machine, internet, digital, computers]"
8,Let's teach religion -- all religion -- in sch...,Philosopher Dan Dennett calls for religion -- ...,"[father, god, war, girl, parents]"
9,A life of purpose,"Pastor Rick Warren, author of ""The Purpose-Dri...","[father, god, war, girl, parents]"


In [113]:
# Write the labelled talks to disk (the DataFrame index is written as well).
merged_df.to_csv(path_or_buf='labelled_data.csv')