# Machine Learning on TED Talk Dataset
* Use TED Talk transcripts as input, and use ratings and training labels. 

## Import Libraries

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Reshape, Bidirectional
from tensorflow.keras import backend as K

import string

## Import Dataset

In [2]:
main_filepath = './ted_main.csv'
transcript_filepath = './transcripts.csv'

In [3]:
main = pd.read_csv(main_filepath)
transcripts = pd.read_csv(transcript_filepath)

In [80]:
main.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


In [81]:
transcripts.head()

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [85]:
# merge datasets with 'inner join' on urls
dataset = pd.merge(left=main, right=transcripts, left_on='url', right_on='url')

## Analyze Main Dataset Features
* Length of dataset
* Number of Columns

In [88]:
# dataset.head()

In [89]:
columns = dataset.columns

In [90]:
new_columns = ['transcript', 'ratings', 'comments', 'duration', 'name', 'url']
new_dataset = dataset[new_columns]

In [91]:
new_dataset.head()

Unnamed: 0,transcript,ratings,comments,duration,name,url
0,Good morning. How are you?(Laughter)It's been ...,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",4553,1164,Ken Robinson: Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...","[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",265,977,Al Gore: Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...","[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",124,1286,David Pogue: Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",200,1116,Majora Carter: Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...","[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",593,1190,Hans Rosling: The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...


In [92]:
print('Number of TED Talks: ', len(new_dataset))

Number of TED Talks:  2467


## Dataset Preprocessing

In [104]:
# Transcript cleaning

'''
Remove punctuation from a `transcript` string
'''
def clean(transcript): 
    cleaned = transcript.translate(str.maketrans('', '', string.punctuation))
    
    # Temporarily remove `Laughter` from the transcript, but could be used later
    cleaned = cleaned.replace('Laughter', '')
    cleaned = cleaned.replace('Applause', '')
    
    # Remove hyphens 
    cleaned = cleaned.replace('—', '')
    return cleaned

'''
For each transcript in pandas dataset, clean each transcript. 
'''
def clean_transcripts(dataset):
    clean_dataset = dataset.copy()
    for index, ted_talk in clean_dataset.iterrows(): 
#         print(ted_talk)
        transcript = ted_talk[0]
        clean_script = clean(transcript)
        if index == 1:
            print(clean_script)
        clean_dataset.at[index, 'transcript'] = clean_script
    return clean_dataset

In [105]:
clean_dataset = clean_transcripts(new_dataset)

Thank you so much Chris And its truly a great honor to have the opportunity to come to this stage twice Im extremely grateful I have been blown away by this conference and I want to thank all of you for the many nice comments about what I had to say the other night And I say that sincerely partly because Mock sob I need thatPut yourselves in my positionI flew on Air Force Two for eight yearsNow I have to take off my shoes or boots to get on an airplaneIll tell you one quick story to illustrate what thats been like for meIts a true story  every bit of this is trueSoon after Tipper and I left the  Mock sob White House we were driving from our home in Nashville to a little farm we have 50 miles east of Nashville Driving ourselvesI know it sounds like a little thing to you but I looked in the rearview mirror and all of a sudden it just hit me There was no motorcade back thereYouve heard of phantom limb painThis was a rented Ford TaurusIt was dinnertime and we started looking for a place to

In [106]:
clean_dataset.head()

Unnamed: 0,transcript,ratings,comments,duration,name,url
0,Good morning How are youIts been great hasnt i...,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",4553,1164,Ken Robinson: Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...
1,Thank you so much Chris And its truly a great ...,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",265,977,Al Gore: Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...
2,Music The Sound of Silence Simon GarfunkelHel...,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",124,1286,David Pogue: Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...
3,If youre here today and Im very happy that yo...,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",200,1116,Majora Carter: Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...
4,About 10 years ago I took on the task to teach...,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",593,1190,Hans Rosling: The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...


## Word Embeddings

In [None]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

[--------------------------------------------------] 0.1% 0.9/1662.8MB downloaded