### Import the libraries

In [254]:
import pandas as pd
import numpy as np

In [256]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

### Some useful functions to process text

In [257]:
import re
import os
import sys

import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from bs4 import BeautifulSoup
import unicodedata
from textblob import TextBlob

nlp = spacy.load('en_core_web_sm')


def get_wordcounts(x):
    length = len(str(x).split())
    return length


def get_charcounts(x):
    s = x.split()
    x = ''.join(s)
    return len(x)


def get_avg_wordlength(x):
    count = get_charcounts(x) / get_wordcounts(x)
    return count


def get_stopwords_counts(x):
    l = len([t for t in x.split() if t in stopwords])
    return l


def get_hashtag_counts(x):
    l = len([t for t in x.split() if t.startswith('#')])
    return l


def get_mentions_counts(x):
    l = len([t for t in x.split() if t.startswith('@')])
    return l


def get_digit_counts(x):
    return len([t for t in x.split() if t.isdigit()])


def get_uppercase_counts(x):
    return len([t for t in x.split() if t.isupper()])


def cont_exp(x):
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how does",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        " u ": " you ",
        " ur ": " your ",
        " n ": " and ",
        "won't": "would not",
        'dis': 'this',
        'bak': 'back',
        'brng': 'bring'}

    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x


def get_emails(x):
    emails = re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', x)
    counts = len(emails)

    return counts, emails


def remove_emails(x):
    return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", x)


def get_urls(x):
    urls = re.findall(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)
    counts = len(urls)

    return counts, urls


def remove_urls(x):
    return re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)


def remove_rt(x):
    return re.sub(r'\brt\b', '', x).strip()


def remove_special_chars(x):
    x = re.sub(r'[^\w ]+', "", x)
    x = ' '.join(x.split())
    return x


def remove_html_tags(x):
    return BeautifulSoup(x, 'lxml').get_text().strip()


def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x


def remove_stopwords(x):
    return ' '.join([t for t in x.split() if t not in stopwords])


def make_base(x):
    x = str(x)
    x_list = []
    doc = nlp(x)

    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-' or lemma == 'be':
            lemma = token.text

        x_list.append(lemma)
    return ' '.join(x_list)


def get_value_counts(df, col):
    text = ' '.join(df[col])
    text = text.split()
    freq = pd.Series(text).value_counts()
    return freq


def remove_common_words(x, freq, n=20):
    fn = freq[:n]
    x = ' '.join([t for t in x.split() if t not in fn])
    return x


def remove_rarewords(x, freq, n=20):
    fn = freq.tail(n)
    x = ' '.join([t for t in x.split() if t not in fn])
    return x


def spelling_correction(x):
    x = TextBlob(x).correct()
    return x


### Load the dataset

In [353]:
df_all = pd.read_csv('ted_talks_en.csv')

In [354]:
df_all.head()

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,1,Averting the climate crisis,Al Gore,{0: 'Al Gore'},{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world’...,3523392,2006-02-25,2006-06-27,TED2006,en,"['ar', 'bg', 'cs', 'de', 'el', 'en', 'es', 'fa...",272.0,977,"['alternative energy', 'cars', 'climate change...","{243: 'New thinking on the climate crisis', 54...",https://www.ted.com/talks/al_gore_averting_the...,With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
1,92,The best stats you've ever seen,Hans Rosling,{0: 'Hans Rosling'},{0: ['global health expert; data visionary']},"{0: 'In Hans Rosling’s hands, data sings. Glob...",14501685,2006-02-22,2006-06-27,TED2006,en,"['ar', 'az', 'bg', 'bn', 'bs', 'cs', 'da', 'de...",628.0,1190,"['Africa', 'Asia', 'Google', 'demo', 'economic...","{2056: ""Own your body's data"", 2296: 'A visual...",https://www.ted.com/talks/hans_rosling_the_bes...,You've never seen data presented like this. Wi...,"About 10 years ago, I took on the task to teac..."
2,7,Simplicity sells,David Pogue,{0: 'David Pogue'},{0: ['technology columnist']},{0: 'David Pogue is the personal technology co...,1920832,2006-02-24,2006-06-27,TED2006,en,"['ar', 'bg', 'de', 'el', 'en', 'es', 'fa', 'fr...",124.0,1286,"['computers', 'entertainment', 'interface desi...","{1725: '10 top time-saving tech tips', 2274: '...",https://www.ted.com/talks/david_pogue_simplici...,New York Times columnist David Pogue takes aim...,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,53,Greening the ghetto,Majora Carter,{0: 'Majora Carter'},{0: ['activist for environmental justice']},{0: 'Majora Carter redefined the field of envi...,2664069,2006-02-26,2006-06-27,TED2006,en,"['ar', 'bg', 'bn', 'ca', 'cs', 'de', 'en', 'es...",219.0,1116,"['MacArthur grant', 'activism', 'business', 'c...",{1041: '3 stories of local eco-entrepreneurshi...,https://www.ted.com/talks/majora_carter_greeni...,"In an emotionally charged talk, MacArthur-winn...",If you're here today — and I'm very happy that...
4,66,Do schools kill creativity?,Sir Ken Robinson,{0: 'Sir Ken Robinson'},"{0: ['author', 'educator']}","{0: ""Creativity expert Sir Ken Robinson challe...",65051954,2006-02-25,2006-06-27,TED2006,en,"['af', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs...",4931.0,1164,"['children', 'creativity', 'culture', 'dance',...","{865: 'Bring on the learning revolution!', 173...",https://www.ted.com/talks/sir_ken_robinson_do_...,Sir Ken Robinson makes an entertaining and pro...,Good morning. How are you? (Audience) Good. It...


In [355]:
df = df_all[['transcript', 'topics']].copy()

In [356]:
df.head()

Unnamed: 0,transcript,topics
0,"Thank you so much, Chris. And it's truly a gre...","['alternative energy', 'cars', 'climate change..."
1,"About 10 years ago, I took on the task to teac...","['Africa', 'Asia', 'Google', 'demo', 'economic..."
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...","['computers', 'entertainment', 'interface desi..."
3,If you're here today — and I'm very happy that...,"['MacArthur grant', 'activism', 'business', 'c..."
4,Good morning. How are you? (Audience) Good. It...,"['children', 'creativity', 'culture', 'dance',..."


#### Convert string in the topics column to list

In [357]:
df.loc[0, 'topics']

"['alternative energy', 'cars', 'climate change', 'culture', 'environment', 'global issues', 'science', 'sustainability', 'technology']"

In [358]:
import  ast

In [359]:
ast.literal_eval(df.loc[0, 'topics'])

['alternative energy',
 'cars',
 'climate change',
 'culture',
 'environment',
 'global issues',
 'science',
 'sustainability',
 'technology']

In [360]:
df.loc[:, 'topics'] = df.loc[:, 'topics'].apply(lambda x: ast.literal_eval(x))

In [361]:
df.loc[0, 'topics']

['alternative energy',
 'cars',
 'climate change',
 'culture',
 'environment',
 'global issues',
 'science',
 'sustainability',
 'technology']

### Remove unwanted stuff from the transcripts

In [362]:
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: remove_emails(x))
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: remove_urls(x))
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: remove_special_chars(x))
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: remove_accented_chars(x))
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: remove_stopwords(x))
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: remove_html_tags(x))
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: cont_exp(x.lower()))
# df.loc['transcript'] = df.loc['transcript'].apply(lambda x: spelling_correction(x.lower()))

### Lemmatization

In [363]:
df.loc[:,'transcript'] = df.loc[:,'transcript'].apply(lambda x: make_base(x))

In [364]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4005 entries, 0 to 4004
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  4005 non-null   object
 1   topics      4005 non-null   object
dtypes: object(2)
memory usage: 62.7+ KB


### Convert topics list to multi column binary values

In [365]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df["topics"])

In [366]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [367]:
y.shape

(4005, 457)

### Get the binarized dataframe

In [368]:
classes  = multilabel.classes_

In [369]:
bin_topics_df = pd.DataFrame(y, columns=classes)

In [370]:
bin_topics_df

Unnamed: 0,3D printing,AI,AIDS,Africa,Alzheimer's,Antarctica,Anthropocene,Asia,Audacious Project,Autism spectrum disorder,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4003,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Retain only the popular and meaningful topics

In [371]:
list(bin_topics_df.sum(axis=0).sort_values(ascending=False).head(30).index)

['science',
 'technology',
 'culture',
 'TEDx',
 'global issues',
 'TED-Ed',
 'society',
 'design',
 'social change',
 'animation',
 'business',
 'health',
 'history',
 'education',
 'humanity',
 'innovation',
 'biology',
 'entertainment',
 'future',
 'art',
 'communication',
 'creativity',
 'community',
 'activism',
 'medicine',
 'brain',
 'collaboration',
 'personal growth',
 'environment',
 'economics']

In [372]:
popular_topics = ['science',
 'technology',
 'culture',
 'global issues',
 'society',
 'social change',
 'business',
 'health',
 'history',
 'education',
 'humanity',
 'innovation',
 'biology',
 'entertainment',
 'future',
 'art',
 'communication',
 'creativity',
 'medicine',
 'personal growth',
 'environment',
 'economics']

In [373]:
popular_topics_df = bin_topics_df.loc[:,popular_topics]

In [374]:
popular_topics_df

Unnamed: 0,science,technology,culture,global issues,society,social change,business,health,history,education,...,biology,entertainment,future,art,communication,creativity,medicine,personal growth,environment,economics
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4001,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4002,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4003,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


### Attach the popular DF to the main DF

In [375]:
df = pd.concat([df,popular_topics_df], axis=1)

In [376]:
df.loc[:,'science'].sum()

993

### Remove the rows that have no topics

In [380]:
df = df.loc[~(df[list(popular_topics_df)] == 0).all(axis=1)]

### Convert the transcripts to Tf-Idf vectors

In [382]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=4000)

In [383]:
X= tfidf.fit_transform(df['transcript'])

In [384]:
y = df.loc[:, list(popular_topics_df)]

In [385]:
y

Unnamed: 0,science,technology,culture,global issues,society,social change,business,health,history,education,...,biology,entertainment,future,art,communication,creativity,medicine,personal growth,environment,economics
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4001,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4002,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4003,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


### Split the data into training and test sets

In [489]:
X.shape, y.shape

((3719, 4000), (3719, 22))

In [490]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state = 0)

In [491]:
X_train.shape, X_test.shape

((3347, 4000), (372, 4000))

In [492]:
#### Since this is a multilabel classifier we need to import the appropriate libraries

In [493]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score

### Train using Logistic regression

In [507]:
lr = LogisticRegression()

In [508]:
clf = OneVsRestClassifier(lr)

In [509]:
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [510]:
y_pred = clf.predict(X_test)

In [511]:
#[None, 'micro', 'macro', 'weighted', 'samples']
jaccard_score(y_test, y_pred, average='samples')

0.260160650281618

### Test with our random sample document

In [512]:
test_transcript = tfidf.transform(['''the basic technical idea behind deep learning in your 
                                    networks have been around for decades why are they only just now taking off in this video let's go 
                                    over some of the main drivers behind the rise of deep learning because I think this will help you 
                                    that the spot the best opportunities within your own organization'''])
filters = list(clf.predict(test_transcript)[0] == 1)
pred_topics = [i for (i, v) in zip(popular_topics, filters) if v] 
pred_topics

[]

## :( Accuracy not satisfactory

--------

### Let's see if Support vector machine helps

In [523]:
from sklearn.svm import LinearSVC

In [532]:
svm = LinearSVC()
clf = OneVsRestClassifier(svm)
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=LinearSVC())

In [533]:
y_pred = clf.predict(X_test)

In [534]:
#[None, 'micro', 'macro', 'weighted', 'samples']
jaccard_score(y_test, y_pred, average='samples')

0.34471646185355864

In [535]:
test_transcript = tfidf.transform(['''the basic technical idea behind deep learning in your 
                                    networks have been around for decades why are they only just now taking off in this video let's go 
                                    over some of the main drivers behind the rise of deep learning because I think this will help you 
                                    that the spot the best opportunities within your own organization'''])
filters = list(clf.predict(test_transcript)[0] == 1)
pred_topics = [i for (i, v) in zip(popular_topics, filters) if v] 
pred_topics

['education']