In [13]:
import pandas as pd
import numpy as np
import scipy.sparse as ss

import pickle
from collections import Counter
from pprint import pprint

import re
import string

import nltk
#nltk.download()
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import spacy
from spacy import displacy
from spacy.symbols import amod
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

import scattertext as st

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pyLDAvis
import pyLDAvis.sklearn


from corextopic import corextopic as ct
from corextopic import vis_topic as vt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import swat
from sklearn.manifold import TSNE

# Get Data

In [14]:
with open('ted_video_transcripts_2416.pkl', 'rb') as picklefile:
    talks = pickle.load(picklefile)

In [103]:
with open('ted_video_stats_2722.pkl', 'rb') as picklefile:
    stats = pickle.load(picklefile)

In [105]:
!pwd

/Users/elena/Desktop/Metis/Project_4_Ted/Project-4-Ted


In [106]:
with open('ted_video_comments_emotions.pkl', 'rb') as picklefile:
    comment = pickle.load(picklefile)

In [16]:
talks.head(2)

Unnamed: 0,video_id,transcript
0,YgAuFqEs6yk,[{'text': 'I remember watching my father raise...
1,bNmRr-BYnxA,[{'text': 'Transcriber: Joseph Geni Reviewer: ...


In [17]:
len(talks)

2416

In [18]:
stats.head(2)

Unnamed: 0,video_id,tags,title,publushed_date,comments_count,dislike_count,like_count,views_count,today,days_age,views_per_day,dislike_perc_of_likes,success,prime_tag
0,Bb7kz1THJPU,"[Science Fiction, Future, Creativity, Society,...","Go ahead, dream about the future | Charlie Jan...",2020-04-20 13:28:47,653,846,2082,71384,2020-05-16,25,2855.36,40.634006,no,Science Fiction
0,cTIUiN6inIQ,"[The Way We Work, work]",How to make faster decisions | The Way We Work...,2020-02-10 15:00:24,98,84,4298,145314,2020-05-16,95,1529.621053,1.954397,yes,The Way We Work


In [19]:
len(stats)

2722

# Text Preprocessing

### Formatting

In [20]:
def clean_transcript(dirty_line):
    output = ''
    for el in dirty_line: #df['transcript'].values[0]
        for key, value in el.items():
            line_output = str.join("", value)
            output += " " 
            output += line_output
    return output

In [21]:
talks['transcript'] = talks['transcript'].map(clean_transcript)

### SpaCy

In [22]:
nlp = spacy.load('en')

In [23]:
len(STOP_WORDS)

326

In [24]:
my_stop_words_list = ['Transcriber', 'Translator', 'reviewer', 'thanks', 'thank', '\n',
                     'applause','music', 'Laughter', 'applaud', 'TED', 'applause', 'TED Talks',
                     'TED workshop', 'Elizabeth',  'Gilbert', 'Ivana', 'Korom','Krystian', 'Aparta', 'Walters',
                     'Joseph','Geni', 'thing', 'think', 'know', 'like', 'way', 'look', 'lot', 'good', 'bad', 'want', 'kind', 
                     'talk', 'word', 'actually', 'hi', 'woman', 'man', 
                      'people', 'say', 'come', 'feel', 'tell','ve', 'day', 'right','think',
                     '']
for word in my_stop_words_list:
    spacy.lang.en.stop_words.STOP_WORDS.add(word)
    nlp.vocab[word].is_stop = True

In [25]:
#STOP_WORDS

In [26]:
persons = []
for i in range(talks.shape[0]):
    persons.append([ent.text for ent in nlp(talks['transcript'][i]).ents if ent.label_ == 'PERSON'])

In [27]:
#persons

In [28]:
persons_flat_list = [item for sublist in persons for item in sublist]

In [29]:
#persons_flat_list

In [30]:
persons_flat_list = str(persons_flat_list).replace("'", '').replace(",", '').replace(":", '').replace("[", '').split()

In [31]:
errors = ['Parenthood', 'Mamas', 'corona','misstep', 'monk', 'moon', 'Paris', 'attach kid', 'Berlin 2005', 
          'butterfly', 'Xbox', 'Uber Lyft', 'Crisis Text Line', 'God', 'Law', 'Moon', 'Sun', 'microphone hiss',
          'sanitary napkin', 'hygiene piece', 'hygiene piece', 'downright macabre','berlin',
          'applaud african', 'diaspora hear', 'politician mirror', 'bradfeld', 'carrier asymptomatic',
          'New Yorker', 'yank mouth', 'metaphor monk', 'Facebook', 'LEGOs lego', 'lot hubris','healthy leafy green',
          'hygiene factor', 'healthy melatonin', 'lover gentle', 'mosquito','Gmail','iCloud Apple',
          'Kaspersky', 'bomb satsang wood', 'kiosk', 'intro', 'sama buona ICU', 'need belong','thousand',
          'snooze button', 'primarily white', 'gnash throat', 'fangirl learn', 'Xbox PlayStation', 'butterfly']

In [32]:
persons_flat_list = persons_flat_list
for i in persons_flat_list:
    if i in errors:
       # print(i)
        persons_flat_list.remove(i)

In [33]:
#persons_flat_list

In [34]:
for word in persons_flat_list:
    spacy.lang.en.stop_words.STOP_WORDS.add(word)
    nlp.vocab[word].is_stop = True

In [35]:
nlp.vocab['woman'].is_stop

True

In [36]:
##########talks = talks.iloc[:50, :]

In [37]:
talks.head(6)

Unnamed: 0,video_id,transcript
0,YgAuFqEs6yk,I remember watching my father raised the pist...
1,bNmRr-BYnxA,Transcriber: Joseph Geni\nReviewer: Camille M...
2,FVUkKKc3Vvk,"Hi, everyone, my name is Elizabeth, and I wor..."
3,8bj0GR34XWc,Transcriber: Ivana Korom\nReviewer: Krystian ...
4,eaCrsBtiYA4,I am a public policy wonk. I investigate data...
5,OMDVTZ-ycaY,Transcriber: Joseph Geni\nReviewer: Joanna Pi...


In [38]:
talks['transcript1'] = talks.transcript.apply(
    lambda text: ' '.join(token.orth_ for token in nlp(text) if not token.is_punct))

In [39]:
talks['transcript2'] = talks.transcript1.apply(
    lambda text: ' '.join(token.lemma_ for token in nlp(text) if not token.is_stop))

In [40]:
talks['transcript2'] = talks['transcript2'].apply(lambda text: re.sub('\n', '', text))

In [41]:
talks['transcript'] = talks['transcript2']

In [42]:
talks.drop(['transcript1', 'transcript2'], inplace=True, axis=1)

In [43]:
talks['transcript'] = talks['transcript'].apply(lambda text: re.sub(r'\b[0-9]+\b\W*', '', text))

### Plarity scores addition 

In [44]:
analyser = SentimentIntensityAnalyzer()
def extract_polarity(text, polarity_type = 'neg'):
    all_val = analyser.polarity_scores(text)
    return all_val[polarity_type]
    

In [45]:
talks['polarity_neg_transcript'] = talks['transcript'].map(extract_polarity)
talks['polarity_pos_transcript'] = talks['transcript'].apply(extract_polarity, polarity_type = 'pos')
talks['polarity_neu_transcript'] = talks['transcript'].apply(extract_polarity, polarity_type = 'neu')
talks['polarity_compound_transcript'] = talks['transcript'].apply(extract_polarity, polarity_type = 'compound')

In [107]:
df1 = pd.merge(talks, stats,  how='left', left_on='video_id', right_on = 'video_id')

In [112]:
df1.head()

Unnamed: 0,video_id,transcript,polarity_neg_transcript,polarity_pos_transcript,polarity_neu_transcript,polarity_compound_transcript,tags,title,publushed_date,comments_count,...,trust,fear,negative,sadness,anger,surprise,positive,disgust,joy,anticipation
0,YgAuFqEs6yk,remember watch father raise pistol plead gu...,0.133,0.33,0.537,0.9996,"[Family, Parenting, Communication, Children, R...","How to co-parent as allies, not adversaries | ...",2020-05-14 14:40:00,109,...,0.007296,0.003192,0.003648,0.002736,0.001824,0.00456,0.019608,0.001368,0.010032,0.008664
1,bNmRr-BYnxA,think give half humanity ' spend week...,0.194,0.267,0.539,0.998,"[climate change, environment, global issues, c...",How to shift your mindset and choose your futu...,2020-05-13 14:22:51,205,...,0.008208,0.00456,0.00684,0.00228,0.003648,0.00228,0.021888,0.00228,0.00456,0.008664
2,FVUkKKc3Vvk,work trading floor ' m pretty new graduate ...,0.111,0.306,0.583,0.9993,"[Life, Society, Immigration, Humanity, Self, P...",What's missing from the American immigrant nar...,2020-05-12 18:06:31,234,...,0.003648,0.001368,0.001824,0.001824,0.000912,0.001824,0.005928,0.0,0.00228,0.005016
3,8bj0GR34XWc,worry pandemic pretty life play absolut...,0.135,0.166,0.699,0.9492,"[global issues, science, collaboration, virus,...",A global pandemic calls for global solutions |...,2020-05-11 15:52:37,236,...,0.004104,0.002736,0.00456,0.002736,0.002736,0.001824,0.00684,0.00228,0.002736,0.00228
4,eaCrsBtiYA4,public policy wonk investigate datum point ...,0.2,0.164,0.636,-0.9904,[],Racism has a cost for everyone | Heather C. Mc...,2020-05-08 18:44:35,542,...,0.004104,0.000912,0.001824,0.001368,0.000912,0.00228,0.008208,0.000456,0.003192,0.003192


In [109]:
df1 = pd.merge(df1, comment,  how='left', left_on='video_id', right_on = 'video_id')

In [110]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2418 entries, 0 to 2417
Data columns (total 34 columns):
video_id                        2418 non-null object
transcript                      2418 non-null object
polarity_neg_transcript         2418 non-null float64
polarity_pos_transcript         2418 non-null float64
polarity_neu_transcript         2418 non-null float64
polarity_compound_transcript    2418 non-null float64
tags                            2408 non-null object
title                           2408 non-null object
publushed_date                  2408 non-null datetime64[ns]
comments_count                  2394 non-null object
dislike_count                   2408 non-null object
like_count                      2408 non-null object
views_count                     2408 non-null object
today                           2408 non-null datetime64[ns]
days_age                        2408 non-null float64
views_per_day                   2408 non-null float64
dislike_perc_of_likes  

In [111]:
with open('JOINT_ted_video_transcripts_comments_stats.pkl', 'wb') as picklefile:
    pickle.dump(df1, picklefile)