# What's in this notebook?
Here we'll do some preprocessing so that we can do some NLP (tokenizing & LDA), then visualize our results.


In [265]:
import pickle
import pandas as pd
import numpy as np

In [267]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
    with open(path, 'rb') as handle:
        # pickle detects the protocol version on its own, so none is specified.
        return pickle.load(handle)


df1 = _load_pickle('transcripts_df.pickle')
df2 = _load_pickle('additional_data_df.pickle')

In [274]:
# Stack both sources into one frame and renumber the rows 0..n-1 in a single step.
df = pd.concat([df1, pd.DataFrame(df2)], ignore_index=True)

In [314]:
# Drop the poorly scraped transcripts from graduationwisdom.com: rows whose index
# lies strictly between 786 and 1007 and whose transcript is under 3300 characters.
is_too_short = df.transcript.str.len() < 3300
in_graduationwisdom_range = (df.index > 786) & (df.index < 1007)
df.drop(index=df.index[is_too_short & in_graduationwisdom_range], inplace=True)

In [315]:
# Display the combined DataFrame to eyeball the result of the drop.
df

Unnamed: 0,source,transcript
0,https://www.youtube.com/watch?v=bPv21OyQLkM,I'm pleased to welcome to the platform miss Ca...
1,https://www.youtube.com/watch?v=ngzIkKtjT6o,author Tom Wolfe addressed the graduating clas...
2,https://www.youtube.com/watch?v=y5YvCbOmNxQ,ladies and gentlemen dr. Fred Rogers Wow it's ...
3,https://www.youtube.com/watch?v=Q34H3As2QJA,I'd like to tell you too true stories evening ...
4,https://www.youtube.com/watch?v=abo-YcLrnao,thank you and I'm a doctor so pay attention fo...
5,https://www.youtube.com/watch?v=jsHBI4w4FgQ,it is now my great pleasure to recognize today...
6,https://www.youtube.com/watch?v=9ZDuNzhelhQ,Will the graduates to the degree of Bachelor\n...
7,https://www.youtube.com/watch?v=9LheUWrXUHU,is now my honor and privilege to present to yo...
8,https://www.youtube.com/watch?v=TvI69F8uf1A,traditionally Yale does not have a commencemen...
9,https://www.youtube.com/watch?v=-vsLRM4Tjd0,I asked if I got to keep the Hat they tell me ...


## Preprocessing
Removing punctuation, converting to lowercase, removing stop words, lemmatizing, etc.

In [316]:
# while looking through the data, I found one of the transcripts is completely in Russian -- whoops!
# Re-fetch it, explicitly requesting the English transcript from the YouTube transcript API.
from youtube_transcript_api import YouTubeTranscriptApi

# Resolve position 172 to its index label once, so the read and the write below
# address the same row.
russian_row = df.index[172]
# The last 11 characters of the source URL are passed to the API as the video id.
video_id = df.at[russian_row, 'source'][-11:]

# Write through a single indexer on `df`. The previous `df.iloc[172].transcript = ...`
# assigned to the temporary row object returned by `.iloc`, which is not guaranteed
# to propagate back into `df` (chained assignment).
df.at[russian_row, 'transcript'] = ' '.join(
    segment['text']
    for segment in YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
)

In [137]:
# Install spaCy via the explicit %pip magic so it does not depend on automagic being enabled.
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [317]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import spacy


# Every character in this string is replaced by a single space before tokenizing.
_PUNCT_TO_SPACE = str.maketrans(dict.fromkeys(".,?/'\":;)(*&^%$#!`-_[]<>\n\r’~|", ' '))

# A standalone four-digit number starting with 19 or 20. The word boundaries make
# the pattern match a year at the very end of the text and keep it from firing
# inside longer tokens such as "12019".
_YEAR_PATTERN = re.compile(r'\b(?:19|20)\d{2}\b')

# Built on first use and then shared by every preprocess() call. Reloading the
# spaCy model for each transcript dominated the runtime.
_STOP_WORDS = None
_SPACY_MODEL = None


def _get_stop_words():
    """Return NLTK's English stop words plus 'applause', building the set once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # 'applause' occurs a lot in the transcripts & is unhelpful
        _STOP_WORDS = frozenset(stopwords.words('english')) | {'applause'}
    return _STOP_WORDS


def _get_spacy_model():
    """Return the spaCy 'en' pipeline, loading it once without the parser and NER."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        # Lemmas only need the tagger, so the parser and entity recognizer are skipped.
        _SPACY_MODEL = spacy.load('en', disable=['parser', 'ner'])
    return _SPACY_MODEL


def preprocess(text):
    """Turn a raw transcript into a list of lemmas.

    Steps, in order: replace punctuation with spaces, replace standalone years
    (19xx / 20xx) with the word 'year', lowercase, drop stop words (NLTK's
    English list plus 'applause'), and lemmatize with spaCy. spaCy is used
    instead of NLTK's WordNetLemmatizer, which gave poor results here.

    Parameters
    ----------
    text : str
        The raw transcript.

    Returns
    -------
    list of str
        One lemma per remaining token; an empty list for empty input.
    """
    # remove punctuation
    text = text.translate(_PUNCT_TO_SPACE)

    # replace years with 'year'
    text = _YEAR_PATTERN.sub('year', text)

    # lowercase
    text = text.lower()

    # remove stopwords
    stop_words = _get_stop_words()
    filtered = ' '.join(w for w in word_tokenize(text) if w not in stop_words)

    # lemmatize
    return [token.lemma_ for token in _get_spacy_model()(filtered)]

In [318]:
# takes a loooooooooooooong time
def _preprocess_or_flag(transcript):
    """Return preprocess(transcript), or a placeholder string for transcripts of 100000+ characters."""
    if len(transcript) >= 100000:
        return 'NEED TO PREPROCESS'
    return preprocess(transcript)


df['processed'] = df.transcript.apply(_preprocess_or_flag)

In [319]:
# Display the DataFrame again to check the new 'processed' column.
df

Unnamed: 0,source,transcript,processed
0,https://www.youtube.com/watch?v=bPv21OyQLkM,I'm pleased to welcome to the platform miss Ca...,"[pleased, welcome, platform, miss, carlton, fi..."
1,https://www.youtube.com/watch?v=ngzIkKtjT6o,author Tom Wolfe addressed the graduating clas...,"[author, tom, wolfe, address, graduate, class,..."
2,https://www.youtube.com/watch?v=y5YvCbOmNxQ,ladies and gentlemen dr. Fred Rogers Wow it's ...,"[lady, gentlemen, dr, fred, rogers, wow, beaut..."
3,https://www.youtube.com/watch?v=Q34H3As2QJA,I'd like to tell you too true stories evening ...,"[like, tell, true, story, evening, together, m..."
4,https://www.youtube.com/watch?v=abo-YcLrnao,thank you and I'm a doctor so pay attention fo...,"[thank, doctor, pay, attention, cry, loud, tha..."
5,https://www.youtube.com/watch?v=jsHBI4w4FgQ,it is now my great pleasure to recognize today...,"[great, pleasure, recognize, today, commenceme..."
6,https://www.youtube.com/watch?v=9ZDuNzhelhQ,Will the graduates to the degree of Bachelor\n...,"[graduate, degree, bachelor, art, please, rise..."
7,https://www.youtube.com/watch?v=9LheUWrXUHU,is now my honor and privilege to present to yo...,"[honor, privilege, present, william, jefferson..."
8,https://www.youtube.com/watch?v=TvI69F8uf1A,traditionally Yale does not have a commencemen...,"[traditionally, yale, commencement, speaker, s..."
9,https://www.youtube.com/watch?v=-vsLRM4Tjd0,I asked if I got to keep the Hat they tell me ...,"[ask, get, keep, hat, tell, thank, president, ..."


Let's pickle this since it took so long!

In [320]:
with open('processed_transcripts.pickle', 'wb') as out_file:
    # Save the processed DataFrame with the highest pickle protocol available.
    pickle.dump(df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Vectorizing
We'll start with count vectorizing, and then we'll take a look at word2vec.

In [321]:
from sklearn.feature_extraction.text import CountVectorizer

In [322]:
# Join each row's tokens back into one space-separated document for the vectorizer.
documents = df.processed.apply(' '.join)

# Count every 1- to 6-gram across the documents.
vectorizer = CountVectorizer(ngram_range=(1, 6))
X = vectorizer.fit_transform(documents)

<1129x5459835 sparse matrix of type '<class 'numpy.int64'>'
	with 6995694 stored elements in Compressed Sparse Row format>