In [1]:
# Loading Packages
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
from timeit import default_timer as timer
import pandas as pd
import numpy as np
import requests
import re

from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
import csv
import os
from os.path import exists

## Data Ingestion

In [2]:
# Secret keys and authorization
CLIENT_SECRETS_FILE = "client_secret.json"
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
DEVELOPER_KEY = "AIzaSyBAmYHKpB-g14rlihoODKApxs4CiE0iy9w"

In [3]:
def get_authenticated_service():
    #flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    #credentials = flow.run_console()
    print("Authenticating")
    return build(API_SERVICE_NAME, API_VERSION, developerKey = DEVELOPER_KEY)

In [4]:
def get_authenticated_service():
    #flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    #credentials = flow.run_console()
    print("Authenticating")
    return build(API_SERVICE_NAME, API_VERSION, developerKey = DEVELOPER_KEY)

def comments_list(service, part, parent_id):
    results = service.comments().list(
    parentId=parent_id,
    part=part
  ).execute()

    return results

def get_video_comments(service, channel_id, videoId, link, **kwargs):
    comments = []

    videoResult = service.videos().list(part='snippet,statistics', id=videoId).execute()
    
    # Getting Video Data
    for itemVideo in videoResult['items']:
        print(itemVideo)
        videoTitle = itemVideo['snippet']['title']
        videoTime = itemVideo['snippet']['publishedAt']
        #totalComments = itemVideo['statistics']['commentCount']
    
    try:
      results = service.commentThreads().list(videoId = videoId, **kwargs).execute()
    except (HttpError):
      return [], videoTitle, videoTime, 1

    # Check if comments are dissabled
    #if totalComments == 0:
    #  print('no comments for video: ' + videoTitle)
    #  return [], videoTitle, videoTime, 1

    # Flags are used for a different implementation that updates data instead of pulling fresh, can be ignored
    firstCommentFlag = 0 #this is a flag to determine if the comment is the first
    no_existing_data_flag = 0 #If no prior data exists do not run reply retrieval

    while results:
        for item in results['items']:
                
            linkToComment = link + item['id'] #Create Comment link
            
            try:
                #Creating the comment dictionary
                comment = {
                    # Video info
                    'videoTitle': videoTitle,
                    'videoTimePosted': videoTime,
                    'videoID': item['snippet']['topLevelComment']['snippet']['videoId'],
                    
                    # Author info
                    'authorDisplayName': item['snippet']['topLevelComment']['snippet']['authorDisplayName'],
                    'authorProfileImageUrl': item['snippet']['topLevelComment']['snippet']['authorProfileImageUrl'],
                    'authorChannelUrl':  item['snippet']['topLevelComment']['snippet']['authorChannelUrl'],
                    'authorID': item['snippet']['topLevelComment']['snippet']['authorChannelId']['value'],
                    
                    # Comment Info
                    'commentID': item['snippet']['topLevelComment']['id'],
                    'linkToComment': linkToComment,
                    'textDisplay': item['snippet']['topLevelComment']['snippet']['textDisplay'],
                    'parentID': None,
                    'viewerRating': item['snippet']['topLevelComment']['snippet']['viewerRating'],
                    'likeCount': item['snippet']['topLevelComment']['snippet']['likeCount'],
                    'replyCount': item['snippet']['totalReplyCount'],
                    'publishedAt': item['snippet']['topLevelComment']['snippet']['publishedAt'],
                    'isReply': False
                }

                comments.append(comment)

                # Check if comment contains replies
                replyValue = int(item['snippet']['totalReplyCount'])
                if replyValue > 0:
                    #if it contains replies, pull those replies as a comment type
                    replyThread = comments_list(service, part='id,snippet', parent_id=item['id'])
                    for reply in replyThread['items']:
                        linkToCommentReply = link + reply['id']
                        commentReply = {
                                        # Video info
                                        'videoTitle': videoTitle,
                                        'videoTimePosted': videoTime,
                                        'videoID': item['snippet']['topLevelComment']['snippet']['videoId'],
                                        
                                        # Author info
                                        'authorDisplayName': reply['snippet']['authorDisplayName'],
                                        'authorProfileImageUrl': reply['snippet']['authorProfileImageUrl'],
                                        'authorChannelUrl':  reply['snippet']['authorChannelUrl'],
                                        'authorID': reply['snippet']['authorChannelId']['value'],
                                        
                                        # Comment Info
                                        'commentID': reply['id'],
                                        'linkToComment': linkToCommentReply,
                                        'textDisplay': reply['snippet']['textDisplay'],
                                        'parentID': reply['snippet']['parentId'],
                                        'viewerRating': reply['snippet']['viewerRating'],
                                        'likeCount': reply['snippet']['likeCount'],
                                        'replyCount': None,
                                        'publishedAt': reply['snippet']['publishedAt'],
                                        'isReply': True
                                    }
                        comments.append(commentReply)
            
            except KeyError:
                print(linkToComment)

        # Check if another page exists
        if 'nextPageToken' in results:
            kwargs['pageToken'] = results['nextPageToken']
            try:
                results = service.commentThreads().list(videoId = videoId, **kwargs).execute()
            except:
                break
        else:
            break
    print('wrote')
    return comments, videoTitle, videoTime, no_existing_data_flag

def get_playlist(service, numberVids, **kwargs):
    videoPlaylist = service.channels().list(**kwargs).execute()
    videoListCurrent = []
    
    print("getting playlist")
    for playlists in videoPlaylist['items']:
        uploadID = playlists['contentDetails']['relatedPlaylists']['uploads']
        #print(uploadID)

    
    getVideos = service.playlistItems().list(part="snippet,contentDetails", playlistId = uploadID, maxResults = numberVids).execute()
    #print("getting videos")
    for uploads in getVideos['items']:
        videoGrab = uploads['contentDetails']['videoId']
        videoListCurrent.append(videoGrab)

    print(videoListCurrent)
    return videoListCurrent
    
def load_data(channel_id, numberVids):
    # When running locally, disable OAuthlib's HTTPs verification. When
    # running in production *do not* leave this option enabled.
    os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
    service = get_authenticated_service()


    videoTitle = ""
    videoList = get_playlist(service, numberVids, part="snippet, contentDetails", id=channel_id)

    counter = 0
    final_result = pd.DataFrame()
    for videoId in videoList:

        
        # videos = get_playlist(service, part='snippet', channelId=channelId,maxResults=25, textFormat='plainText')
        # videoId = "3rC76KaH4os"
        maxres = 100
        link = "https://www.youtube.com/watch?v=" + videoId + "&lc="
      
        comments, videoTitle, videoTime, no_existing_data_flag = get_video_comments(service, order="time", channel_id = channel_id, link = link, part='snippet', videoId=videoId, maxResults=maxres, textFormat='plainText')
        final_result = final_result.append(pd.DataFrame(comments), ignore_index=True)

    return final_result

def list_video_titles(load_data_list):
    return load_data_list['videoTitle'][0] 

def removeSpecial(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [5]:
patrickjmt_channelId = "UCFe6jenM1Bc54qtBsIJGRZQ"

In [6]:
service = get_authenticated_service()

Authenticating


In [7]:
video_frame = load_data(patrickjmt_channelId, 20)

Authenticating
getting playlist
['8ioTJQkiPO0', 'ZN6IH36ubFs', '7HxrwOWbiiM', '1tEZReWKCq4', 'UZQj79ktGBU', '1TjIUi3s5Ak', '4EmVy-iVjAI', 'umZ6LH4fyZU', 'L1HXk4Y1hqw', 'RDrtq7U4NR4', 'D8h2VaAIJWM', 'aVEmkjDRqe0', 'D0DeY0yoTIA', 'vpYLpgkTtzg', 'VVF45n5URC4', 'XfHHYi_Pwpc', 'dsv2cEgquiA', 'qAVPALkrcvw', 'FelSLO6zYSM', 'ERIfBN-zePQ']
{'kind': 'youtube#video', 'etag': 'b9G5eriByb9KFrCC8i0MrhYkM5I', 'id': '8ioTJQkiPO0', 'snippet': {'publishedAt': '2020-10-25T21:59:08Z', 'channelId': 'UCFe6jenM1Bc54qtBsIJGRZQ', 'title': '2018 AP Physics 1 Free Response Question #1', 'description': '2018 AP Physics 1 Free Response Question #1\nThanks to Jacob Bowman for making this video for my channel!', 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/8ioTJQkiPO0/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/8ioTJQkiPO0/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'url': 'https://i.ytimg.com/vi/8ioTJQkiPO0/hqdefault.jpg', 'width': 480, 'height': 360}

In [8]:
# #Joey: I added this line to save file to my device, feel free to change path to get your own file

# video_frame.to_csv(r'C:\Users\joeys\OneDrive\ENPH454\Comment Classification\video_frame_1.csv', index=False, header=True)

In [9]:
video_frame.head(100)

Unnamed: 0,videoTitle,videoTimePosted,videoID,authorDisplayName,authorProfileImageUrl,authorChannelUrl,authorID,commentID,linkToComment,textDisplay,parentID,viewerRating,likeCount,replyCount,publishedAt,isReply
0,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Lamiya Mekkaoui,https://yt3.ggpht.com/ytc/AAUvwng2-AU3f2QZLQ71...,http://www.youtube.com/channel/UC2yUsztcJ9_Byg...,UC2yUsztcJ9_Byg0rLE9MAkA,UgwdW7n32_pE0GGKtPB4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,Hi Patrick. How do I access the videos on your...,,none,0,0.0,2020-12-27T11:44:16Z,False
1,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,ANONY MOUS,https://yt3.ggpht.com/ytc/AAUvwnjg0qVgZ2VzlQ63...,http://www.youtube.com/channel/UC-CJaXwjYcCsLr...,UC-CJaXwjYcCsLrOZBl2i3XA,Ugz8qYewlWYw3sUhV6V4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,A challenge to u once try indian jeeadvanced p...,,none,0,0.0,2020-12-24T14:19:12Z,False
2,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Nippleton University,https://yt3.ggpht.com/ytc/AAUvwnjfOebInlBBxahT...,http://www.youtube.com/channel/UCHgzVJrX4zWYwX...,UCHgzVJrX4zWYwXgT22HSXog,UgxU3i3dKnLfgFCy5ZJ4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,You made it look easy. I approve,,none,0,0.0,2020-12-15T04:39:42Z,False
3,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Winter Morii,https://yt3.ggpht.com/ytc/AAUvwnib-_Ycm_m3i3m3...,http://www.youtube.com/channel/UCdtFxTXQaOaz_m...,UCdtFxTXQaOaz_muuPzXcW5g,Ugx8rSMDoaQH7gGEo994AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,Ur voice has changeee alot,,none,0,0.0,2020-12-14T10:28:31Z,False
4,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Ashleigh Etienne,https://yt3.ggpht.com/ytc/AAUvwniN7TgHNWGoWXa2...,http://www.youtube.com/channel/UCjr1y_8pKROvqz...,UCjr1y_8pKROvqzJmo81C0zw,UgyHFahJY2-QVkV8NgF4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,Just wanted to tell you that your videos helpe...,,none,0,0.0,2020-12-08T15:28:53Z,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Electrostatic Force from a Charged Rod on a Co...,2020-09-20T11:59:37Z,1tEZReWKCq4,Jorge Saenz,https://yt3.ggpht.com/ytc/AAUvwnjdMR0idjRO_SmG...,http://www.youtube.com/channel/UC4t_1_HkOf3WvM...,UC4t_1_HkOf3WvMIpBRFCjBg,UgycoWsfo-Yt510RJGB4AaABAg,https://www.youtube.com/watch?v=1tEZReWKCq4&lc...,is it just me or has his voice changed from th...,,none,2,3.0,2020-09-24T04:35:58Z,False
96,Electrostatic Force from a Charged Rod on a Co...,2020-09-20T11:59:37Z,1tEZReWKCq4,Sudu Cuber,https://yt3.ggpht.com/ytc/AAUvwngH0M9qHEjsWN0j...,http://www.youtube.com/channel/UC2bXrp8ptot-zF...,UC2bXrp8ptot-zFWHYsp6lqg,UgycoWsfo-Yt510RJGB4AaABAg.9DytnOKIOot9DzSrMSh8L5,https://www.youtube.com/watch?v=1tEZReWKCq4&lc...,well that explains a lot,UgycoWsfo-Yt510RJGB4AaABAg,none,0,,2020-09-24T09:51:04Z,True
97,Electrostatic Force from a Charged Rod on a Co...,2020-09-20T11:59:37Z,1tEZReWKCq4,patrickJMT,https://yt3.ggpht.com/ytc/AAUvwngq2XjoaykTqb9r...,http://www.youtube.com/channel/UCFe6jenM1Bc54q...,UCFe6jenM1Bc54qtBsIJGRZQ,UgycoWsfo-Yt510RJGB4AaABAg.9DytnOKIOot9DzSkmRN8D1,https://www.youtube.com/watch?v=1tEZReWKCq4&lc...,pssst: read the video description,UgycoWsfo-Yt510RJGB4AaABAg,none,2,,2020-09-24T09:50:11Z,True
98,Electrostatic Force from a Charged Rod on a Co...,2020-09-20T11:59:37Z,1tEZReWKCq4,Sudu Cuber,https://yt3.ggpht.com/ytc/AAUvwngH0M9qHEjsWN0j...,http://www.youtube.com/channel/UC2bXrp8ptot-zF...,UC2bXrp8ptot-zFWHYsp6lqg,UgycoWsfo-Yt510RJGB4AaABAg.9DytnOKIOot9DzBkKrtsUm,https://www.youtube.com/watch?v=1tEZReWKCq4&lc...,I literally just came from an 11 year old vide...,UgycoWsfo-Yt510RJGB4AaABAg,none,1,,2020-09-24T07:21:34Z,True


In [8]:
comment_frame = video_frame.loc[:,['videoTitle','textDisplay','likeCount','replyCount']]

In [9]:
for i in comment_frame.index:
    print(comment_frame.loc[i,'textDisplay']+"\n")

Hi Patrick. How do I access the videos on your website? I am a patron already but I’m not sure if I need to pay separately for that? Please let me know as I have 1305039 lots of work to get through! 😊

A challenge to u once try indian jeeadvanced physics and math

You made it look easy. I approve

Ur voice has changeee alot

Just wanted to tell you that your videos helped me so much while I was in college! You are part of the reason I graduated with my Industrial Engineering degree about 7 years ago! Thank you!!

Solve question of IIT JEE question paper

Like si estas haciendo las tareas y te aburres viendo los comentarios :3

Sir its been long you posted. We have been waiting for you to post. You have about 1.5million subscribers and we are eagerly waiting for you to upload a new video. Who else has been waiting. Check my channel friends

mans been doing math videos for 12 years

How can I message you directly?

welcome back

Hello

thanks check out my new tutorial videos

Nice

u did

## Data PreProcessing

In [10]:
! pip install emojis

Collecting emojis
  Downloading https://files.pythonhosted.org/packages/2e/94/61025e53488acd95b49862ec854e05b036f92fe9d0e512ca551a5a8b03d6/emojis-0.6.0-py3-none-any.whl
Installing collected packages: emojis
Successfully installed emojis-0.6.0


In [11]:
import emojis
comment_frame.textDisplay = comment_frame.textDisplay.apply(lambda x: emojis.decode(x).replace(':', ' ').replace('_', ' '))

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# imports
from bs4 import BeautifulSoup
import unicodedata
# from contractions import CONTRACTION_MAP # from contractions.py
import re 
import string
import nltk
import spacy
nlp = spacy.load('en',parse=True,tag=True, entity=True)
from nltk.tokenize import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# custom: removing words from list
stopword_list.remove('not')

CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [14]:
# function to remove accented characters
def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text

In [15]:
# function to expand contractions
def expand_contractions(text, map=CONTRACTION_MAP):
    pattern = re.compile('({})'.format('|'.join(map.keys())), flags=re.IGNORECASE|re.DOTALL)
    def get_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded = map.get(match) if map.get(match) else map.get(match.lower())
        expanded = first_char+expanded[1:]
        return expanded 
    new_text = pattern.sub(get_match, text)
    new_text = re.sub("'", "", new_text)
    return new_text

In [16]:
# function to remove special characters
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', text)

In [17]:
# function to remove numbers
def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)

In [18]:
# function to remove punctuation
def remove_punctuation(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

In [19]:
# function for stemming
def get_stem(text):
    stemmer = nltk.porter.PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

In [20]:
# function for Lemmatization
def get_lem(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [21]:
# function to remove stopwords
def remove_stopwords(text):
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    # check in lowercase 
    t = [token for token in tokens if token.lower() not in stopword_list]
    text = ' '.join(t)    
    return text

In [22]:
# function to remove whitespaces and tabs
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

In [23]:
# function to get lowercase characters
def to_lowercase(text):
    return text.lower()

In [24]:
# Remove HTML Tags
rows = []
for t in comment_frame['textDisplay']:
    soup = BeautifulSoup(t,"lxml")
    rows.append(soup.get_text())
comment_frame['textDisplay'] = rows

  ' Beautiful Soup.' % markup)


In [25]:
comment_frame.textDisplay = comment_frame.textDisplay.apply(lambda x:str(x).replace("’","'"))

In [26]:
for i in comment_frame.index:
    comment_frame.loc[i,'textDisplay'] = expand_contractions(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_accented_chars(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_special_characters(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_numbers(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_punctuation(comment_frame.loc[i,'textDisplay'])
    # comment_frame.loc[i,'textDisplay'] = get_stem(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = get_lem(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_stopwords(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_extra_whitespace_tabs(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = to_lowercase(comment_frame.loc[i,'textDisplay'])
    print(comment_frame.loc[i,'textDisplay']+"\n")

hi patrick access video website patron already not sure need pay separately please let know lot work get blush

challenge u try indian jeeadvanced physics math

make look easy approve

ur voice changeee alot

want tell video help much college part reason graduate industrial engineering degree year ago thank

solve question iit jee question paper

like si estas haciendo las tareas te aburre viendo los comentario

sir long post wait post million subscriber eagerly wait upload new video else wait check channel friend

mans math video year

message directly

welcome back

hello

thank check new tutorial video

nice

u great reach million subscriber math ur great math teacher

sir indiadoe syllabus cover jee main entrence exam india syllabus sir please let know

pretty useful

remember watch video problem trig differentiation back uni lecturer teach teach way better every evening would sit front laptop pull video allow teach wonderful everything make sense explain year since leave uni want 

In [27]:
comment_frame.head(100)

Unnamed: 0,videoTitle,textDisplay,likeCount,replyCount
0,2018 AP Physics 1 Free Response Question #1,hi patrick access video website patron already...,0,0.0
1,2018 AP Physics 1 Free Response Question #1,challenge u try indian jeeadvanced physics math,0,0.0
2,2018 AP Physics 1 Free Response Question #1,make look easy approve,0,0.0
3,2018 AP Physics 1 Free Response Question #1,ur voice changeee alot,0,0.0
4,2018 AP Physics 1 Free Response Question #1,want tell video help much college part reason ...,0,0.0
...,...,...,...,...
95,Electrostatic Force from a Charged Rod on a Co...,voice change video year ago,2,3.0
96,Electrostatic Force from a Charged Rod on a Co...,well explain lot,0,
97,Electrostatic Force from a Charged Rod on a Co...,pssst read video description,2,
98,Electrostatic Force from a Charged Rod on a Co...,literally come year old video well haha,1,


## Attempting Word Embedding Using BERT

In [28]:
!rm -rf bert
!git clone https://github.com/google-research/bert
import sys
sys.path.append('bert/')
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import collections
import json
import re
import os
import pprint
import numpy as np
import tensorflow as tf
import modeling
import tokenization

Cloning into 'bert'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 315.49 KiB | 4.10 MiB/s, done.
Resolving deltas: 100% (185/185), done.


In [None]:
!pip install tensorflow==1.14



In [None]:
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.61.47.106:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 1932497918232056258),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 5414164225379671449),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 2179187364776523648),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 17526382095955371411),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 12221191950648903728),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 16043212512870578995),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 15123530711102908162),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 4042815912776243137),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 809367986865

In [None]:
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
BERT_MODEL = 'uncased_L-12_H-768_A-12'
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

***** BERT pretrained directory: gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12 *****
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_config.json
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.index
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.meta
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/checkpoint
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/vocab.txt


In [None]:
LAYERS = [-1,-2,-3,-4]
NUM_TPU_CORES = 8
MAX_SEQ_LENGTH = 128
BERT_CONFIG = BERT_PRETRAINED_DIR + '/bert_config.json'
CHKPT_DIR = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
VOCAB_FILE = BERT_PRETRAINED_DIR + '/vocab.txt'
INIT_CHECKPOINT = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
BATCH_SIZE = 128

In [None]:
class InputExample(object):

  def __init__(self, unique_id, text_a, text_b=None):
    self.unique_id = unique_id
    self.text_a = text_a
    self.text_b = text_b

In [None]:
class InputFeatures(object):
  """A single set of features of data."""

  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
    self.unique_id = unique_id
    self.tokens = tokens
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.input_type_ids = input_type_ids

In [None]:
def input_fn_builder(features, seq_length):
  """Creates an `input_fn` closure to be passed to TPUEstimator."""

  all_unique_ids = []
  all_input_ids = []
  all_input_mask = []
  all_input_type_ids = []

  for feature in features:
    all_unique_ids.append(feature.unique_id)
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_input_type_ids.append(feature.input_type_ids)

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    num_examples = len(features)

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "unique_ids":
            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_type_ids":
            tf.constant(
                all_input_type_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
    })

    d = d.batch(batch_size=batch_size, drop_remainder=False)
    return d

  return input_fn
  
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    tvars = tf.trainable_variables()
    scaffold_fn = None
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {
        "unique_id": unique_ids,
    }

    for (i, layer_index) in enumerate(layer_indexes):
      predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

In [None]:
def convert_examples_to_features(examples, seq_length, tokenizer):
  """Loads a data file into a list of `InputBatch`s."""

  features = []
  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
      tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
      # Modifies `tokens_a` and `tokens_b` in place so that the total
      # length is less than the specified length.
      # Account for [CLS], [SEP], [SEP] with "- 3"
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
      # Account for [CLS] and [SEP] with "- 2"
      if len(tokens_a) > seq_length - 2:
        tokens_a = tokens_a[0:(seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
      tokens.append(token)
      input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
      for token in tokens_b:
        tokens.append(token)
        input_type_ids.append(1)
      tokens.append("[SEP]")
      input_type_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
      input_ids.append(0)
      input_mask.append(0)
      input_type_ids.append(0)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in tokens]))
      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
      tf.logging.info(
          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

    features.append(
        InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))
  return features

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

In [None]:
def read_sequence(input_sentences):
  examples = []
  unique_id = 0
  for sentence in input_sentences:
    line = tokenization.convert_to_unicode(sentence)
    examples.append(InputExample(unique_id=unique_id, text_a=line))
    unique_id += 1
  return examples

In [None]:
test = read_sequence("This is a test")
print(test)

[<__main__.InputExample object at 0x7f83a2995ac8>, <__main__.InputExample object at 0x7f83a2995828>, <__main__.InputExample object at 0x7f83a29957b8>, <__main__.InputExample object at 0x7f83a2995550>, <__main__.InputExample object at 0x7f83a2995390>, <__main__.InputExample object at 0x7f83a2995860>, <__main__.InputExample object at 0x7f83a29950f0>, <__main__.InputExample object at 0x7f83a2995f98>, <__main__.InputExample object at 0x7f83a2995320>, <__main__.InputExample object at 0x7f83a2995ba8>, <__main__.InputExample object at 0x7f83a29959e8>, <__main__.InputExample object at 0x7f83a2995ef0>, <__main__.InputExample object at 0x7f83a2995748>, <__main__.InputExample object at 0x7f83a2995898>]


In [None]:
def get_features(input_text, dim=768):
  layer_indexes = LAYERS

  bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=VOCAB_FILE, do_lower_case=True)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=is_per_host))

  examples = read_sequence(input_text)

  features = convert_examples_to_features(
      examples=examples, seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)

  unique_id_to_feature = {}
  for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      layer_indexes=layer_indexes,
      use_tpu=True,
      use_one_hot_embeddings=True)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=BATCH_SIZE,
      train_batch_size=BATCH_SIZE)

  input_fn = input_fn_builder(
      features=features, seq_length=MAX_SEQ_LENGTH)

  # Get features
  for result in estimator.predict(input_fn, yield_single_examples=True):
    unique_id = int(result["unique_id"])
    feature = unique_id_to_feature[unique_id]
    output = collections.OrderedDict()
    for (i, token) in enumerate(feature.tokens):
      layers = []
      for (j, layer_index) in enumerate(layer_indexes):
        layer_output = result["layer_output_%d" % j]
        layer_output_flat = np.array([x for x in layer_output[i:(i + 1)].flat])
        layers.append(layer_output_flat)
      output[token] = sum(layers)[:dim]
  
  return output

In [None]:
# embeddings = get_features(["This is a test"], dim=50)
# print(embeddings)


INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] this is a test [SEP]
INFO:tensorflow:input_ids: 101 2023 2003 1037 3231 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:

In [None]:
# for i in comment_frame.index:
#     comment_frame.loc[i,'textDisplay'] = get_features(comment_frame.loc[i,'textDisplay'])

# comment_frame['textDisplay'] = get_features(comment_frame['textDisplay'])
test = get_features(comment_frame.loc[3,'textDisplay'], dim=20)

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] m [SEP]
INFO:tensorflow:input_ids: 101 1049 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:*** Example ***
INFO:te

In [None]:
test

OrderedDict([('[CLS]',
              array([-3.943468  , -0.11929114,  0.21064997, -2.028554  , -0.13769454,
                     -0.35921943, -0.45333827,  2.3047674 ,  3.6556015 , -2.5032725 ,
                     -1.8145667 ,  2.960709  ,  0.49872565,  0.3502461 ,  2.3230443 ,
                      1.833336  , -1.6167967 ,  3.6785684 ,  0.807667  , -2.3961706 ],
                    dtype=float32)),
             ('y',
              array([-1.5616386 , -1.1528176 ,  2.6689548 , -1.4010224 ,  3.2452426 ,
                      1.9289775 ,  1.9088236 , -0.5494617 ,  2.4309502 , -4.4525895 ,
                     -2.5302749 ,  0.4559427 ,  3.459109  ,  0.70544237,  2.369423  ,
                      2.6214504 ,  2.1887753 ,  3.2853317 ,  1.1144626 ,  4.4986067 ],
                    dtype=float32)),
             ('[SEP]',
              array([ 9.7592050e-01, -6.8217190e-04, -3.9866906e-01,  5.7528400e-01,
                     -6.1698270e-01, -4.5007566e-01,  5.3871977e-01, -8.3822995e-01,
 

In [None]:
comment_frame.loc[3,'textDisplay']

'message directly'

In [None]:
get_features("message directly", dim=20)

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] m [SEP]
INFO:tensorflow:input_ids: 101 1049 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:*** Example ***
INFO:te

OrderedDict([('[CLS]',
              array([-3.943468  , -0.11929114,  0.21064997, -2.028554  , -0.13769454,
                     -0.35921943, -0.45333827,  2.3047674 ,  3.6556015 , -2.5032725 ,
                     -1.8145667 ,  2.960709  ,  0.49872565,  0.3502461 ,  2.3230443 ,
                      1.833336  , -1.6167967 ,  3.6785684 ,  0.807667  , -2.3961706 ],
                    dtype=float32)),
             ('y',
              array([-1.5616386 , -1.1528176 ,  2.6689548 , -1.4010224 ,  3.2452426 ,
                      1.9289775 ,  1.9088236 , -0.5494617 ,  2.4309502 , -4.4525895 ,
                     -2.5302749 ,  0.4559427 ,  3.459109  ,  0.70544237,  2.369423  ,
                      2.6214504 ,  2.1887753 ,  3.2853317 ,  1.1144626 ,  4.4986067 ],
                    dtype=float32)),
             ('[SEP]',
              array([ 9.7592050e-01, -6.8217190e-04, -3.9866906e-01,  5.7528400e-01,
                     -6.1698270e-01, -4.5007566e-01,  5.3871977e-01, -8.3822995e-01,
 

### I don't like above result. Attempting to use BERT for classification again:

In [None]:
!pip install transformers

# Requires the latest pip
# !pip install --upgrade pip

# Current stable release for CPU and GPU
!pip install tensorflow==2.3

Collecting tensorflow==2.3
  Using cached tensorflow-2.3.0-cp36-cp36m-manylinux2010_x86_64.whl (320.4 MB)
Collecting tensorboard<3,>=2.3.0
  Using cached tensorboard-2.4.0-py3-none-any.whl (10.6 MB)
Collecting tensorflow-estimator<2.4.0,>=2.3.0
  Using cached tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
Installing collected packages: tensorflow-estimator, tensorboard, tensorflow
  Attempting uninstall: tensorflow-estimator
    Found existing installation: tensorflow-estimator 1.14.0
    Uninstalling tensorflow-estimator-1.14.0:
      Successfully uninstalled tensorflow-estimator-1.14.0
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 1.14.0
    Uninstalling tensorboard-1.14.0:
      Successfully uninstalled tensorboard-1.14.0
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 1.14.0
    Uninstalling tensorflow-1.14.0:
      Successfully uninstalled tensorflow-1.14.0
Successfully installed tensorboard-2.4.0 tensorfl

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


### Hmmm... Another big NOPE, let's try take 3:
https://towardsdatascience.com/pre-trained-word-embedding-for-text-classification-end2end-approach-5fbf5cd8aead

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [None]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Limit on the number of features to K features.
TOP_K = 20000

# Limit on the length of text sequences. 
# Sequences longer than this will be truncated.
# and less than it will be padded
MAX_SEQUENCE_LENGTH = 50

class CustomTokenizer:
    def __init__(self, train_texts):
        self.train_texts = train_texts
        self.tokenizer = Tokenizer(num_words=TOP_K)
        
    def train_tokenize(self):
        # Get max sequence length.
        max_length = len(max(self.train_texts , key=len))
        self.max_length = min(max_length, MAX_SEQUENCE_LENGTH)
    
        # Create vocabulary with training texts.
        self.tokenizer.fit_on_texts(self.train_texts)
        
    def vectorize_input(self, comments):
        # Vectorize training and validation texts.
        
        comments = self.tokenizer.texts_to_sequences(comments)
        # Fix sequence length to max value. Sequences shorter than the length are
        # padded in the beginning and sequences longer are truncated
        # at the beginning.
        comments = sequence.pad_sequences(comments, maxlen=self.max_length, truncating='post',padding='post')
        return comments
    
tokenizer = CustomTokenizer(train_texts = comment_frame['textDisplay'])
# fit o the train
tokenizer.train_tokenize()
tokenized_train = tokenizer.vectorize_input(comment_frame['textDisplay'])
tokenized_val = tokenizer.vectorize_input(comment_frame['textDisplay'])
tokenized_test = tokenizer.vectorize_input(comment_frame['textDisplay'])

In [None]:
import os
import tqdm
import requests
import zipfile
URL = "http://nlp.stanford.edu/data/glove.42B.300d.zip"

def fetch_data(url=URL, target_file='glove.zip', delete_zip=False):
    #if the dataset already exists exit
    if os.path.isfile(target_file):
        print("datasets already downloded :) ")
        return

    #download (large) zip file
    #for large https request on stream mode to avoid out of memory issues
    #see : http://masnun.com/2016/09/18/python-using-the-requests-module-to-download-large-files-efficiently.html
    print("**************************")
    print("  Downloading zip file")
    print("  >_<  Please wait >_< ")
    print("**************************")
    response = requests.get(url, stream=True)
    #read chunk by chunk
    handle = open(target_file, "wb")
    for chunk in tqdm.tqdm(response.iter_content(chunk_size=512)):
        if chunk:  
            handle.write(chunk)
    handle.close()  
    print("  Download completed ;) :") 
    #extract zip_file
    zf = zipfile.ZipFile(target_file)
    print("1. Extracting {} file".format(target_file))
    zf.extractall()
    if delete_zip:
        print("2. Deleting {} file".format(dataset_name+".zip"))
        os.remove(path=zip_file)

fetch_data()

**************************
  Downloading zip file
  >_<  Please wait >_< 
**************************


3667580it [14:33, 4201.00it/s]


  Download completed ;) :
1. Extracting glove.zip file


In [None]:
glove_file = "glove.42B.300d.txt"
import tqdm

EMBEDDING_VECTOR_LENGTH = 50 # <=200
def construct_embedding_matrix(glove_file, word_index):
    embedding_dict = {}
    with open(glove_file,'r') as f:
        for line in f:
            values=line.split()
            # get the word
            word=values[0]
            if word in word_index.keys():
                # get the vector
                vector = np.asarray(values[1:], 'float32')
                embedding_dict[word] = vector
    ###  oov words (out of vacabulary words) will be mapped to 0 vectors

    num_words=len(word_index)+1
    #initialize it to 0
    embedding_matrix=np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))

    for word,i in tqdm.tqdm(word_index.items()):
        if i < num_words:
            vect=embedding_dict.get(word, [])
            if len(vect)>0:
                embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
    return embedding_matrix
  
embedding_matrix =  construct_embedding_matrix(glove_file, tokenizer.tokenizer.word_index)

100%|██████████| 1217/1217 [00:00<00:00, 211347.63it/s]


In [None]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.46529001,  0.1063    , -0.29214999, ...,  0.026225  ,
        -0.23901001, -0.052543  ],
       [-0.022975  ,  0.087888  , -0.24247999, ..., -0.23213001,
        -0.072726  , -0.18074   ],
       ...,
       [-0.35025999,  0.13618   , -0.24282999, ..., -0.29969001,
        -0.68023002,  0.042243  ],
       [-0.14188001,  0.10998   ,  0.15479   , ..., -0.31703001,
         0.090381  , -0.29243001],
       [-0.22966   , -0.0041264 , -0.21813001, ...,  0.50888002,
        -0.54857999, -0.60488999]])

In [None]:
model=Sequential()
embedding=Embedding(len(tokenizer.tokenizer.word_index)+1, # number of unique tokens
                    EMBEDDING_VECTOR_LENGTH, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=MAX_SEQUENCE_LENGTH, 
                    trainable=False)
model.add(embedding)

In [None]:
# compile the model
optimzer = Adam(clipvalue=0.5) # clip value to avoid the gradient exploding

model.compile(optimizer=optimzer)

In [None]:
# tokenized_train = np.asarray(tokenized_train).astype('float32')
test = model.fit(tokenized_train, labels, batch_size=32, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
embedding

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fa77cf5ac88>

AttributeError: ignored

In [None]:
labels = np.ones(482)

### FML... Trail #4

In [None]:
!pip3 install tensorflow-gpu==1.15



In [None]:
!pip3 install -U bert-serving-server bert-serving-client

Collecting bert-serving-client
  Downloading bert_serving_client-1.10.0-py2.py3-none-any.whl (28 kB)
Collecting bert-serving-server
  Downloading bert_serving_server-1.10.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 156 kB/s 
Collecting GPUtil>=1.3.0
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7409 sha256=36e065276874a448ad21c17a2f642e1f0e9543187e52d646bbd8e277b27974c5
  Stored in directory: /root/.cache/pip/wheels/79/c1/b2/b6fc2647f693a084da25e1d31328ab3dbb565cc58fea37e973
Successfully built GPUtil
Installing collected packages: GPUtil, bert-serving-server, bert-serving-client
Successfully installed GPUtil-1.4.0 bert-serving-client-1.10.0 bert-serving-server-1.10.0


In [None]:
bert-serving-start -model_dir /multi_cased_L-12_H-768_A-12/ -num_worker=1

SyntaxError: ignored

In [None]:
from bert_serving.client import BertClient

In [None]:
client = BertClient

In [None]:
vectors = client.encode(["dog"],["cat"],["man"])

AttributeError: ignored

### Let's try attempt #5
https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca   
https://medium.com/analytics-vidhya/bert-word-embeddings-deep-dive-32f6214f02bf

In [29]:
!pip install pytorch_transformers

Collecting pytorch_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |█▉                              | 10kB 15.3MB/s eta 0:00:01[K     |███▊                            | 20kB 21.5MB/s eta 0:00:01[K     |█████▋                          | 30kB 13.7MB/s eta 0:00:01[K     |███████▍                        | 40kB 10.8MB/s eta 0:00:01[K     |█████████▎                      | 51kB 5.4MB/s eta 0:00:01[K     |███████████▏                    | 61kB 6.1MB/s eta 0:00:01[K     |█████████████                   | 71kB 6.4MB/s eta 0:00:01[K     |██████████████▉                 | 81kB 6.6MB/s eta 0:00:01[K     |████████████████▊               | 92kB 7.0MB/s eta 0:00:01[K     |██████████████████▋             | 102kB 7.3MB/s eta 0:00:01[K     |████████████████████▍           | 112kB 7.3MB/s eta 0:00:01[K     |██████████████████████▎  

In [30]:
import torch
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel
## Load pretrained model/tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)

100%|██████████| 231508/231508 [00:00<00:00, 2537049.05B/s]
100%|██████████| 433/433 [00:00<00:00, 206731.20B/s]
100%|██████████| 440473133/440473133 [00:06<00:00, 68065396.30B/s]


In [31]:
reference = comment_frame

In [32]:
rows = []
for i in comment_frame.index:
    marked_text = "[CLS] " + comment_frame.loc[i,'textDisplay'] + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    rows.append(indexed_tokens)
    

In [33]:
pd.DataFrame(rows)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84
0,101,7632,4754.0,3229.0,2678.0,4037.0,9161.0,2525.0,2025.0,2469.0,2342.0,3477.0,10329.0,3531.0,2292.0,2113.0,2843.0,2147.0,2131.0,16688.0,102.0,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,101,4119,1057.0,3046.0,2796.0,15333.0,13775.0,21789.0,2094.0,5584.0,8785.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,101,2191,2298.0,3733.0,14300.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,101,24471,2376.0,2689.0,4402.0,2632.0,4140.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,101,2215,2425.0,2678.0,2393.0,2172.0,2267.0,2112.0,3114.0,4619.0,3919.0,3330.0,3014.0,2095.0,3283.0,4067.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,101,4067,7433.0,5470.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
492,101,4007,3499.0,4339.0,10785.0,4339.0,7123.0,3259.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
493,101,2228,6213.0,5254.0,2191.0,7615.0,2034.0,2117.0,2678.0,5584.0,2377.0,9863.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
494,101,7592,4754.0,4067.0,2147.0,6429.0,2156.0,2393.0,2092.0,4824.0,4553.0,2465.0,26478.0,8609.0,4067.0,2843.0,102.0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [34]:
import torch
# # Convert inputs to PyTorch tensors
# tokens_tensor = torch.tensor([indexed_tokens])
# # Put the model in "evaluation" mode,meaning feed-forward operation.
# model.eval()

In [35]:
rows = []
for i in comment_frame.index:
    marked_text = "[CLS] " + comment_frame.loc[i,'textDisplay'] + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    model.eval()
    with torch.no_grad():
        outputs = model(tokens_tensor)
        last_hidden_state = outputs[0]
        word_embed_1 = last_hidden_state
        
    rows.append(word_embed_1)

KeyboardInterrupt: ignored

In [36]:
rows = []
sentences = []
for i in comment_frame.index:
    marked_text = "[CLS] " + comment_frame.loc[i,'textDisplay'] + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    model.eval()
    with torch.no_grad():
        outputs = model(tokens_tensor)
        last_hidden_state = outputs[0]
        hidden_states = outputs[2]
        word_embed_1 = last_hidden_state
        token_vecs = hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
    rows.append(word_embed_1)
    sentences.append(sentence_embedding)

In [None]:
# #Run the text through BERT, get the output and collect all of the hidden states produced from all 12 layers.
# with torch.no_grad():
#     outputs = model(tokens_tensor)
# # can use last hidden state as word embeddings
#     last_hidden_state = outputs[0]
#     word_embed_1 = last_hidden_state
# # Evaluating the model will return a different number of objects based on how it's  configured in the `from_pretrained` call earlier. In this case, becase we set `output_hidden_states = True`, the third item will be the hidden states from all layers. See the documentation for more details:https://huggingface.co/transformers/model_doc/bert.html#bertmodel
# hidden_states = outputs[2]
# # initial embeddings can be taken from 0th layer of hidden states
# word_embed_2 = hidden_states[0]
# # sum of all hidden states
# word_embed_3 = torch.stack(hidden_states).sum(0)
# # sum of second to last layer
# word_embed_4 = torch.stack(hidden_states[2:]).sum(0)
# # sum of last four layer
# word_embed_5 = torch.stack(hidden_states[-4:]).sum(0)
# # concatenate last four layers
# word_embed_6 = torch.cat([hidden_states[i] for i in [-1,-2,-3,-4]], dim=-1)

In [37]:
size = []
for i in range(len(rows)):
  x = sentences[i].shape
  size.append(x)

In [63]:
size

[torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),


In [38]:
import torch
import pandas as pd
# px = tens.numpy()
# px = pd.DataFrame(px)

X = torch.stack(sentences).numpy()

In [72]:
X

array([[ 0.18526408, -0.3198621 ,  1.1442171 , ..., -1.0674597 ,
        -0.01943914, -0.38356677],
       [-0.24321043,  0.05064999,  0.2188183 , ..., -0.2407515 ,
         0.02934108, -0.04801563],
       [ 0.09535228, -0.38166904, -0.25952825, ..., -0.43941876,
        -0.27586088, -0.01403544],
       ...,
       [ 0.18814184,  0.12490464,  0.53400296, ..., -0.35783452,
        -0.2171457 , -0.4894737 ],
       [ 0.1241234 ,  0.46040916,  0.86902606, ..., -0.42872643,
        -0.23759724, -0.20243706],
       [ 0.51271105, -0.08290875,  0.26565555, ..., -0.589869  ,
         0.06097335, -0.07700414]], dtype=float32)

In [39]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,0.185264,-0.319862,1.144217,-0.242548,0.407622,-0.362151,0.399397,0.484918,-0.301818,-0.473685,0.112421,0.259411,0.517961,0.239896,-0.738113,0.414046,-0.331210,0.536272,0.058449,0.254510,0.579642,0.091564,0.392794,-0.204572,0.388548,-0.360492,-0.203703,-0.237434,-0.470980,-0.144755,0.405532,0.040461,0.298030,-0.219569,-0.408821,-0.357672,0.129359,0.481220,0.018194,0.336492,...,-0.152708,-0.270259,0.096580,0.190272,0.015740,-0.955690,-0.682294,0.038618,0.047051,0.044342,-0.176693,0.379682,0.514489,-0.494530,-0.157163,-0.258607,-0.223811,0.127018,0.601644,0.453995,-0.021088,-0.081721,0.307488,0.443762,0.137420,0.367998,0.547352,0.265394,-0.610446,-0.799802,-0.423547,-0.239989,0.053932,0.183351,-0.176222,0.360571,-0.645178,-1.067460,-0.019439,-0.383567
1,-0.243210,0.050650,0.218818,-0.045982,0.222750,-0.150218,0.430692,0.302426,-0.261301,-0.123137,0.001276,-0.258989,0.341154,0.239959,-0.169884,0.437237,-0.008992,0.087678,-0.130470,0.253622,0.324785,0.327099,0.066579,0.059926,0.254827,0.153727,-0.016911,0.243696,-0.269121,0.145740,-0.035366,-0.004852,0.120347,-0.125721,-0.705768,-0.267575,-0.317607,0.529252,-0.158837,0.227377,...,-0.499201,-0.199872,0.101315,0.268786,-0.133331,-0.214159,-0.601334,0.009458,-0.213255,-0.061895,-0.266194,0.237485,0.188711,0.062355,-0.201612,-0.241235,-0.234515,0.064382,-0.249130,-0.318537,0.132263,0.014832,-0.034574,0.439283,-0.001786,-0.192893,-0.086980,0.638781,-0.500376,-0.367342,0.078546,-0.025593,0.041580,-0.015584,-0.211712,0.677696,-0.208551,-0.240752,0.029341,-0.048016
2,0.095352,-0.381669,-0.259528,0.039475,0.449752,0.083895,0.190426,0.329415,0.062201,-0.619301,0.079631,-0.146194,-0.102030,-0.354493,-0.946233,0.101479,-0.270910,0.317752,0.257359,0.107744,-0.054110,-0.016256,-0.334970,-0.144010,0.203724,0.166920,-0.262792,-0.193288,-0.222385,0.029708,0.409368,0.025390,-0.081318,-0.149702,-0.498189,-0.008062,0.154458,0.704165,-0.438897,0.214360,...,0.477366,-0.055983,0.146587,0.391938,0.021745,-0.896020,-0.203441,0.405950,-0.161734,-0.223770,-0.079324,-0.116008,0.088450,0.356601,0.037049,0.301011,-0.013374,-0.101825,0.017275,-0.083420,0.126655,0.272151,-0.087973,-0.237594,-0.166162,-0.311046,-0.290840,-0.298107,-0.134351,0.494504,0.008140,-0.091129,0.146747,0.103413,-0.140196,0.273203,-0.264838,-0.439419,-0.275861,-0.014035
3,-0.294832,0.482308,0.330702,-0.350925,0.116761,0.042012,0.100142,0.182055,0.080515,-0.835674,-0.147596,-0.191286,0.344821,0.476255,-0.100143,0.712548,-0.096163,0.618671,-0.307833,-0.559198,0.779727,-0.261583,0.039820,0.020563,0.261506,-0.020587,-0.155915,-0.103927,-0.300364,0.086671,-0.031429,0.269832,-0.423210,-0.244134,-0.554314,-0.270933,0.075438,0.189443,-0.458042,0.248168,...,-0.615516,0.475029,-0.555310,0.007105,-0.043171,-0.032358,0.307549,0.074536,0.398048,0.330261,0.011272,0.061922,0.190698,-0.914785,0.006078,0.123048,-0.457553,0.051112,0.679342,-0.001870,0.298688,0.219033,-0.678852,0.317655,0.142708,0.041898,0.080920,0.146710,-0.654641,-0.698492,0.228040,-0.507689,-0.081494,0.076387,-0.260864,0.093972,-0.042057,-0.582289,-0.123058,0.017650
4,-0.151916,0.169007,0.874950,-0.488454,0.506542,0.039273,0.367727,0.343341,-0.201612,-0.102753,0.293927,0.144865,0.649835,0.755830,-0.737865,0.156678,0.241690,0.630437,0.102130,-0.093919,0.485726,0.281132,0.409138,-0.242635,-0.027789,-0.203900,-0.136482,0.169903,-0.273086,-0.146964,-0.109853,-0.202222,0.051726,0.270482,-0.516160,-0.727923,-0.237054,0.753132,-0.428898,0.233996,...,-0.594306,-0.523594,0.017074,0.172297,-0.125678,-0.643492,-0.792215,0.436914,0.248124,-0.266826,-0.603818,0.139978,0.598660,0.024786,-0.114409,0.016357,-0.379971,0.144856,0.410016,-0.012264,-0.167426,0.185195,-0.210573,0.162227,0.098782,0.315670,0.586803,0.159003,-0.477683,-1.295109,-0.089619,-0.609625,0.161184,0.284161,-0.497508,0.192652,-0.504538,-1.153599,-0.049596,-0.267383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,0.061426,-0.103228,0.148945,-0.147971,-0.090533,-0.425955,0.047204,0.449874,-0.090886,-0.458148,-0.117244,-0.041449,0.125855,0.042094,-0.405128,0.467221,-0.277282,0.336938,0.163797,-0.139394,-0.088504,0.033334,-0.174043,0.107203,0.081907,-0.159314,-0.187445,0.041123,-0.329938,0.590306,0.423608,-0.094890,0.266232,0.175144,-0.688509,-0.280642,-0.315544,0.496848,-0.829386,0.202610,...,-0.131894,-0.261135,0.287192,0.649754,-0.176587,-0.375239,-0.427643,0.045354,0.089440,0.177733,-0.436939,0.011249,-0.185923,0.466657,0.111064,-0.176553,0.236508,-0.410117,-0.189510,-0.169568,-0.115991,0.485555,0.073530,-0.542907,-0.290680,-0.052049,-0.033251,-0.112457,-0.008136,0.237469,-0.071214,-0.022300,0.196612,0.073983,0.249597,0.572150,-0.522115,0.060003,0.078456,0.040098
492,0.309810,0.350016,0.235684,0.337579,0.495220,0.068775,-0.552308,0.243201,-0.226491,-0.408352,-0.051673,-0.144769,-0.155776,0.134611,-0.900221,-0.060202,-0.497615,0.118738,0.414987,0.210628,-0.040688,-0.224809,0.099810,-0.468527,-0.043862,-0.167583,0.120470,-0.018117,-0.136020,-0.366724,-0.032687,0.564717,0.460788,-0.476947,0.055025,-0.152842,-0.077399,0.456326,-0.391341,0.252328,...,0.004474,-0.277239,0.297262,0.124306,0.105912,-0.388479,-0.006579,-0.004320,-0.057058,-0.391239,-0.383661,0.243326,0.408830,0.246004,-0.095000,0.185376,-0.174013,0.153225,0.025725,-0.150010,-0.054636,0.087044,0.430611,-0.045643,0.128763,0.010793,-0.183577,-0.123181,-0.188222,-0.102517,0.631200,-0.346168,-0.172346,0.121182,-0.096986,0.594823,-0.418143,-0.445300,-0.383739,0.264121
493,0.188142,0.124905,0.534003,-0.083996,0.393117,0.077568,0.082447,0.256764,-0.223938,-0.368796,0.069685,0.087836,0.094270,0.173311,-0.408898,0.456223,-0.557721,0.288910,-0.110416,0.037312,0.539514,0.321863,0.246620,-0.076111,0.477572,-0.044801,0.044852,0.389175,-0.219257,-0.096034,-0.041739,0.128937,0.439259,-0.173041,-0.168961,-0.412669,0.024304,0.354609,-0.107813,0.592440,...,-0.025156,-0.581539,0.073165,0.276943,0.034897,-0.758487,-0.860399,-0.231108,0.355776,-0.193070,-0.596683,0.483499,0.126601,-0.000280,-0.053610,-0.524654,-0.413035,0.600546,0.435527,-0.104247,-0.333147,0.344400,-0.081698,0.255003,-0.168766,0.218785,0.491864,0.405607,-0.553945,-0.845061,-0.075762,-0.468782,-0.149628,0.037765,-0.221649,0.350679,-0.387426,-0.357835,-0.217146,-0.489474
494,0.124123,0.460409,0.869026,-0.049716,0.355353,-0.179431,0.626300,0.572885,-0.218835,-0.886804,0.102477,0.047372,0.527814,0.102164,-0.408478,0.539461,-0.421349,0.385155,0.016180,0.116701,0.213175,0.501825,0.104801,-0.045269,0.318156,-0.147463,0.249037,-0.273394,-0.182957,-0.441964,0.401421,-0.197563,0.264824,-0.186842,-0.767768,-0.328705,-0.209309,0.323831,-0.225667,0.297340,...,-0.169718,-0.221252,0.200869,0.047039,0.272802,-0.546693,-0.605720,-0.154507,-0.079682,0.073714,0.065967,0.297478,0.145054,0.308341,0.252798,-0.206365,0.088690,0.033598,0.411121,0.209681,0.030919,0.298443,0.205924,0.507199,0.171677,0.305766,0.052789,0.167703,-0.362101,-0.346969,-0.175980,-0.428114,-0.170248,0.137750,0.161312,0.635985,-0.837474,-0.428726,-0.237597,-0.202437


In [40]:
pd.DataFrame(X).to_csv(r'C:\Users\Ted\Documents\Joey\word_embed_example.csv', index=False, header=False)

In [44]:
pd.DataFrame(video_frame).to_csv(r'video_frame.csv', index=False, header=False)