In [81]:
# Loading Packages
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
from timeit import default_timer as timer
import pandas as pd
import numpy as np
import requests
import re

from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
import csv
import os
from os.path import exists

## Data Ingestion

In [82]:
# Secret keys and authorization
CLIENT_SECRETS_FILE = "client_secret.json"
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
DEVELOPER_KEY = "AIzaSyBAmYHKpB-g14rlihoODKApxs4CiE0iy9w"

In [83]:
def get_authenticated_service():
    #flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    #credentials = flow.run_console()
    print("Authenticating")
    return build(API_SERVICE_NAME, API_VERSION, developerKey = DEVELOPER_KEY)

In [84]:
def get_authenticated_service():
    #flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    #credentials = flow.run_console()
    print("Authenticating")
    return build(API_SERVICE_NAME, API_VERSION, developerKey = DEVELOPER_KEY)

def comments_list(service, part, parent_id):
    results = service.comments().list(
    parentId=parent_id,
    part=part
  ).execute()

    return results

def get_video_comments(service, channel_id, videoId, link, **kwargs):
    comments = []

    videoResult = service.videos().list(part='snippet,statistics', id=videoId).execute()
    
    # Getting Video Data
    for itemVideo in videoResult['items']:
        print(itemVideo)
        videoTitle = itemVideo['snippet']['title']
        videoTime = itemVideo['snippet']['publishedAt']
        #totalComments = itemVideo['statistics']['commentCount']
    
    try:
      results = service.commentThreads().list(videoId = videoId, **kwargs).execute()
    except (HttpError):
      return [], videoTitle, videoTime, 1

    # Check if comments are dissabled
    #if totalComments == 0:
    #  print('no comments for video: ' + videoTitle)
    #  return [], videoTitle, videoTime, 1

    # Flags are used for a different implementation that updates data instead of pulling fresh, can be ignored
    firstCommentFlag = 0 #this is a flag to determine if the comment is the first
    no_existing_data_flag = 0 #If no prior data exists do not run reply retrieval

    while results:
        for item in results['items']:
                
            linkToComment = link + item['id'] #Create Comment link
            
            try:
                #Creating the comment dictionary
                comment = {
                    # Video info
                    'videoTitle': videoTitle,
                    'videoTimePosted': videoTime,
                    'videoID': item['snippet']['topLevelComment']['snippet']['videoId'],
                    
                    # Author info
                    'authorDisplayName': item['snippet']['topLevelComment']['snippet']['authorDisplayName'],
                    'authorProfileImageUrl': item['snippet']['topLevelComment']['snippet']['authorProfileImageUrl'],
                    'authorChannelUrl':  item['snippet']['topLevelComment']['snippet']['authorChannelUrl'],
                    'authorID': item['snippet']['topLevelComment']['snippet']['authorChannelId']['value'],
                    
                    # Comment Info
                    'commentID': item['snippet']['topLevelComment']['id'],
                    'linkToComment': linkToComment,
                    'textDisplay': item['snippet']['topLevelComment']['snippet']['textDisplay'],
                    'parentID': None,
                    'viewerRating': item['snippet']['topLevelComment']['snippet']['viewerRating'],
                    'likeCount': item['snippet']['topLevelComment']['snippet']['likeCount'],
                    'replyCount': item['snippet']['totalReplyCount'],
                    'publishedAt': item['snippet']['topLevelComment']['snippet']['publishedAt'],
                    'isReply': False
                }

                comments.append(comment)

                # Check if comment contains replies
                replyValue = int(item['snippet']['totalReplyCount'])
                if replyValue > 0:
                    #if it contains replies, pull those replies as a comment type
                    replyThread = comments_list(service, part='id,snippet', parent_id=item['id'])
                    for reply in replyThread['items']:
                        linkToCommentReply = link + reply['id']
                        commentReply = {
                                        # Video info
                                        'videoTitle': videoTitle,
                                        'videoTimePosted': videoTime,
                                        'videoID': item['snippet']['topLevelComment']['snippet']['videoId'],
                                        
                                        # Author info
                                        'authorDisplayName': reply['snippet']['authorDisplayName'],
                                        'authorProfileImageUrl': reply['snippet']['authorProfileImageUrl'],
                                        'authorChannelUrl':  reply['snippet']['authorChannelUrl'],
                                        'authorID': reply['snippet']['authorChannelId']['value'],
                                        
                                        # Comment Info
                                        'commentID': reply['id'],
                                        'linkToComment': linkToCommentReply,
                                        'textDisplay': reply['snippet']['textDisplay'],
                                        'parentID': reply['snippet']['parentId'],
                                        'viewerRating': reply['snippet']['viewerRating'],
                                        'likeCount': reply['snippet']['likeCount'],
                                        'replyCount': None,
                                        'publishedAt': reply['snippet']['publishedAt'],
                                        'isReply': True
                                    }
                        comments.append(commentReply)
            
            except KeyError:
                print(linkToComment)

        # Check if another page exists
        if 'nextPageToken' in results:
            kwargs['pageToken'] = results['nextPageToken']
            try:
                results = service.commentThreads().list(videoId = videoId, **kwargs).execute()
            except:
                break
        else:
            break
    print('wrote')
    return comments, videoTitle, videoTime, no_existing_data_flag

def get_playlist(service, numberVids, **kwargs):
    videoPlaylist = service.channels().list(**kwargs).execute()
    videoListCurrent = []
    
    print("getting playlist")
    for playlists in videoPlaylist['items']:
        uploadID = playlists['contentDetails']['relatedPlaylists']['uploads']
        #print(uploadID)

    
    getVideos = service.playlistItems().list(part="snippet,contentDetails", playlistId = uploadID, maxResults = numberVids).execute()
    #print("getting videos")
    for uploads in getVideos['items']:
        videoGrab = uploads['contentDetails']['videoId']
        videoListCurrent.append(videoGrab)

    print(videoListCurrent)
    return videoListCurrent
    
def load_data(channel_id, numberVids):
    # When running locally, disable OAuthlib's HTTPs verification. When
    # running in production *do not* leave this option enabled.
    os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
    service = get_authenticated_service()


    videoTitle = ""
    videoList = get_playlist(service, numberVids, part="snippet, contentDetails", id=channel_id)

    counter = 0
    final_result = pd.DataFrame()
    for videoId in videoList:

        
        # videos = get_playlist(service, part='snippet', channelId=channelId,maxResults=25, textFormat='plainText')
        # videoId = "3rC76KaH4os"
        maxres = 100
        link = "https://www.youtube.com/watch?v=" + videoId + "&lc="
      
        comments, videoTitle, videoTime, no_existing_data_flag = get_video_comments(service, order="time", channel_id = channel_id, link = link, part='snippet', videoId=videoId, maxResults=maxres, textFormat='plainText')
        final_result = final_result.append(pd.DataFrame(comments), ignore_index=True)

    return final_result

def list_video_titles(load_data_list):
    return load_data_list['videoTitle'][0] 

def removeSpecial(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [85]:
patrickjmt_channelId = "UCFe6jenM1Bc54qtBsIJGRZQ"

In [86]:
service = get_authenticated_service()

Authenticating


In [87]:
video_frame = load_data(patrickjmt_channelId, 20)

Authenticating
getting playlist
['8ioTJQkiPO0', 'ZN6IH36ubFs', '7HxrwOWbiiM', '1tEZReWKCq4', 'UZQj79ktGBU', '1TjIUi3s5Ak', '4EmVy-iVjAI', 'umZ6LH4fyZU', 'L1HXk4Y1hqw', 'RDrtq7U4NR4', 'D8h2VaAIJWM', 'aVEmkjDRqe0', 'D0DeY0yoTIA', 'vpYLpgkTtzg', 'VVF45n5URC4', 'XfHHYi_Pwpc', 'dsv2cEgquiA', 'qAVPALkrcvw', 'FelSLO6zYSM', 'ERIfBN-zePQ']
{'kind': 'youtube#video', 'etag': 'E9U8oa4H04Uw4H9h31TilGEZF9w', 'id': '8ioTJQkiPO0', 'snippet': {'publishedAt': '2020-10-25T21:59:08Z', 'channelId': 'UCFe6jenM1Bc54qtBsIJGRZQ', 'title': '2018 AP Physics 1 Free Response Question #1', 'description': '2018 AP Physics 1 Free Response Question #1\nThanks to Jacob Bowman for making this video for my channel!', 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/8ioTJQkiPO0/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/8ioTJQkiPO0/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'url': 'https://i.ytimg.com/vi/8ioTJQkiPO0/hqdefault.jpg', 'width': 480, 'height': 360}

In [88]:
# #Joey: I added this line to save file to my device, feel free to change path to get your own file

video_frame.to_excel('Video_frame.xlsx', index=False, header=True)

In [89]:
video_frame.head(100)

Unnamed: 0,videoTitle,videoTimePosted,videoID,authorDisplayName,authorProfileImageUrl,authorChannelUrl,authorID,commentID,linkToComment,textDisplay,parentID,viewerRating,likeCount,replyCount,publishedAt,isReply
0,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,м а у а,https://yt3.ggpht.com/ytc/AAUvwnjCIDBjAqNfLrwV...,http://www.youtube.com/channel/UCkEprriaaIEwaQ...,UCkEprriaaIEwaQUW4U8_-Vg,UgziL3Fi-WzD0HGRJhR4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,Glad to see your still uploading it's been a r...,,none,0,0.0,2021-02-04T07:28:30Z,False
1,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Divya,https://yt3.ggpht.com/ytc/AAUvwnj_MWzktkxMK3iL...,http://www.youtube.com/channel/UCH0-w8zC832Rkm...,UCH0-w8zC832RkmAujjG6MXw,UgwkcwEAllQpl62tF0h4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,It's first time when YouTube is providing corr...,,none,2,1.0,2021-02-03T11:33:32Z,False
2,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Djouhaina Dadi,https://yt3.ggpht.com/ytc/AAUvwnjAPEgk6N2JuzRI...,http://www.youtube.com/channel/UCjSa4P9fPkFEe3...,UCjSa4P9fPkFEe3hgla8M5XA,UgwkcwEAllQpl62tF0h4AaABAg.9JIXWG2oOBV9JOLEg7limL,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,You see the level of this lesson,UgwkcwEAllQpl62tF0h4AaABAg,none,0,,2021-02-05T17:41:43Z,True
3,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Chimzy Emmanuel Tasie,https://yt3.ggpht.com/ytc/AAUvwnikwm6fdUW0f30b...,http://www.youtube.com/channel/UC1Uj1oTNlBoTTA...,UC1Uj1oTNlBoTTA_BU2jyhsA,UgzLq5JzBcitc1Uhcm54AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,Please check your email!,,none,0,0.0,2021-01-31T19:55:05Z,False
4,2018 AP Physics 1 Free Response Question #1,2020-10-25T21:59:08Z,8ioTJQkiPO0,Omprakash Marri,https://yt3.ggpht.com/ytc/AAUvwng7VLMeZ_R8kl2S...,http://www.youtube.com/channel/UCnYt3PULMWAsqs...,UCnYt3PULMWAsqsEvr607o-g,UgxWJr0ooce0pDNu4yV4AaABAg,https://www.youtube.com/watch?v=8ioTJQkiPO0&lc...,Does AP mean Andhra Pradesh here.....??????\nI...,,none,0,0.0,2021-01-31T17:12:22Z,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,What Happens to Electrons in a Resistor?,2020-09-27T17:46:24Z,7HxrwOWbiiM,patrickJMT,https://yt3.ggpht.com/ytc/AAUvwngq2XjoaykTqb9r...,http://www.youtube.com/channel/UCFe6jenM1Bc54q...,UCFe6jenM1Bc54qtBsIJGRZQ,Ugzm8QlbpQjHbqEpG094AaABAg.9E7SjBuRkxt9E7T6zHB868,https://www.youtube.com/watch?v=7HxrwOWbiiM&lc...,Jacob is stellar!,Ugzm8QlbpQjHbqEpG094AaABAg,none,1,,2020-09-27T21:46:31Z,True
96,What Happens to Electrons in a Resistor?,2020-09-27T17:46:24Z,7HxrwOWbiiM,Maxamed Axmedn,https://yt3.ggpht.com/ytc/AAUvwnjRxFFUKM4UT8c9...,http://www.youtube.com/channel/UC9mKAmGKGKdFhD...,UC9mKAmGKGKdFhD5wXVwuqxQ,UgyEspamIf-3nUel9_94AaABAg,https://www.youtube.com/watch?v=7HxrwOWbiiM&lc...,🤗🤗🤗🤗🤗🤗❤❤❤👋👋👋👋,,none,0,0.0,2020-09-27T18:08:44Z,False
97,What Happens to Electrons in a Resistor?,2020-09-27T17:46:24Z,7HxrwOWbiiM,Kevin Dudson,https://yt3.ggpht.com/ytc/AAUvwniaSJHbIuPgQiSG...,http://www.youtube.com/channel/UCYCiWPAtHgVeM7...,UCYCiWPAtHgVeM7Wk1AAXbaA,UgyUQ0dqfwFLXLPPrxN4AaABAg,https://www.youtube.com/watch?v=7HxrwOWbiiM&lc...,What should I do without this channel I would ...,,none,2,0.0,2020-09-27T18:05:16Z,False
98,What Happens to Electrons in a Resistor?,2020-09-27T17:46:24Z,7HxrwOWbiiM,Nataspin900,https://yt3.ggpht.com/ytc/AAUvwnh8NxicZcjP6QDi...,http://www.youtube.com/channel/UCaiF1Oh7_5qSPi...,UCaiF1Oh7_5qSPiiGXNgar3w,UgzBnGlP99iD-1uEceN4AaABAg,https://www.youtube.com/watch?v=7HxrwOWbiiM&lc...,nice,,none,1,1.0,2020-09-27T17:57:54Z,False


In [90]:
comment_frame = video_frame.loc[:,['videoTitle','textDisplay','likeCount','replyCount']]

In [91]:
for i in comment_frame.index:
    print(comment_frame.loc[i,'textDisplay']+"\n")

Glad to see your still uploading it's been a real honor.  🌹

It's first time when YouTube is providing correct auto caption 😜 as this man is clearly explaining step by step!!

You see the level of this lesson

Please check your email!

Does AP mean Andhra Pradesh here.....??????
I mean i found this channel today only and it is a foreigner and why would he solve a paper from andhra pradesh in india  i mean its all kinda too real to believe....

And all the other videos on physics are also my syllabus for Andhra pradesh 12th class final boards exams along with IIT JEE mains and advanced exam( if u dont know what is jee then u should definitely search up coz jee is and indian engineering entrance exam which is the most toughest exam in the entire world)

NOT TURKHIS!

Shoutout Patrick, you made my varsity days with math so much easier. Thank you for all you do 👌🏾👌🏾

0=1  )))

can i use the rest room

This guy helped me through calculus, without him I’m nothing, thanks Pat

Hey, I would re

## Data PreProcessing

In [92]:
! pip install emojis



In [93]:
import emojis
comment_frame.textDisplay = comment_frame.textDisplay.apply(lambda x: emojis.decode(x).replace(':', ' ').replace('_', ' '))

In [94]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
# imports
from bs4 import BeautifulSoup
import unicodedata
# from contractions import CONTRACTION_MAP # from contractions.py
import re 
import string
import nltk
import spacy
nlp = spacy.load('en',parse=True,tag=True, entity=True)
from nltk.tokenize import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# custom: removing words from list
stopword_list.remove('not')

CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [96]:
# function to remove accented characters
def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text

In [97]:
# function to expand contractions
def expand_contractions(text, map=CONTRACTION_MAP):
    pattern = re.compile('({})'.format('|'.join(map.keys())), flags=re.IGNORECASE|re.DOTALL)
    def get_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded = map.get(match) if map.get(match) else map.get(match.lower())
        expanded = first_char+expanded[1:]
        return expanded 
    new_text = pattern.sub(get_match, text)
    new_text = re.sub("'", "", new_text)
    return new_text

In [98]:
# function to remove special characters
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', text)

In [99]:
# function to remove numbers
def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)

In [100]:
# function to remove punctuation
def remove_punctuation(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

In [101]:
# function for stemming
def get_stem(text):
    stemmer = nltk.porter.PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

In [102]:
# function for Lemmatization
def get_lem(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [103]:
# function to remove stopwords
def remove_stopwords(text):
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    # check in lowercase 
    t = [token for token in tokens if token.lower() not in stopword_list]
    text = ' '.join(t)    
    return text

In [104]:
# function to remove whitespaces and tabs
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

In [105]:
# function to get lowercase characters
def to_lowercase(text):
    return text.lower()

In [106]:
# Remove HTML Tags
rows = []
for t in comment_frame['textDisplay']:
    soup = BeautifulSoup(t,"lxml")
    rows.append(soup.get_text())
comment_frame['textDisplay'] = rows

  ' Beautiful Soup.' % markup)


In [107]:
comment_frame.textDisplay = comment_frame.textDisplay.apply(lambda x:str(x).replace("’","'"))

In [108]:
for i in comment_frame.index:
    comment_frame.loc[i,'textDisplay'] = expand_contractions(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_accented_chars(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_special_characters(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_numbers(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_punctuation(comment_frame.loc[i,'textDisplay'])
    # comment_frame.loc[i,'textDisplay'] = get_stem(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = get_lem(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_stopwords(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = remove_extra_whitespace_tabs(comment_frame.loc[i,'textDisplay'])
    comment_frame.loc[i,'textDisplay'] = to_lowercase(comment_frame.loc[i,'textDisplay'])
    print(comment_frame.loc[i,'textDisplay']+"\n")

glad see still upload real honor rise

first time youtube provide correct auto caption stick tongue wink eye man clearly explain step step

see level lesson

please check email

ap mean andhra pradesh mean find channel today foreigner would solve paper andhra pradesh india mean kinda real believe video physic also syllabus andhra pradesh th class final board exam along iit jee main advanced exam u not know jee u definitely search coz jee indian engineering entrance exam tough exam entire world

not turkhis

shoutout patrick make varsity day math much easy thank ok hand ok hand



use rest room

guy help calculus without nothing thank pat

hey would really appreciate put link description question topic cover video would help us

still listen mos def make

helpfuli use watch calculus first yearit really help thank

hi patrick access video website patron already not sure need pay separately please let know lot work get blush

challenge u try indian jeeadvanced physics math

make look easy

In [109]:
comment_frame.head(100)

Unnamed: 0,videoTitle,textDisplay,likeCount,replyCount
0,2018 AP Physics 1 Free Response Question #1,glad see still upload real honor rise,0,0.0
1,2018 AP Physics 1 Free Response Question #1,first time youtube provide correct auto captio...,2,1.0
2,2018 AP Physics 1 Free Response Question #1,see level lesson,0,
3,2018 AP Physics 1 Free Response Question #1,please check email,0,0.0
4,2018 AP Physics 1 Free Response Question #1,ap mean andhra pradesh mean find channel today...,0,0.0
...,...,...,...,...
95,What Happens to Electrons in a Resistor?,jacob stellar,1,
96,What Happens to Electrons in a Resistor?,hug hug hug hug hug hug wave wave wave wave,0,0.0
97,What Happens to Electrons in a Resistor?,without channel would boot university faster c...,2,0.0
98,What Happens to Electrons in a Resistor?,nice,1,1.0


In [110]:
comment_frame.to_excel('Processed Comments.xlsx')

## Training

In [111]:
# install ktrain on Google Colab
!pip3 install ktrain



In [112]:
video_frame['textDisplay'].to_csv('textDisplay.txt')

In [113]:
pip install pycorenlp



In [114]:
# !apt update -q
# !apt-get install -q openjdk-11-jdk-headless
# !curl -L https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip -o ijava-kernel.zip
# !unzip -q ijava-kernel.zip -d ijava-kernel && cd ijava-kernel && python3 install.py --sys-prefix
# !jupyter kernelspec list

In [115]:
import sklearn
import re
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

class IsQuestionAdvanced():
    
    # Init constructor
    # Input: Type of classification: 'MNB' - Multinomial Naive Bayes | 'SVM' - Support Vector Machine
    def __init__(self, classification_type):
        self.classification_type = classification_type
        df = self.__get_data()
        df = self.__clean_data(df)
        df = self.__label_encode(df)
        vectorizer_classifier = self.__create_classifier(df, self.classification_type)
        if vectorizer_classifier is not None:
            self.vectorizer = vectorizer_classifier['vectorizer']
            self.classifier = vectorizer_classifier['classifier']        
        
    # Method (Private):  __clean_data
    # Input: Raw input dataframe
    # Processing: 1. Rename column 
    # 2. lowercase text
    # 3. preserve alpha numeric characters, whitespace, apostrophe
    # 4. filter dataframe with question types - what, who, when, affirmation, unknown
    # Return: Processed filtered dataframe
    def __clean_data(self, df):
        df.rename(columns={0: 'text', 1: 'type'}, inplace=True)
        df['type'] = df['type'].str.strip()
        df['text'] = df['text'].apply(lambda x: x.lower())
        df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s\']','',x)))
        return df[(df['type'] == 'what') | (df['type'] == 'who') | (df['type'] == 'when') | (df['type'] == 'unknown') | (df['type'] == 'affirmation')]
    

    # Method (Private): __label_encode
    # Input: Processed dataframe
    # Processing: Use label encoding to convert text label to integer label and add it to a new column
    # Return: Processed dataframe with label encoding column
    def __label_encode(self, df):
        self.le = preprocessing.LabelEncoder()
        self.le.fit(df['type'])
        df['label'] = list(self.le.transform(df['type']))
        return df
    
    # Method (Private): __create_classifier
    # Input: 1. Processed dataframe 2. Type of classification
    # Processing: 1. Perform TFIDF Vectorization
    # 2. Appy fit_tranform using TFIDF on text column
    # 3. Split data into 70% training and 30% testing
    # 4. Perform Multinomial Naive Bayes OR SVM classifcation based on input provided
    # 5. Peform prediction for both classification techniques on test data
    # 6. Show confusion matrix and accuracy
    # Return: Dict - TFIDF Vetctorizer, Classifier    
    def __create_classifier(self, df, classification_type):
        v = TfidfVectorizer(analyzer='word',lowercase=True)
        X = v.fit_transform(df['text'])
        X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.30)
        if classification_type == 'MNB':
            clf = MultinomialNB()
            clf.fit(X_train,y_train)
            preds = clf.predict(X_test)
            print(classification_report(preds,y_test))
            print('Accuracy is: ', clf.score(X_test,y_test))
            return {'vectorizer': v, 'classifier': clf}
        elif classification_type == 'SVM':
            clf_svm = SVC(kernel='linear')
            clf_svm.fit(X_train,y_train)
            preds = clf_svm.predict(X_test)
            preds = print(classification_report(preds,y_test))
            print('Accuracy is: ', clf_svm.score(X_test,y_test))
            return {'vectorizer': v, 'classifier': clf_svm}
        else:
            print("Wrong classification type: \n Type 'MNB' - Multinomial Naive Bayes \n Type 'SVM' - Support Vector Machine")    
            

    # Method (Private): __get_data
    # Processing: Get the sample input data used to create traning, test, vectorizer, classifier data
    # Return: Pandas dataframe
    def __get_data(self):
        return pd.read_csv('sample.txt', sep=',,,', header=None)
    
    # Method (Public): predict
    # Input: An unknown new sentence
    # Return: Prediction - Typpe of question 'what', 'when', 'who'
    def predict(self, sentence):
        ex = self.vectorizer.transform([sentence])
        return list(self.le.inverse_transform(self.classifier.predict(ex)))[0]


obj = IsQuestionAdvanced('SVM')

# Run on output of first method
# df_method1_out = pd.read_csv('output/method1_output.csv')
# df_method1_out = df_method1_out[df_method1_out['is_question'] == 1]
# df_method1_out['question_type'] = df_method1_out['QUERY'].apply(obj.predict)
# df_method1_out.to_csv('output/method3_output_1.csv', index=False)

# # Run on output of first method
# df_method2_out = pd.read_csv('output/method2_output.csv')
# del df_method2_out['question_type']
# df_method2_out = df_method2_out[df_method2_out['is_question'] == 1]
# df_method2_out['question_type'] = df_method2_out['QUERY'].apply(obj.predict)
# df_method2_out.to_csv('output/method3_output_2.csv', index=False)






              precision    recall  f1-score   support

           0       0.88      1.00      0.93        21
           1       0.92      0.99      0.95        74
           2       0.96      0.94      0.95       199
           3       0.79      0.76      0.78        25
           4       1.00      0.97      0.98       126

    accuracy                           0.95       445
   macro avg       0.91      0.93      0.92       445
weighted avg       0.95      0.95      0.95       445

Accuracy is:  0.950561797752809


In [116]:
obj

<__main__.IsQuestionAdvanced at 0x7f5d92c4ec18>

In [117]:
testData = pd.read_csv('textDisplay.csv')
testData['question_type'] = testData['textDisplay'].apply(obj.predict)

In [118]:
testData

Unnamed: 0.1,Unnamed: 0,textDisplay,question_type
0,0,Glad to see your still uploading it's been a r...,affirmation
1,1,It's first time when YouTube is providing corr...,when
2,2,Please check your email!,unknown
3,3,Does AP mean Andhra Pradesh here.....??????\nI...,what
4,4,NOT TURKHIS!,unknown
...,...,...,...
514,514,Thanks from a chess fan!,unknown
515,515,What is that software that allows to write so ...,what
516,516,i think jacob mentions how me makes these in t...,unknown
517,517,Hello Patrick!\nThanks for the work you're doi...,what
