In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Data Preprocessing
def preprocessing(text):
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]

    print( "- tokenize into words -" )
    print( tokens )
    print()
    
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    print( "- remove stopwords -" )
    print( tokens )
    print()
    
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    print( "- remove words less than three letters -" )
    print( tokens )
    print()
    
    # lower capitalization
    tokens = [word.lower() for word in tokens]

    print( "- lower capitalization -" )
    print( tokens )
    print()
    
    # lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    print( "- lemmatization -" )
    print( tokens )
    print()

    tokens = [lmtzr.lemmatize(word, 'v') for word in tokens]

    print( "- lemmatization/verb -" )
    print( tokens )
    print()

    # stemming
    stemmer = PorterStemmer()
    tokens = [ stemmer.stem(word) for word in tokens ]

    print( "- stemming -" )
    print(tokens)
    print()
    
    
    return tokens

# Transcript data input
data = "this video is going to explain how to find inflection points on a graph. So let's suppose we have a graph. So here's a nice graph. This is the function f of T independent variable T and were asked to find What are the inflection points? Well, remember, inflection points are places where the con cavity changes, so it's a good idea to remember what Con cavity is. So for con cavity, we have con cave up where the graph is turning up or con cave down where the graph is turning down. So in order to figure out where the con cavity changes, we need to figure out where what the con cavity is and where it's con cave up and where it's Khan came down. So let's start from left to right. We started on the left hand side. We look at this function. Looks like this function is con cave up. So we're turning upwards now at some point, and that's the inflection point. We start to turn downwards so you can see along here getting red. I'm turning downwards, whereas before, up until the red part, I was turning upwards. Okay, Now the changes where the inflection point is that change Looks like it's about right here. So what you have to imagine is you sort of turn your screen sideways and you're a race car that's racing down this road and you're turning left. You keep turning left. That means you're Khan gave up and eventually you turn your race car to the right In your race cars, you're traveling from left to right. And as we turn to the right here, we're going con cave down and then we can Are we continue along our highway or highway takes a turn to the left. That means we're con cave up until we get to while we keep turning until we get to about right here where we start turning to the right So this red is indicating returning to the right. We keep turning right as we move down the road until maybe about right here where we start turning left again And so we're We're along this red segment of the road where con cave down We're turning down or if you like, we're turning to the right as we go along this particular road from left to right and then we start turning up again, turning to the left until maybe somewhere around here where that we have a nuke turn in the road and the road turns to the right. So the white here is turning to the left or con cave up. We have an inflection point at this green point here. That's where the con cavity changes where we change the direction into which were turning. We're no longer turning up. We're turning down con cave down. Then we have another inflection point where now we're turning up. We have an inflection point here because now we're going to turn down. Then we start turning up again. Then we turned down So these green points of the places where we change the direction in which returning and the white along the white intervals were turning up along the red intervals were turning down. So these, in fact, are the inflection points. These are the places where we change the direction in which returning. So in this case it looks like there are altogether five inflection points. Those are the places where the can cavity changes"

# preprocessing Transcript data
result = preprocessing(data)

# print result
print("-result-")
print (result)

- tokenize into words -
['this', 'video', 'is', 'going', 'to', 'explain', 'how', 'to', 'find', 'inflection', 'points', 'on', 'a', 'graph', '.', 'So', 'let', "'s", 'suppose', 'we', 'have', 'a', 'graph', '.', 'So', 'here', "'s", 'a', 'nice', 'graph', '.', 'This', 'is', 'the', 'function', 'f', 'of', 'T', 'independent', 'variable', 'T', 'and', 'were', 'asked', 'to', 'find', 'What', 'are', 'the', 'inflection', 'points', '?', 'Well', ',', 'remember', ',', 'inflection', 'points', 'are', 'places', 'where', 'the', 'con', 'cavity', 'changes', ',', 'so', 'it', "'s", 'a', 'good', 'idea', 'to', 'remember', 'what', 'Con', 'cavity', 'is', '.', 'So', 'for', 'con', 'cavity', ',', 'we', 'have', 'con', 'cave', 'up', 'where', 'the', 'graph', 'is', 'turning', 'up', 'or', 'con', 'cave', 'down', 'where', 'the', 'graph', 'is', 'turning', 'down', '.', 'So', 'in', 'order', 'to', 'figure', 'out', 'where', 'the', 'con', 'cavity', 'changes', ',', 'we', 'need', 'to', 'figure', 'out', 'where', 'what', 'the', 'con', 

In [11]:
print()
print("word_list")

word_list = list(set(result))
word_len = len(word_list)

print(word_list)

print("word_len")
print(word_len)

word_cnt = {}

for i in result:
    if(i in word_list):
        if(i not in word_cnt.keys()):
            word_cnt[i] = 1
        else:
            word_cnt[i] += 1
    else:
        continue

print(word_cnt)


word_list
['keep', 'road', 'side', 'explain', 'longer', 'mean', 'let', 'travel', 'like', 'place', 'sideway', 'indic', 'good', 'race', 'graph', 'idea', 'khan', 'go', 'along', 'nice', 'return', 'take', 'okay', 'con', 'get', 'look', 'red', 'well', 'upward', 'independ', 'fact', 'ask', 'and', 'variabl', 'thi', 'that', 'chang', 'screen', 'sort', 'caviti', 'car', 'interv', 'altogeth', 'highway', 'leav', 'somewher', 'see', 'order', 'right', 'five', 'particular', 'downward', 'direct', 'these', 'hand', 'wherea', 'what', 'give', 'green', 'rememb', 'turn', 'find', 'come', 'white', 'need', 'anoth', 'cave', 'inflect', 'you', 'mayb', 'segment', 'part', 'now', 'eventu', 'nuke', 'suppos', 'be', 'move', 'those', 'around', 'video', 'point', 'then', 'case', 'start', 'function', "'re", 'imagin', 'figur', 'continu']
word_len
90
{'video': 1, 'go': 3, 'explain': 1, 'find': 2, 'inflect': 10, 'point': 13, 'graph': 5, 'let': 2, 'suppos': 1, 'nice': 1, 'thi': 1, 'function': 3, 'independ': 1, 'variabl': 1, 'ask':

In [20]:
sort_word_cnt = sorted(word_cnt, key=lambda x:-word_cnt[x])
print(sort_word_cnt)

['turn', 'con', "'re", 'point', 'right', 'inflect', 'leav', 'chang', 'cave', 'caviti', 'start', 'along', 'road', 'graph', 'red', 'place', 'look', 'like', 'race', 'go', 'function', 'get', 'car', 'keep', 'that', 'return', 'white', 'direct', 'then', 'find', 'let', 'rememb', 'figur', 'khan', 'upward', 'downward', 'mean', 'and', 'highway', 'mayb', 'green', 'interv', 'video', 'explain', 'suppos', 'nice', 'thi', 'independ', 'variabl', 'ask', 'what', 'well', 'good', 'idea', 'order', 'need', 'come', 'hand', 'side', 'see', 'wherea', 'part', 'okay', 'now', 'imagin', 'sort', 'screen', 'sideway', 'you', 'give', 'eventu', 'travel', 'be', 'continu', 'take', 'indic', 'move', 'segment', 'particular', 'somewher', 'around', 'nuke', 'longer', 'anoth', 'fact', 'these', 'case', 'altogeth', 'five', 'those']


In [22]:
print(sort_word_cnt[:10])

['turn', 'con', "'re", 'point', 'right', 'inflect', 'leav', 'chang', 'cave', 'caviti']
