In [12]:
import nltk
import numpy as np
from itertools import groupby, chain
from collections import Counter, defaultdict
import spacy
spacy_nlp = spacy.load("en_core_web_sm")

class myRake(object):
    """Rapid Automatic Keyword Extraction Algorithm customized for 
    key-word extraction on video text w/ or w/o punctuation.
    
    RAKE algorithm based off of implementation from rake-nltk by Vishwas B Sharma
    https://github.com/csurfer/rake-nltk with changes to suit personal needs.
    """
    
    def __init__(self, stopwords = None, punctuations = None, num_words = 100000,
                 use_POS = True, known_words = []):
        # Initialize the stopwords and punctuations used to break text into phrases
        self.stopwords = stopwords
        self.punctuations = punctuations
        if self.stopwords == None:
            self.stopwords =  nltk.corpus.stopwords.words('english')
        if self.punctuations == None:
            self.punctuations = list('!"#$%&\'()*+,./:;<=>?@[-\\]^_`{|}~♪')
        # This is the set of words that determines breaks between phrases
        self.phrase_breaks = set(self.stopwords + self.punctuations)
        
        # This variable determines how many words long our key-words can be
        self.num_words = num_words
        
        # This variable lets us know if we want to use regular stopwords, or incorporate POS
        self.use_POS = use_POS
        # This variable stores a list of words that we want to have more impact in terms of score
        self.known_words = known_words
        
        # Variables to calcuate RAKE score
        self.frequencies = None
        self.degrees = None
        self.key_words = None
        
    def extract_keywords(self, text):
        # Situation where text contains sentences/punctuation
        if ", " in text:
            text_list = nltk.tokenize.sent_tokenize(text)
            phrase_tuples = self.key_word_candidates(text_list)
            self.RAKE_score(phrase_tuples)
            
        # Situation where text does not contain sentences/punctuation
        else:
            text_list = nltk.tokenize.sent_tokenize(text)
            phrase_tuples = self.key_word_candidates(text_list)
            self.RAKE_score(phrase_tuples)
            # TO DO: add some sort of method to split the text up into multiple sentences
            # Convert string to list of words. After x number of words, if the word and next word do not fall in
            # ['ADJ','DET','NOUN','NUM','PART','PROPN'] category, then add a . Then convert back to string
        
    def spacy_POS_phrase_breaks(self, text):
        """
        Inputs a string of text, find the Part of Speech for each word and add words that are not
        ['ADJ','DET','NOUN','NUM','PART','PROPN'] into a set of phrase break words to ignore.
        """
        # These are POS tags that we want in our keywords.
        # Try removing ADJ, DET 
        POS_we_want = ['ADJ','DET','NOUN','NUM','PART','PROPN']
        # Initialize the set with our existing phrase breaks
        temp_phrase_breaks = self.phrase_breaks
        
        # Use spacy to tag POS and then only keep words with the POS that we want
        doc = spacy_nlp(text)
        for token in doc:
            if token.pos_ not in POS_we_want:
                temp_phrase_breaks.add(token.text.lower())
        return temp_phrase_breaks
                
        
    def key_word_candidates(self, text_list):
        """
        Input a list of text segments and generates a set of possible key-word candidates.
        """
        candidates = set()
        for text in text_list:
            # Extract all words and punctuation from text into a list
            words = [word.lower() for word in nltk.wordpunct_tokenize(text)]
            
            if self.use_POS:
                # Create a temporary set of break words based on the Part of Speech
                temp_phrase_breaks = self.spacy_POS_phrase_breaks(text)
                # group words together using phrase breaks and a separator 
                phrase_groups = groupby(words, lambda word: word not in temp_phrase_breaks)
                
            else:
                # if we don't want to use POS, just use the stopwords + punct to break phrases
                phrase_groups = groupby(words, lambda word: word not in self.phrase_breaks)
                
            # Pull out the groups of words that do not include any of the phrase breaks   
            phrase_tuples = [tuple(group[1]) for group in phrase_groups if group[0] == True]
            # Add these groups to the output set
            candidates.update(phrase_tuples)
        # make sure the number of words in each of the tuples does not go over our limit
        return set(filter(lambda x: len(x) <= self.num_words, candidates))
        
    def RAKE_score(self, phrase_tuples):
        """
        Frequency part: chain up the phrase tuples and use the counter to tally up how often each word occurs.
                        Saves a dictionary of word:count pairs in self.frequencies
        Degree part: create a default dict to keep track of how many words each word co-occurs with in 
                     the phrase tuples. There is another way that keeps track of a co-occurence graph which
                     might be useful but I didn't implement for the sake of simplicity.
        Scoring part: Calculate the RAKE score for each phrase. The RAKE score for each  word is degree/frequency
                      and the RAKE score for each phrase is the sum of each word's RAKE score.
        """
        # Frequency part
        self.frequencies = Counter(chain.from_iterable(phrase_tuples))
        
        # Degree part
        self.degrees = defaultdict(int)
        for phrase in phrase_tuples:
            for word in phrase:
                self.degrees[word] += len(phrase)
        
        # Scoring part
        self.key_words = defaultdict(float)
        phrases = list()
        scores = list()
        for phrase in phrase_tuples:
            score = 0.0
            for word in phrase:
                score += float(self.degrees[word])/float(self.frequencies[word])
                # This is to give words that we know should be keywords a boost in score
                if word in self.known_words:
                    score += 10
            phrases.append(" ".join(phrase))
            scores.append(score)
        phrases = np.array(phrases)
        scores = np.array(scores)
        # Store the phrase:score pairs in descending order into self.key_words
        for i in np.argsort(scores)[::-1]:
            self.key_words[phrases[i]] = scores[i]
    
    def get_key_words(self):
        """
        get command to return a list of keywords ordered by their RAKE score
        """
        return list(self.key_words.keys())
    
    def get_key_words_scores(self):
        """
        get command to return a list of keywords and their RAKE scores
        """
        return [(key,self.key_words[key]) for key in self.key_words]

In [2]:
import pandas as pd
df = pd.read_csv("videoExamples.csv")
examples = list(df.captions)

In [13]:
r = myRake(use_POS=True)

In [10]:
r.extract_keywords(examples[3])

In [11]:
r.get_key_words()

['forgotten type bed',
 'dji osmo action',
 'shelves october 27th',
 'mellow piano music',
 'flatter color grade',
 'mobile app experience',
 '6k30 spherical video',
 'total battery killer',
 'two memory cards',
 'time warp feature',
 'widest angle gopro',
 '360 ° video',
 'friendly 360 camera',
 'guys next time',
 'megapixel panoramic photos',
 'good touch screen',
 '4k hero mode',
 'megapixel single lens',
 'max superview photos',
 '360 desktop app',
 'accessible 360 camera',
 'time warp',
 'max superview',
 'panoramic photo',
 'upbeat music',
 'widest field',
 'color correcting',
 'october 24th',
 'single lens',
 '360 camera',
 '360 degrees',
 '360 mode',
 'gopro app',
 'degree photos',
 'video director',
 'degree camera',
 'tough time',
 'time lapse',
 'camera whirring',
 'novelty camera',
 '360 footage',
 'full day',
 'great hardware',
 'hero mode',
 'fresh charge',
 'rubber buttons',
 'gopro fusion',
 'one x',
 'skin tones',
 'record button',
 'extra spares',
 'gopro max',
 'new 

In [58]:
r.get_key_words_scores()

[('dji osmo action', 9.0),
 ('forgotten type bed', 9.0),
 ('shelves october 27th', 8.5),
 ('mellow piano music', 8.5),
 ('flatter color grade', 8.5),
 ('mobile app experience', 8.25),
 ('6k30 spherical video', 8.166666666666666),
 ('total battery killer', 8.0),
 ('two memory cards', 8.0),
 ('time warp feature', 7.642857142857142),
 ('widest angle gopro', 7.5),
 ('360 ° video', 7.4393939393939394),
 ('friendly 360 camera', 7.415584415584416),
 ('guys next time', 7.142857142857142),
 ('megapixel panoramic photos', 7.033333333333334),
 ('4k hero mode', 7.0),
 ('good touch screen', 7.0),
 ('megapixel single lens', 6.833333333333334),
 ('max superview photos', 6.7),
 ('360 desktop app', 6.522727272727273),
 ('accessible 360 camera', 6.415584415584416),
 ('time warp', 4.642857142857142),
 ('color correcting', 4.5),
 ('max superview', 4.5),
 ('single lens', 4.5),
 ('widest field', 4.5),
 ('october 24th', 4.5),
 ('panoramic photo', 4.5),
 ('upbeat music', 4.5),
 ('360 camera', 4.41558441558441

In [14]:
r = myRake(use_POS=True, known_words=["battery","camera","screen"])

In [15]:
r.extract_keywords(examples[3])

In [16]:
r.get_key_words_scores()

[('total battery killer', 18.0),
 ('friendly 360 camera', 17.415584415584416),
 ('good touch screen', 17.0),
 ('accessible 360 camera', 16.415584415584416),
 ('360 camera', 14.415584415584416),
 ('degree camera', 14.142857142857142),
 ('novelty camera', 14.142857142857142),
 ('camera whirring', 14.142857142857142),
 ('battery performance', 14.0),
 ('different battery', 13.5),
 ('camera', 12.142857142857142),
 ('screen', 12.0),
 ('battery', 12.0),
 ('forgotten type bed', 9.0),
 ('dji osmo action', 9.0),
 ('flatter color grade', 8.5),
 ('shelves october 27th', 8.5),
 ('mellow piano music', 8.5),
 ('mobile app experience', 8.25),
 ('6k30 spherical video', 8.166666666666666),
 ('two memory cards', 8.0),
 ('time warp feature', 7.642857142857142),
 ('widest angle gopro', 7.5),
 ('360 ° video', 7.4393939393939394),
 ('guys next time', 7.142857142857142),
 ('megapixel panoramic photos', 7.033333333333334),
 ('4k hero mode', 7.0),
 ('megapixel single lens', 6.833333333333334),
 ('max superview 