In [235]:
import nltk
import numpy as np
from itertools import groupby, chain
from collections import Counter, defaultdict
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
import re

class StatementExtracter(object):
    """
    As part of key word extraction and sentiment summarization, this code will be
    able to extract certain types of statements from a block of text.

    A "statement" here is a short "<subject> is/are <description>" string built
    from the dependency subtree of every "is"/"are" token found by spaCy.
    The module-level ``spacy_nlp`` pipeline is used for all parsing.
    """

    # Characters treated as phrase breaks when the caller supplies none.
    _DEFAULT_PUNCTUATIONS = '!"#%&\'()*+,./:;<=>?@[\\]^_`{|}~♪'

    # Compiled once for the whole class instead of on every call.
    # Order matters: the specific forms ("won't", "can't", ...) must run before
    # the generic "(\w+)n't" rule, otherwise they become "wo not" / "ca not".
    # Replacements are raw strings so that "\g<1>" is not an invalid escape.
    _CONTRACTION_PATTERNS = [
        (re.compile(regex), repl) for (regex, repl) in [
            (r"won't", r"will not"), (r"Won't", r"Will not"),
            (r"can't", r"can not"), (r"Can't", r"Can not"),
            (r"i'm", r"i am"), (r"I'm", r"I am"),
            (r"ain't", r"is not"),
            (r"(\w+)'ll", r"\g<1> will"),
            (r"(\w+)n't", r"\g<1> not"),
            (r"(\w+)'ve", r"\g<1> have"),
            (r"(\w+)'d", r"\g<1> would"),
            (r"&", r"and"),
            (r"dammit", r"damn it"),
            # Word boundaries keep these from firing inside longer words
            # (e.g. "wonton" must not become "will noton").
            (r"\bdont\b", r"do not"), (r"\bwont\b", r"will not"),
            (r"they're", r"they are"), (r"They're", r"They are"),
            (r"it's", r"it is"), (r"It's", r"It is"),
        ]
    ]

    # (left POS, left tags, right POS, right tags) that are allowed to stay in a
    # phrase when the input text already carries its own punctuation.
    _PUNCTUATED_RULES = (
        ['ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'PUNCT', 'ADP', 'PRON', 'AUX', 'SYM', 'SCONJ'],
        [],
        ['ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'ADV', 'PUNCT', 'ADP', 'AUX', 'SYM', 'SCONJ', 'CCONJ', 'PRON'],
        ['VBG', 'VBZ', 'VBP', 'VB', 'VBD', 'VBN'],
    )
    # Same layout, used when periods were inserted by ``add_periods``.
    _UNPUNCTUATED_RULES = (
        ['ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'PRON'],
        [],
        ['ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'ADV', 'ADP', 'CCONJ', 'SCONJ'],
        ['VB', 'VBG'],
    )

    # Subtrees with this many tokens or more are skipped as too long.
    _MAX_SUBTREE_TOKENS = 30

    def __init__(self, stopwords = None, punctuations = None):
        """
        stopwords: optional iterable of words that always break a phrase
            (defaults to an empty list).
        punctuations: optional iterable of punctuation strings that always
            break a phrase (defaults to a built-in character list).
        """
        self.stopwords = [] if stopwords is None else stopwords
        if punctuations is None:
            punctuations = list(self._DEFAULT_PUNCTUATIONS)
        self.punctuations = punctuations
        # Set union accepts any iterable (list, set, tuple, str), whereas list
        # concatenation only worked when both arguments were lists.
        self.phrase_breaks = set(self.stopwords) | set(self.punctuations)

    def replace_contraction(self, text):
        """
        Takes in text and replaces certain contractions with their expanded
        form (e.g. "won't" -> "will not", "&" -> "and"). Returns the new text.
        """
        for (pattern, repl) in self._CONTRACTION_PATTERNS:
            text = pattern.sub(repl, text)
        return text

    @staticmethod
    def _merge_short_sentences(sentence_texts):
        """
        Fold every sentence without a space (a one-word sentence) into the end of
        the sentence before it. One-word sentences at the very start have no
        predecessor, so they are prefixed to the first longer sentence instead
        (or kept together as a single sentence if nothing longer follows).
        Returns the merged sentences as a list, without periods.
        """
        merged = []
        leading_words = []
        for sentence in sentence_texts:
            if " " in sentence:
                if leading_words:
                    sentence = " ".join(leading_words + [sentence])
                    leading_words = []
                merged.append(sentence)
            elif merged:
                merged[-1] += " " + sentence
            else:
                leading_words.append(sentence)
        if leading_words:
            merged.append(" ".join(leading_words))
        return merged

    def add_periods(self, text):
        """
        Takes in a string of text with no punctuation, uses Spacy's method of breaking up sentences to 
        add periods to the end of these sentences. Returns text with periods.

        One-word sentences are merged into a neighbouring sentence rather than
        being emitted on their own; no word of the input is discarded.
        """
        doc = spacy_nlp(text)
        sentences = self._merge_short_sentences(sent.text for sent in doc.sents)
        return " ".join(sentence + '.' for sentence in sentences)

    def spacy_POS_phrase_breaks(self, doc, POS_we_want, tag_we_want):
        """
        Inputs an iterable of Spacy tokens, a list of POS, and a list of Spacy Tags that we want to keep in.
        Every token whose POS is not in POS_we_want and whose tag is not in
        tag_we_want has its lower-cased text added to a copy of the phrase breaks.
        Returns that new set; self.phrase_breaks itself is left untouched.
        """
        # Initialize the set with our existing phrase breaks
        temp_phrase_breaks = self.phrase_breaks.copy()

        for token in doc:
            if token.pos_ not in POS_we_want and token.tag_ not in tag_we_want:
                temp_phrase_breaks.add(token.text.lower())
        return temp_phrase_breaks

    def visualize_POS(self, text, punctuation = False):
        """
        Visualize the POS of each word in the text.

        punctuation: pass True when the text already contains punctuation;
            otherwise periods are inserted with add_periods first.
        Prints one "text/POS/tag <--dep-- head/POS/tag" line per token.
        """
        if not punctuation:
            text = self.add_periods(text)
        text = self.replace_contraction(text)
        doc = spacy_nlp(text)
        for token in doc:
            print("{0}/{1}/{2} <--{3}-- {4}/{5}/{6}".format(
                    token.text,token.pos_,token.tag_,token.dep_,token.head.text,token.head.pos_,token.head.tag_))

    def is_statements(self, text):
        """
        Print every "is/are" statement found in text. The presence of ", " is
        used as the signal that the text already carries its own punctuation.
        """
        # Situation where text contains sentences/punctuation
        if ", " in text:
            text = self.replace_contraction(text)
            self.statementExtraction(text, withPunctuation = True)
        # Situation where text does not contain sentences/punctuation
        else:
            text = self.add_periods(text)
            text = self.replace_contraction(text)
            self.statementExtraction(text, withPunctuation = False)

    def statementExtraction(self, text, withPunctuation = False):
        """
        Parse text and print one "<subject> is/are <description>" line for each
        "is"/"are" token that yields both a subject and a description.

        withPunctuation: selects which POS/tag lists are allowed to remain inside
            the subject and description phrases.
        """
        for statement in self._iter_statements(text, withPunctuation):
            print(statement)

    def _iter_statements(self, text, withPunctuation):
        """Yield the statement strings that statementExtraction prints."""
        # A plain truthiness test: the old pair of `== True` / `== False` checks
        # left the lists undefined for any flag that was not exactly a bool.
        if withPunctuation:
            left_POS, left_tags, right_POS, right_tags = self._PUNCTUATED_RULES
        else:
            left_POS, left_tags, right_POS, right_tags = self._UNPUNCTUATED_RULES

        doc = spacy_nlp(text)
        for token in doc:
            # If the token is "is" or "are", we want to look at the subtree
            if token.text not in ('is', 'are'):
                continue
            subtree = list(token.subtree)

            # Split the subtree up into left and right groups
            left_subtree = [word for word in subtree if word.i < token.i]
            right_subtree = [word for word in subtree if word.i > token.i]

            # Create a temporary set of break words based on the Part of Speech
            left_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, left_POS, left_tags)
            right_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, right_POS, right_tags)

            left_phrases = self._phrases_between_breaks(left_subtree, left_phrase_breaks)
            right_phrases = self._phrases_between_breaks(right_subtree, right_phrase_breaks)

            subject = self._pick_subject(left_phrases)
            # The description is simply the first phrase to the right of is/are.
            description = right_phrases[0] if right_phrases else ()

            if subject and description and len(subtree) < self._MAX_SUBTREE_TOKENS:
                yield (" ".join([word.text for word in subject]) + " " + token.text + " "
                       + " ".join([word.text for word in description]))

    @staticmethod
    def _phrases_between_breaks(words, phrase_breaks):
        """
        Group consecutive words using phrase breaks as separators and return the
        runs that contain no break word, each as a tuple of tokens.
        """
        groups = groupby(words, lambda word: word.text.lower() not in phrase_breaks)
        return [tuple(group) for keep, group in groups if keep]

    @staticmethod
    def _pick_subject(phrases):
        """
        Return the phrase to use as subject: one containing an 'nsubj'/'expl'
        word. A later candidate replaces the current one unless the current
        subject word is a NOUN and the new one is not. Returns () if none found.
        """
        subject = ()
        subject_word = None
        for phrase in phrases:
            for word in phrase:
                if word.dep_ not in ('nsubj', 'expl'):
                    continue
                # Keep an existing NOUN subject over a non-NOUN challenger.
                if subject_word is not None and subject_word.pos_ == 'NOUN' and word.pos_ != 'NOUN':
                    continue
                subject_word = word
                subject = phrase
                break
        return subject

In [115]:
# Load the example CSV and keep the values of its `captions` column as a list.
import pandas as pd
df = pd.read_csv("videoExamples.csv")
examples = list(df.captions)

In [236]:
# Build an extractor with the default (empty) stopwords and default punctuation list.
se = StatementExtracter()

In [239]:
# Print the "is/are" statements extracted from the caption at index 2.
se.is_statements(examples[2])

it is still a lot of money for a Chromebook
the pixel book is a nice device
it is a phone or pedestrian computer than
it is thicker than that device still
it is glass an aluminum lid
it is good to see Google 's more playful side on this otherwise dull laptop and while the goal
it is much heavier than the go for example
such there are some other notable changes to the go
it is not a convertible so tablet mode
it is just not nearly as gorgeous to look at as those last two computers 1080p on a 13.3 inch screen
the good news is that the go as much slimmer bezels around the screen than the original pixel book
the go is almost as good
they are pretty quiet Google about the whole thing right here
the trackpad is just fine
it is not bad
which is frustrating
Android apps are not usually worth using if
it is a worthwhile upgrade for
which is a definite improvement over the first pixel book
the pixel book go is a much easier device to evaluate than either the first pixel book or the pixel
it is a

In [184]:
# Print each token's POS, tag and dependency arc for the caption at index 2.
se.visualize_POS(examples[2])

 /SPACE/_SP <---- Google/PROPN/NNP
Google/PROPN/NNP <--nsubj-- building/VERB/VBG
has/AUX/VBZ <--aux-- building/VERB/VBG
been/AUX/VBN <--aux-- building/VERB/VBG
building/VERB/VBG <--ROOT-- building/VERB/VBG
its/PRON/PRP$ <--poss-- Chromebooks/PROPN/NNP
own/ADJ/JJ <--amod-- Chromebooks/PROPN/NNP
Chromebooks/PROPN/NNP <--dobj-- building/VERB/VBG
for/ADP/IN <--prep-- building/VERB/VBG
a/DET/DT <--det-- while/NOUN/NN
while/NOUN/NN <--pobj-- for/ADP/IN
now/ADV/RB <--advmod-- while/NOUN/NN
./PUNCT/. <--punct-- building/VERB/VBG
first/ADV/RB <--advmod-- were/AUX/VBD
there/PRON/EX <--expl-- were/AUX/VBD
were/AUX/VBD <--ROOT-- were/AUX/VBD
two/NUM/CD <--nummod-- iterations/NOUN/NNS
iterations/NOUN/NNS <--attr-- were/AUX/VBD
of/ADP/IN <--prep-- iterations/NOUN/NNS
the/DET/DT <--det-- pixel/NOUN/NN
Chromebook/PROPN/NNP <--compound-- pixel/NOUN/NN
pixel/NOUN/NN <--pobj-- of/ADP/IN
and/CCONJ/CC <--cc-- were/AUX/VBD
then/ADV/RB <--advmod-- was/AUX/VBD
there/PRON/EX <--expl-- was/AUX/VBD
was/AUX/VBD <

In [205]:
# Display the raw caption text at index 0.
examples[0]

" [Music] hey what is up guys mkbhd here this is the pixel for excel this is one of the phones I was looking forward to the most this entire year for 2019 for a couple of reasons and so now I've been using it daily since its unveiling this is my honest review so let's just start with the wait looks I honestly think it's a pretty decent looking phone it's low-key really clean there's almost no markings along the back just the Google G at the bottom matte black aluminum rails as part of the design all the way around the phone speaker slots at the bottom lined up with the USB type-c port and the colored power button of course on every version and the whole back of the phone on two of the colors is this soft touch matte finish which does a great job of not showing fingerprints it doesn't seem to scratch very easily at all so of the three colors available I'm gonna say this white this Panda version is the best one that power buttons pretty sweet then I'm gonna say oh so orange is in second 