In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [9]:
import nltk
import numpy as np
from itertools import groupby, chain
from collections import Counter, defaultdict
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
import re
#import boto3

class StatementExtracter(object):
    """
    Extracts short "<subject> is/are <description>" statements from a block of text,
    as part of key word extraction and sentiment summarization.

    All parsing is delegated to the module-level spaCy pipeline ``spacy_nlp``.
    """

    # Punctuation characters treated as phrase breaks when none are supplied.
    _DEFAULT_PUNCTUATIONS = '!"#%&\'()*+,./:;<=>?@[\\]^_`{|}~♪'

    # Statements whose "is/are" subtree has this many tokens or more are discarded.
    MAX_SUBTREE_TOKENS = 30

    # Ordered (compiled regex, replacement) pairs used by replace_contraction.
    # Order matters: the specific forms (won't, can't, ain't) must run before the
    # generic "n't" rule. Compiled once here instead of on every call.
    _CONTRACTION_PATTERNS = [
        (re.compile(regex), repl) for (regex, repl) in [
            (r"won't", 'will not'), (r"can't", 'can not'), (r"i'm", 'i am'),
            (r"ain't", 'is not'), (r"(\w+)'ll", r'\g<1> will'), (r"(\w+)n't", r'\g<1> not'),
            (r"(\w+)'ve", r'\g<1> have'), (r"(\w+)'d", r'\g<1> would'), (r'&', 'and'),
            (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not'),
            (r"they're", 'they are'), (r"They're", 'They are'), (r"it's", 'it is'), (r"It's", 'It is'),
        ]
    ]

    # Per-mode extraction settings, keyed by whether the text has punctuation.
    #   'left' / 'right'        : (POS, tags, deps) whitelists; a word matching none of the
    #                             three becomes a phrase break on that side of "is/are".
    #   'subject_pos'           : POS values a subject candidate must have (None = any).
    #   'strong_subject_pos'    : a candidate with one of these POS values is not replaced
    #                             by a later candidate that lacks them.
    #   'description_needs_adj' : only keep a description that contains an adjective.
    _EXTRACTION_RULES = {
        True: {
            'left': (
                ['ADJ','DET','NOUN','NUM','PART','PROPN','PUNCT','ADP','PRON','AUX','SYM','SCONJ'],
                [],
                [],
            ),
            'right': (
                ['ADJ','DET','NOUN','NUM','PART','PROPN','ADV','PUNCT','ADP','AUX','SYM','SCONJ','CCONJ','PRON'],
                ['VBG','VBZ','VBP','VB','VBD','VBN'],
                [],
            ),
            'subject_pos': None,
            'strong_subject_pos': ('NOUN',),
            'description_needs_adj': False,
        },
        False: {
            'left': (
                ['NOUN'],
                ['NNP','PRP','HYPH','JJ'],
                ['advmod','amod','neg', 'nsubj'],
            ),
            'right': (
                ['ADJ','DET','NOUN','NUM','PART','PROPN','ADV','ADP'],
                ['VB','VBG','HYPH','VBN','UH'],
                ['intj','prep','neg','pobj'],
            ),
            'subject_pos': ('NOUN','DET','PRON'),
            'strong_subject_pos': ('NOUN','DET'),
            'description_needs_adj': True,
        },
    }

    def __init__(self, stopwords = None, punctuations = None):
        """
        stopwords:    iterable of words that always break a phrase (default: none).
        punctuations: iterable of punctuation strings that always break a phrase
                      (default: the characters in _DEFAULT_PUNCTUATIONS).
        """
        self.stopwords = [] if stopwords is None else stopwords
        self.punctuations = list(self._DEFAULT_PUNCTUATIONS) if punctuations is None else punctuations
        # Union of sets rather than list concatenation so any iterable is accepted.
        self.phrase_breaks = set(self.stopwords) | set(self.punctuations)

    def replace_contraction(self, text):
        """
        Takes in text and replaces certain contractions (e.g. "won't" -> "will not",
        "&" -> "and"). The patterns are case-sensitive and are applied in order.
        Returns the rewritten text.
        """
        for (pattern, repl) in self._CONTRACTION_PATTERNS:
            text = pattern.sub(repl, text)
        return text

    def add_periods(self, text):
        """
        Takes in a string of text with no punctuation, uses Spacy's method of breaking up sentences to
        add periods to the end of these sentences. Returns text with periods.

        A "sentence" that is a single word is folded into the end of the sentence before it.
        Single-word sentences that come before any multi-word sentence are prepended to the
        first one (or form the only sentence if there is nothing else), so no input text is lost.
        """
        doc = spacy_nlp(text)
        merged = []
        leading_words = []
        for sentence in (sent.text for sent in doc.sents):
            if not sentence.strip():
                # Nothing to keep from an empty / whitespace-only sentence.
                continue
            if " " in sentence:
                merged.append(sentence)
            elif merged:
                # One-word sentence: attach it to the end of the previous sentence.
                merged[-1] = merged[-1] + ' ' + sentence
            else:
                # One-word sentence with no previous sentence to attach to yet.
                leading_words.append(sentence)
        if leading_words:
            prefix = ' '.join(leading_words)
            if merged:
                merged[0] = prefix + ' ' + merged[0]
            else:
                merged.append(prefix)
        return " ".join(sentence + '.' for sentence in merged)

    def spacy_POS_phrase_breaks(self, doc, POS_we_want, tag_we_want, dep_we_want):
        """
        Inputs an iterable of Spacy tokens plus the POS, tag and dependency labels we want to keep.
        Every token that matches none of the three lists has its lower-cased text added to a
        copy of self.phrase_breaks. Returns that copy; self.phrase_breaks is not modified.
        """
        # Initialize the set with our existing phrase breaks
        temp_phrase_breaks = self.phrase_breaks.copy()
        temp_phrase_breaks.update(
            token.text.lower() for token in doc
            if token.pos_ not in POS_we_want
            and token.tag_ not in tag_we_want
            and token.dep_ not in dep_we_want
        )
        return temp_phrase_breaks

    def visualize_POS(self, text, punctuation = False):
        """
        Visualize the POS of each word in the text.

        Prints one line per token: text/POS/tag, the dependency label, and the head's text/POS/tag.
        When punctuation is False, periods are first added with add_periods.
        """
        if not punctuation:
            text = self.add_periods(text)
        text = self.replace_contraction(text)
        doc = spacy_nlp(text)
        for token in doc:
            print("{0}/{1}/{2} <--{3}-- {4}/{5}/{6}".format(
                    token.text,token.pos_,token.tag_,token.dep_,token.head.text,token.head.pos_,token.head.tag_))

    def is_statements(self, text, verbose = True):
        """
        Extract "is/are" statements from text, choosing the extraction mode automatically.

        Text containing ", " is treated as already punctuated; otherwise periods are added
        with add_periods first. Contractions are expanded in both cases.
        verbose is forwarded to statementExtraction. Returns a set of statement strings.
        """
        has_punctuation = ", " in text
        if not has_punctuation:
            # Situation where text does not contain sentences/punctuation
            text = self.add_periods(text)
        text = self.replace_contraction(text)
        return self.statementExtraction(text, withPunctuation = has_punctuation, verbose = verbose)

    @staticmethod
    def _split_phrases(words, phrase_breaks):
        """Group consecutive words that are not phrase breaks into tuples, dropping the breaks."""
        groups = groupby(words, lambda word: word.text.lower() not in phrase_breaks)
        return [tuple(group) for (keep, group) in groups if keep]

    @staticmethod
    def _pick_subject(phrases, candidate_pos, strong_pos):
        """
        Return the phrase (tuple of tokens) holding the best subject candidate, or [] if none.

        A candidate is a word whose dependency is 'nsubj' or 'expl' (and whose POS is in
        candidate_pos, when that is not None). Later candidates replace earlier ones unless
        the current one has a POS in strong_pos and the new one does not.
        """
        subject = []
        subject_word = None
        for phrase in phrases:
            for word in phrase:
                if word.dep_ not in ('nsubj', 'expl'):
                    continue
                if candidate_pos is not None and word.pos_ not in candidate_pos:
                    continue
                # Keep an existing strong candidate over a weaker newcomer.
                if (subject_word is not None
                        and subject_word.pos_ in strong_pos
                        and word.pos_ not in strong_pos):
                    continue
                subject_word = word
                subject = phrase
                # The phrase is now selected; move on to the next phrase.
                break
        return subject

    @staticmethod
    def _pick_description(phrases, needs_adjective):
        """Return the first phrase (optionally only if it contains an ADJ), or [] if none."""
        if not phrases:
            return []
        first = phrases[0]
        if needs_adjective and not any(word.pos_ == "ADJ" for word in first):
            return []
        return first

    def statementExtraction(self, text, withPunctuation = False, verbose = True):
        """
        Parse text with Spacy and build a "<subject> is/are <description>" string for every
        token whose text is exactly "is" or "are".

        withPunctuation selects which POS/tag/dependency whitelists and subject/description
        rules are used (see _EXTRACTION_RULES). When verbose is True (the default) each
        subtree and each accepted statement is printed. A statement is kept only when both a
        subject and a description were found and the subtree has fewer than
        MAX_SUBTREE_TOKENS tokens. Returns a set of statement strings.
        """
        rules = self._EXTRACTION_RULES[bool(withPunctuation)]
        doc = spacy_nlp(text)
        statements = set()
        for token in doc:
            # If the token is "is" or "are", we want to look at the subtree
            if token.text not in ('is', 'are'):
                continue

            subtree = list(token.subtree)

            # Split the subtree up into left and right groups
            left_subtree = [word for word in subtree if word.i < token.i]
            right_subtree = [word for word in subtree if word.i > token.i]

            # Break words are computed over the whole subtree, separately for each side.
            left_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, *rules['left'])
            right_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, *rules['right'])

            left_phrase_tuples = self._split_phrases(left_subtree, left_phrase_breaks)
            right_phrase_tuples = self._split_phrases(right_subtree, right_phrase_breaks)

            subject = self._pick_subject(
                left_phrase_tuples, rules['subject_pos'], rules['strong_subject_pos'])
            description = self._pick_description(
                right_phrase_tuples, rules['description_needs_adj'])

            # Create the statements
            if verbose:
                print(subtree)
            if len(subject) > 0 and len(description) > 0 and len(subtree) < self.MAX_SUBTREE_TOKENS:
                output = " ".join([word.text for word in subject]) + " " + token.text + " " + " ".join([word.text for word in description])
                if verbose:
                    print(output+"\n")
                statements.add(output)
        return statements

In [10]:
# Load the example videos and keep their captions as a plain Python list.
import pandas as pd
df = pd.read_csv("videoExamples.csv")
examples = df["captions"].tolist()

In [11]:
# Build an extracter with its defaults: no stopwords and the built-in punctuation list.
se = StatementExtracter()

In [12]:
# Extract the "is/are" statements from the caption at index 5. is_statements also prints
# every parsed subtree and every statement it keeps, which is the output shown below.
statements = se.is_statements(examples[5])

[ , -, Alright, ,, it, is, iPhone, time, .]
it is iPhone time

[the, iPhone, 11, is, the, phone, most, people, in, the, iOS, ecosystem, should, get, if, they, are, upgrading]
the iPhone 11 is the phone most people in the iOS ecosystem should get if they are upgrading

[are]
[But, this, ,, this, is, the, Pro, review, .]
this is the Pro review

[They, are, basically, just, nice, updates, to, the, iPhone, XS, ,, but, I, also, think, it, is, a, waste, of, time, to, argue, about, names, .]
They are basically just nice updates to the iPhone XS

[it, is, a, waste, of, time, to, argue, about, names]
it is a waste of time to argue about names

[iPhone, 11, Pro, Max, is, a, bad, name, .]
iPhone 11 Pro Max is a bad name

[what, ,, it, is, a, great, phone]
it is a great phone

[This, is, the, best, camera, I, have, ever, seen, on, a, phone, .]
This is the best camera I have ever seen on a phone

[It, is, a, little, heavier, and, thicker, ,, but, unless, you, 're, comparing, them, directly, ,, you,

In [90]:
# Print text/POS/tag, dependency label and head for every token of the caption at index 1.
se.visualize_POS(examples[1])

 /SPACE/_SP <---- it/PRON/PRP
it/PRON/PRP <--nsubj-- is/AUX/VBZ
is/AUX/VBZ <--ROOT-- is/AUX/VBZ
Google/PROPN/NNP <--compound-- pixel/NOUN/NN
pixel/NOUN/NN <--attr-- is/AUX/VBZ
for/ADP/IN <--prep-- pixel/NOUN/NN
day/NOUN/NN <--pobj-- for/ADP/IN
and/CCONJ/CC <--cc-- is/AUX/VBZ
of/ADV/RB <--prep-- know/VERB/VB
course/NOUN/NN <--pobj-- of/ADV/RB
as/SCONJ/IN <--mark-- know/VERB/VB
you/PRON/PRP <--nsubj-- know/VERB/VB
should/AUX/MD <--aux-- know/VERB/VB
know/VERB/VB <--conj-- is/AUX/VBZ
we/PRON/PRP <--nsubj-- got/VERB/VBN
have/AUX/VBP <--aux-- got/VERB/VBN
got/VERB/VBN <--ccomp-- know/VERB/VB
every/DET/DT <--det-- version/NOUN/NN
single/ADJ/JJ <--amod-- version/NOUN/NN
version/NOUN/NN <--dobj-- got/VERB/VBN
sitting/VERB/VBG <--acl-- version/NOUN/NN
in/ADP/IN <--prep-- sitting/VERB/VBG
front/NOUN/NN <--pobj-- in/ADP/IN
of/ADP/IN <--prep-- front/NOUN/NN
me/PRON/PRP <--pobj-- of/ADP/IN
./PUNCT/. <--punct-- know/VERB/VB
this/DET/DT <--det-- version/NOUN/NN
every/DET/DT <--det-- version/NOUN/NN
s

In [5]:
def sentenceSentiment(text, region_name='us-west-2'):
    """
    Score the sentiment of `text` with the AWS Comprehend `detect_sentiment` call.

    text:        the English text to score.
    region_name: AWS region passed to the Comprehend client (default 'us-west-2').

    Returns a 5-tuple: (sentiment label, positive score, negative score,
    neutral score, mixed score), read from the 'Sentiment' and 'SentimentScore'
    fields of the response.
    """
    # Imported here because the notebook's import cell has `import boto3` commented out,
    # which made this function raise NameError on a fresh kernel.
    import boto3

    comprehend = boto3.client(service_name='comprehend', region_name=region_name)
    sentiment_json = comprehend.detect_sentiment(Text=text, LanguageCode='en')
    scores = sentiment_json['SentimentScore']

    return (
        sentiment_json['Sentiment'],
        scores['Positive'],
        scores['Negative'],
        scores['Neutral'],
        scores['Mixed'],
    )

In [8]:
# Try the sentiment helper on one statement; the result is
# (label, positive, negative, neutral, mixed).
sentenceSentiment('the pixel book is a nice device')

('POSITIVE',
 0.9993759989738464,
 9.10422095330432e-05,
 0.0005312334396876395,
 1.7470395050622756e-06)

In [31]:
# Print each extracted statement followed by its sentiment tuple.
# `statements` is a set, whose iteration order for strings changes between kernel
# restarts; iterating in sorted order keeps the output reproducible.
for statement in sorted(statements):
    print(statement)
    print(sentenceSentiment(statement))

the pixel book is a nice device
('POSITIVE', 0.9993759989738464, 9.10422095330432e-05, 0.0005312334396876395, 1.7470395050622756e-06)
thus far it is a basic looking clamshell laptop with a 13.3 inch
('NEUTRAL', 0.23179548978805542, 0.25214096903800964, 0.5160171985626221, 4.637035453924909e-05)
it is thicker
('NEGATIVE', 0.18020671606063843, 0.4425104260444641, 0.37719425559043884, 8.854686166159809e-05)
it is much heavier than the go for example
('NEGATIVE', 0.06437221169471741, 0.8682929873466492, 0.06680403649806976, 0.0005307704559527338)
it is not a convertible
('NEGATIVE', 0.004604209680110216, 0.8924250602722168, 0.10290084034204483, 6.981368642300367e-05)
the good news is the go as much slimmer
('POSITIVE', 0.9333986639976501, 0.007871556095778942, 0.05870736017823219, 2.2384479962056503e-05)
the go is almost as good
('POSITIVE', 0.9849099516868591, 0.001699002692475915, 0.013126572594046593, 0.0002644038468133658)
they are pretty quiet Google about the whole thing
('POSITIVE',

In [25]:
# Display the set of extracted statements.
statements

{'bad the little guy is 2,800 milliamp hours at the big one 3700 both 18 watt fast charging',
 'face unlock is enough',
 'it is a matte finish',
 'it is a real mission of',
 'it is a very practical utility more than',
 'it is also important to know',
 'it is glossy so on the black version',
 'it is hot off the presses so this video',
 'it is more volume',
 'it is not a glossy finish',
 'it is possibly not even final software number two',
 'just it is an unboxing video',
 'so it is a panda effect black',
 'so it is the for excel in the three different colors',
 'then the entire front is black',
 'then this is the aforementioned power brick with the quick charge capability',
 'they are cool with having a forehead a chin on the device not overwhelmingly slim bezels',
 'this is a fresh device',
 'this is a very simple look to',
 'this is fresh',
 'this is just a first - look type of video',
 'this is the small one',
 'this software is early'}