In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [14]:
import nltk
import numpy as np
from itertools import groupby, chain
from collections import Counter, defaultdict
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
import re
#import boto3

class StatementExtracter(object):
    """
    As part of key word extraction and sentiment summarization, this class extracts
    short "<subject> is/are <description>" statements from a block of text.

    Parsing is done with a spaCy pipeline: by default the module-level `spacy_nlp`,
    or any callable with the same interface passed as `nlp`.
    """

    # Characters that always act as phrase breaks unless the caller overrides them.
    DEFAULT_PUNCTUATIONS = '!"#%&\'()*+,./:;<=>?@[\\]^_`{|}~♪'

    # In un-punctuated text, subtrees with this many tokens or more yield no description.
    MAX_SUBTREE_TOKENS = 30

    # Ordered (compiled regex, replacement) pairs. Order matters: the specific
    # forms ("won't", "can't", "ain't") must run before the generic "n't" rule,
    # otherwise e.g. "Won't" would be rewritten to "Wo not".
    _CONTRACTION_PATTERNS = [
        (re.compile(regex), repl) for regex, repl in [
            (r"won't", "will not"), (r"Won't", "Will not"),
            (r"can't", "can not"), (r"Can't", "Can not"),
            (r"i'm", "i am"), (r"\bI'm", "I am"),
            (r"ain't", "is not"),
            (r"(\w+)'ll", r"\g<1> will"), (r"(\w+)n't", r"\g<1> not"),
            (r"(\w+)'ve", r"\g<1> have"), (r"(\w+)'d", r"\g<1> would"),
            (r"&", "and"),
            # Word boundaries keep these from firing inside longer words ("wonton").
            (r"\bdammit\b", "damn it"), (r"\bdont\b", "do not"), (r"\bwont\b", "will not"),
            (r"they're", "they are"), (r"They're", "They are"),
            (r"it's", "it is"), (r"It's", "It is"),
        ]
    ]

    # Per-side (POS, tag, dependency) whitelists: a token matching none of the three
    # lists for a side becomes a phrase break on that side.
    _FILTERS_WITH_PUNCTUATION = {
        'left': (['ADJ','DET','NOUN','NUM','PART','PROPN','PUNCT','ADP','PRON','AUX','SYM','SCONJ'],
                 [],
                 []),
        'right': (['ADJ','DET','NOUN','NUM','PART','PROPN','ADV','PUNCT','ADP','AUX','SYM','SCONJ','CCONJ','PRON'],
                  ['VBG','VBZ','VBP','VB','VBD','VBN'],
                  []),
    }
    _FILTERS_WITHOUT_PUNCTUATION = {
        'left': (['NOUN'],
                 ['NNP','PRP','HYPH','JJ'],
                 ['advmod','amod','neg', 'nsubj']),
        'right': (['ADJ','DET','NOUN','NUM','PART','PROPN','ADV','ADP'],
                  ['VB','VBG','HYPH','VBN','UH'],
                  ['intj','prep','neg','pobj']),
    }

    def __init__(self, stopwords = None, punctuations = None, nlp = None):
        """
        stopwords:    iterable of words that always break phrases (default: none).
        punctuations: iterable of characters that always break phrases
                      (default: DEFAULT_PUNCTUATIONS).
        nlp:          callable turning text into a spaCy-like doc; when None the
                      module-level `spacy_nlp` is looked up at parse time.
        """
        self.stopwords = [] if stopwords is None else stopwords
        self.punctuations = list(self.DEFAULT_PUNCTUATIONS) if punctuations is None else punctuations
        self.nlp = nlp
        # Union of sets rather than list concatenation, so any iterables are accepted.
        self.phrase_breaks = set(self.stopwords) | set(self.punctuations)

    def _parse(self, text):
        """Run the configured pipeline (or the module-level `spacy_nlp`) on `text`."""
        nlp = self.nlp if self.nlp is not None else spacy_nlp
        return nlp(text)

    def _prepare_text(self, text, has_punctuation):
        """Add sentence periods when the text has none, then expand contractions."""
        if not has_punctuation:
            text = self.add_periods(text)
        return self.replace_contraction(text)

    def replace_contraction(self, text):
        """
        Takes in text and replaces certain contractions (and '&') with their
        expanded forms. Returns the rewritten text.
        """
        for pattern, repl in self._CONTRACTION_PATTERNS:
            text = pattern.sub(repl, text)
        return text

    @staticmethod
    def _merge_single_word_sentences(sentence_texts):
        """
        Fold every one-word "sentence" into a neighbour: onto the end of the
        preceding sentence, or onto the front of the first multi-word sentence
        when nothing precedes it. No word is dropped.
        """
        merged = []
        leading = []  # one-word sentences seen before any multi-word sentence
        for sentence in sentence_texts:
            if " " in sentence:
                merged.append(" ".join(leading + [sentence]))
                leading = []
            elif merged:
                merged[-1] = merged[-1] + " " + sentence
            else:
                leading.append(sentence)
        if leading:
            # The text consisted solely of one-word sentences.
            merged.append(" ".join(leading))
        return merged

    def add_periods(self, text):
        """
        Takes in a string of text with no punctuation, uses the pipeline's sentence
        segmentation to split it, and returns the text with a period appended to
        each sentence. One-word sentences are merged into a neighbouring sentence
        instead of standing alone.
        """
        doc = self._parse(text)
        sentence_texts = [sent.text for sent in doc.sents]
        merged = self._merge_single_word_sentences(sentence_texts)
        return " ".join(sentence + '.' for sentence in merged)

    def spacy_POS_phrase_breaks(self, doc, POS_we_want, tag_we_want, dep_we_want):
        """
        Inputs an iterable of tokens plus the POS, tag and dependency labels we want
        to keep. Returns a new set holding the instance's phrase breaks plus the
        lower-cased text of every token that matches none of the three lists.
        """
        # Start from a copy so the instance-level phrase breaks are never mutated.
        temp_phrase_breaks = self.phrase_breaks.copy()
        temp_phrase_breaks.update(
            token.text.lower() for token in doc
            if token.pos_ not in POS_we_want
            and token.tag_ not in tag_we_want
            and token.dep_ not in dep_we_want
        )
        return temp_phrase_breaks

    def visualize_POS(self, text, punctuation = False):
        """
        Print text/POS/tag, dependency label and head text/POS/tag for each token.
        Set `punctuation` to True when the text already contains punctuation;
        otherwise periods are added first.
        """
        doc = self._parse(self._prepare_text(text, punctuation))
        for token in doc:
            print("{0}/{1}/{2} <--{3}-- {4}/{5}/{6}".format(
                    token.text,token.pos_,token.tag_,token.dep_,token.head.text,token.head.pos_,token.head.tag_))

    def is_statements(self, text, verbose = True):
        """
        Extract statements from raw text. The text is treated as punctuated when it
        contains ", "; otherwise periods are added before extraction.
        `verbose` is forwarded to statementExtraction. Returns a list of strings.
        """
        has_punctuation = ", " in text
        prepared = self._prepare_text(text, has_punctuation)
        return self.statementExtraction(prepared, withPunctuation = has_punctuation, verbose = verbose)

    @staticmethod
    def _split_phrases(words, phrase_breaks):
        """Return the maximal runs of consecutive words that are not phrase breaks, as tuples."""
        grouped = groupby(words, lambda word: word.text.lower() not in phrase_breaks)
        return [tuple(run) for keep, run in grouped if keep]

    @staticmethod
    def _select_subject(phrase_tuples, strong_pos, candidate_pos = None):
        """
        Pick the phrase that holds the subject. A word is a candidate when its
        dependency is 'nsubj' or 'expl' (and, if `candidate_pos` is given, its POS is
        in it). A later candidate replaces the current one unless the current one
        has a POS in `strong_pos` and the later one does not. Returns () if none.
        """
        subject = ()
        subject_word = None
        for phrase in phrase_tuples:
            for word in phrase:
                if word.dep_ not in ('nsubj', 'expl'):
                    continue
                if candidate_pos is not None and word.pos_ not in candidate_pos:
                    continue
                if (subject_word is not None
                        and subject_word.pos_ in strong_pos
                        and word.pos_ not in strong_pos):
                    # Keep the stronger existing subject; try the next word.
                    continue
                subject_word = word
                subject = phrase
                break
        return subject

    def statementExtraction(self, text, withPunctuation = False, verbose = True):
        """
        Parse `text` and build a "<subject> is/are <description>" string for every
        token whose text is exactly 'is' or 'are', using that token's subtree.

        withPunctuation: selects the filter lists and the subject/description rules.
        verbose:         when True (default), print each subtree and each
                         extracted statement.
        Returns the unique statements in order of first appearance.
        """
        doc = self._parse(text)
        filters = self._FILTERS_WITH_PUNCTUATION if withPunctuation else self._FILTERS_WITHOUT_PUNCTUATION
        statements = list()
        seen = set()
        for token in doc:
            # Only "is" / "are" anchor a statement.
            if token.text not in ('is', 'are'):
                continue

            subtree = list(token.subtree)
            # Split the subtree into the words before and after the anchor.
            left_subtree = [word for word in subtree if word.i < token.i]
            right_subtree = [word for word in subtree if word.i > token.i]

            # Break words are computed over the whole subtree, separately per side.
            left_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, *filters['left'])
            right_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, *filters['right'])
            left_phrase_tuples = self._split_phrases(left_subtree, left_phrase_breaks)
            right_phrase_tuples = self._split_phrases(right_subtree, right_phrase_breaks)

            description = ()
            if withPunctuation:
                subject = self._select_subject(left_phrase_tuples, strong_pos = ('NOUN',))
                # Take the first phrase on the right as the description.
                if right_phrase_tuples:
                    description = right_phrase_tuples[0]
            else:
                subject = self._select_subject(left_phrase_tuples,
                                               strong_pos = ('NOUN', 'DET'),
                                               candidate_pos = ('NOUN', 'DET', 'PRON'))
                # Without punctuation, require a reasonably small subtree and an
                # adjective in the first right-hand phrase.
                if (right_phrase_tuples
                        and len(subtree) < self.MAX_SUBTREE_TOKENS
                        and any(word.pos_ == "ADJ" for word in right_phrase_tuples[0])):
                    description = right_phrase_tuples[0]

            if verbose:
                print(" ".join([word.text for word in subtree]))
            if len(subject) > 0 and len(description) > 0:
                output = " ".join([word.text for word in subject]) + " " + token.text + " " + " ".join([word.text for word in description])
                if verbose:
                    print(output+"\n")
                if output not in seen:
                    seen.add(output)
                    statements.append(output)
        return statements

In [5]:
import pandas as pd
# Read the example CSV (path is relative to the notebook's working directory)
# and keep its `captions` column as a plain Python list, one entry per row.
df = pd.read_csv("ExampleTexts/videoExamples.csv")
examples = list(df.captions)

In [6]:
# Extractor with the defaults: no stopwords and the built-in punctuation list.
se = StatementExtracter()

In [7]:
# Extract "is/are" statements from the first example; the call also prints each
# parsed subtree and each extracted statement as it goes.
statements = se.is_statements(examples[0])

what is up guys mkbhd here
what is up guys mkbhd here this is the pixel for excel .
this is one of the phones I was looking forward to the most this entire year for 2019 for a couple of reasons .
this is my honest review .
this is honest review

it is a pretty decent looking phone
it is a pretty decent looking phone

it is low - key really clean .
it is low - key really clean

just the Google G at the bottom matte black aluminum rails as part of the design all the way around the phone speaker slots at the bottom lined up with the USB type - c port and the colored power button of course on every version and the whole back of the phone on two of the colors is this soft touch matte finish which does a great job of not showing fingerprints it does not seem to scratch very easily at all so of the three colors .
this Panda version is the best one that power buttons pretty sweet .
Panda version is the best one

oh so orange is in second place .
but at least it is matte and then that jet - bla

In [4]:
# Debug aid: print POS / tag / dependency information for every token of the first example.
# NOTE: needs the cells that define `se` and `examples` to have run first; the recorded
# NameError comes from executing this cell before `se` existed.
se.visualize_POS(examples[0])

NameError: name 'se' is not defined

In [5]:
def sentenceSentiment(text, client=None, region_name='us-west-2'):
    """
    Score the sentiment of `text` with the `detect_sentiment` call of an
    AWS Comprehend client (English).

    text:        the text to score.
    client:      optional object exposing `detect_sentiment(Text=..., LanguageCode=...)`;
                 pass one to reuse a client across calls. When None, a boto3
                 Comprehend client is created for `region_name`.
    region_name: AWS region used only when a client has to be created.

    Returns a tuple (sentiment, positive, negative, neutral, mixed): the
    'Sentiment' value followed by the four 'SentimentScore' entries of the response.
    """
    if client is None:
        # Imported here because the notebook's top-level `import boto3` is commented
        # out, which otherwise makes this function fail with NameError.
        import boto3
        client = boto3.client(service_name='comprehend', region_name=region_name)
    sentiment_json = client.detect_sentiment(Text=text, LanguageCode='en')
    scores = sentiment_json['SentimentScore']

    return (sentiment_json['Sentiment'],
            scores['Positive'],
            scores['Negative'],
            scores['Neutral'],
            scores['Mixed'])

In [9]:
# Print every statement whose neutral score (4th element of the sentiment tuple)
# is above 0.5, followed by its full sentiment tuple.
for statement in statements:
    sentiment = sentenceSentiment(statement)
    neutral_score = sentiment[3]
    if not neutral_score > 0.5:
        continue
    print(statement)
    print(sentiment)

the iPhone 11 is the phone most people in the iOS ecosystem
('NEUTRAL', 0.20601189136505127, 0.011957553215324879, 0.782002866268158, 2.7693422453012317e-05)
this is the Pro review
('NEUTRAL', 0.01755661517381668, 0.001429559662938118, 0.9810057878494263, 8.045630238484591e-06)
The iPhone 11 is the reverse
('NEUTRAL', 0.058414336293935776, 0.0018085332121700048, 0.9397744536399841, 2.7406601930124452e-06)
There are three cameras on the iPhone 11
('NEUTRAL', 0.010222614742815495, 0.0004610633768606931, 0.9893151521682739, 1.121065338338667e-06)
This improvement is due to something Apple 's calling semantic rendering
('NEUTRAL', 0.4001956284046173, 0.027828281745314598, 0.5719702243804932, 5.85056523050298e-06)
This is all basically the same as the iPhone XS and the Pixel 3
('NEUTRAL', 0.015904804691672325, 0.08045540004968643, 0.903616189956665, 2.3550483092549257e-05)
The improvements to Smart HDR are applicable across cameras
('NEUTRAL', 0.47931650280952454, 0.00026124357827939093, 0.

In [13]:
# NOTE(review): the recorded output shows this import raising ImportError, which
# stops the rest of the cell from running; confirm `usefulnessScore` exists in Evaluation.py.
from Evaluation import usefulnessScore


def read_lines(path):
    """Return the lines of the text file at `path` (line endings kept); the file is closed even on error."""
    with open(path) as handle:
        return handle.readlines()


# One evaluation text per product, in the order used for `evaluation`.
EVALUATION_NAMES = ["mkbhdPixel", "unboxPixel", "Chromebook", "GoPro", "Airpods", "iphone11"]
evaluation = [read_lines("EvaluationText/{}.txt".format(name)) for name in EVALUATION_NAMES]
# Keep the original single-letter names available for any later cells that use them.
a, b, c, d, e, f = evaluation

ImportError: cannot import name 'usefulnessScore' from 'Evaluation' (/Users/acyang@us.ibm.com/Desktop/w210/capstone/Adam-Stuff/Models/Evaluation.py)

In [15]:
from Evaluation import usefulnessScore

ImportError: cannot import name 'usefulnessScore' from 'Evaluation' (/Users/acyang@us.ibm.com/Desktop/w210/capstone/Adam-Stuff/Models/Evaluation.py)