In [36]:
import pandas as pd
import numpy as np
import spacy
import itertools
import csv
import re
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokenizer import Tokenizer
from spacy.pipeline import EntityRuler
from spacy.attrs import *
from spacy.symbols import *
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori , association_rules
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared VADER analyser used by the sentiment functions below
analyser = SentimentIntensityAnalyzer()

# Global Variables
# Path of the tweet CSV; the import function reads its 'text' column
infile = 'apple_cleantweets_test.csv'
nlp = spacy.load('en_core_web_sm')
# JSONL file with the entity patterns loaded into the ruler below
patternsfile = "entityruler_patterns.jsonl"
# NOTE(review): 'output' is not referenced by any code visible in this cell
output = 'possible_aspects.csv'
# Add a few extra entries to the pipeline's default stop word set
nlp.Defaults.stop_words |= {"My","the","it", "a"}
# Load the custom entity patterns from disk and insert the ruler ahead of
# the "ner" component in the pipeline
ruler = EntityRuler(nlp).from_disk(patternsfile)
nlp.add_pipe(ruler, before="ner")

# Placeholder: every aspect key needs some dictionary value
ASP = "found"


# Temporary aspect dictionary until we run data to find frequent item sets.
# Every term maps to the same placeholder value; only key membership is used
# by the functions in this file.
ASPECTS = {
    term: ASP
    for term in (
        "battery", "battery life",
        "screen", "screen size",
        "picture", "picture quality",
        "photo", "photos", "photo quality",
        "camera",
        "voice", "voice quality",
        "sound", "sound quality",
        "software", "update",
        "cost", "price",
        "safety", "security",
        "speed", "home button",
        "interface",
        "support", "customer support",
    )
}



# Import tweet file
def import_unprocessed_tweets(infile):
    """Read the 'text' column of a CSV file into a list of one-item lists.

    Args:
        infile: Path of a CSV file whose header row contains a 'text' column.

    Returns:
        A list with one entry per CSV row; each entry is a single-element
        list holding that row's tweet text.

    Raises:
        KeyError: If the file has rows but no 'text' column.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted tweet text are read back unchanged instead of being
    # rewritten by text-mode newline translation.
    with open(infile, 'r', newline='') as lines:
        reader = csv.DictReader(lines)
        all_tweets = [[line['text']] for line in reader]
    print("Tweet import complete!")
    return all_tweets
    
    
# Helper function 1: Find noun phrase bigrams and/or trigrams in doc depending on token index
def noun_phrase_utility(doc, i):
    """Build the lowercase bigrams and trigrams that start or end at token i.

    Only n-grams with token ``i`` at their first or last position are
    produced (a trigram with token ``i`` in the middle is not). N-grams that
    would reach past either end of the doc are skipped, so docs of one or
    two tokens are handled without raising or wrapping around.

    Args:
        doc: Iterable of tokens exposing a ``lower_`` string attribute.
        i: Index of the anchor token; expected to be a valid, non-negative
            index into ``doc``.

    Returns:
        List of space-joined phrases in the order: preceding bigram,
        preceding trigram, following bigram, following trigram.
    """
    words = [token.lower_ for token in doc]
    last = len(words) - 1
    phrases = []

    # N-grams ending at token i
    if i >= 1:
        phrases.append("{0} {1}".format(words[i - 1], words[i]))
    if i >= 2:
        phrases.append("{0} {1} {2}".format(words[i - 2], words[i - 1],
                                            words[i]))

    # N-grams starting at token i
    if i + 1 <= last:
        phrases.append("{0} {1}".format(words[i], words[i + 1]))
    if i + 2 <= last:
        phrases.append("{0} {1} {2}".format(words[i], words[i + 1],
                                            words[i + 2]))

    return phrases


# Helper function 2: Will retokenize doc to remove stop words, punctuation and spaces
def re_tokenize(doc):
    """Return a new doc built from the tokens that carry content.

    Tokens flagged as stop words, punctuation or whitespace are dropped; the
    remaining token texts are joined with single spaces and run through the
    module-level ``nlp`` pipeline again.

    Args:
        doc: A parsed doc whose tokens expose ``is_stop``, ``is_punct``,
            ``is_space`` and ``text``.

    Returns:
        The doc produced by ``nlp`` for the filtered text.
    """
    kept_words = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return nlp(' '.join(kept_words))



# Helper Function 3: Find unique multiword aspects, return aspect list
def multi_word_aspects(new_doc):
    """Collect the unique n-grams surrounding every aspect noun in the doc.

    For each token tagged NOUN whose lemma is a key of ``ASPECTS``, the
    phrases returned by ``noun_phrase_utility`` for that token are gathered.
    The phrases themselves are not checked against ``ASPECTS``.

    Args:
        new_doc: A (re-tokenized) doc whose tokens expose ``pos_``,
            ``lemma_`` and ``i``.

    Returns:
        List of unique phrases (order follows set iteration).
    """
    phrase_pool = set()
    for token in new_doc:
        if token.pos_ != "NOUN":
            continue
        if token.lemma_ not in ASPECTS:
            continue
        phrase_pool.update(noun_phrase_utility(new_doc, token.i))

    return list(phrase_pool)


# Helper Function 4: Find unique single word aspects, return aspect list
def single_word_aspects(new_doc):
    """Collect the unique lemmas of NOUN tokens that are keys of ``ASPECTS``.

    Args:
        new_doc: A (re-tokenized) doc whose tokens expose ``pos_`` and
            ``lemma_``.

    Returns:
        List of unique aspect lemmas (order follows set iteration).
    """
    lemmas = {
        token.lemma_
        for token in new_doc
        if token.pos_ == "NOUN" and token.lemma_ in ASPECTS
    }
    return list(lemmas)


# FUNCTION NOT USED
def contains_word(s, w):
    """Tell whether ``w`` occurs in ``s`` delimited by spaces or string ends.

    Both values are padded with one space on each side before the substring
    test, so ``w`` only matches on space boundaries.
    """
    padded_text = " {} ".format(s)
    padded_word = " {} ".format(w)
    return padded_word in padded_text


# Helper Function 5: Delete words that are found to be duplicated across single and multiword aspect lists
# Used in merge_aspects
def delete_words(deletelist, mainlist):
    """Return the items of ``mainlist`` that do not appear in ``deletelist``.

    Args:
        deletelist: Values to drop.
        mainlist: Values to filter; it is not modified.

    Returns:
        A new list with the surviving items in their original order.
    """
    survivors = []
    for candidate in mainlist:
        if candidate in deletelist:
            continue
        survivors.append(candidate)
    return survivors


# Helper Function 6: Merges single and multiword aspect lists after duplicates are removed
# Used in aspect_sentiment
def merge_aspects(single_aspects, multi_aspects):
    """Combine both aspect lists, preferring multi-word phrases on overlap.

    A single-word aspect is dropped when the same word occurs inside any of
    the multi-word phrases (phrases are split on single spaces).

    Args:
        single_aspects: List of one-word aspect strings.
        multi_aspects: List of space-separated multi-word aspect strings.

    Returns:
        The remaining single-word aspects followed by all multi-word aspects.
    """
    # Every individual word that is already covered by some phrase
    covered_words = [
        word
        for phrase in multi_aspects
        for word in phrase.split(' ')
    ]

    # Single-word aspects that a phrase already accounts for
    redundant = [aspect for aspect in single_aspects if aspect in covered_words]

    remaining_singles = delete_words(redundant, single_aspects)
    return remaining_singles + multi_aspects
    

# FUNCTION NOT USED
def find_root_verb(doc):
    """Return the tokens of ``doc`` whose dependency label is "ROOT".

    Args:
        doc: Iterable of tokens exposing a ``dep_`` attribute.

    Returns:
        List of matching tokens, in doc order.
    """
    roots = []
    for token in doc:
        if token.dep_ == "ROOT":
            roots.append(token)
    return roots

# FUNCTION NOT USED
def find_sentiment_words(doc):
    """Collect the adjective, verb and adverb tokens of ``doc``.

    Stop words and punctuation are skipped.

    Args:
        doc: Iterable of tokens exposing ``is_stop``, ``is_punct`` and
            ``pos_``.

    Returns:
        A one-element list whose single item is the list of matching tokens.
    """
    opinion_pos = ("ADJ", "VERB", "ADV")
    carriers = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        if token.pos_ in opinion_pos:
            carriers.append(token)
    # The result is wrapped in an outer list, matching the established shape
    return [carriers]



# Helper Function 7: Finds opinions related to aspect terms in tweet
def opinion_rules(doc):
    """Pair the subject phrase of a parsed span with its object or complement.

    The subject is the last ``nsubj`` token, prefixed by its ``compound``
    children if it has any, otherwise by its ``amod`` children. The opinion
    part comes from the children of a ``ROOT`` token: a ``dobj`` child
    (prefixed by its ``amod`` children) or an ``acomp`` child; the last one
    seen wins.

    Args:
        doc: Iterable of tokens exposing ``dep_``, ``text`` and ``children``.

    Returns:
        ``[[subject, opinion]]`` when both parts were found, otherwise ``[]``.
    """
    # None (not a text sentinel) marks "not found", so no real token text
    # can be mistaken for a missing part.
    subj = None
    obj = None
    for token in doc:
        if token.dep_ == "nsubj":
            compound = [child.text for child in token.children
                        if child.dep_ == "compound"]
            modifier = [child.text for child in token.children
                        if child.dep_ == "amod"]
            # Join with spaces so several children stay separate words
            if compound:
                subj = " ".join(compound) + " " + token.text
            elif modifier:
                subj = " ".join(modifier) + " " + token.text
            else:
                subj = token.text

        # Exact comparison: `in ("ROOT")` would be a substring test on a str
        if token.dep_ == "ROOT":
            for child in token.children:
                if child.dep_ == "dobj":
                    modifier = [c.text for c in child.children
                                if c.dep_ == "amod"]
                    if modifier:
                        obj = " ".join(modifier) + " " + child.text
                    else:
                        obj = child.text
                elif child.dep_ == "acomp":
                    obj = child.text
                # NOTE(review): "neg" children of the root are not
                # incorporated into the pair (the value was never used).

    if subj is not None and obj is not None:
        return [[subj, obj]]
    return []


# Function will return list containing aspect and opinion associated with it if there is one
def opinion_finder(doc):
    """Return the opinion phrases of ``doc`` that mention an aspect word.

    Each sentence of ``doc.sents`` is passed to ``opinion_rules``; every
    resulting pair is joined into one space-separated phrase. A phrase is
    kept when at least one of its lowercased words is a key of ``ASPECTS``.

    Each qualifying phrase is added exactly once, even when several of its
    words are aspects, so it is not counted repeatedly when the caller
    averages scores.

    Args:
        doc: A parsed doc exposing ``sents``.

    Returns:
        List of phrase strings, in sentence order.
    """
    op_pair = []
    for sent in doc.sents:
        for pair in opinion_rules(sent):
            words = ' '.join(pair)
            if any(word.lower() in ASPECTS for word in words.split()):
                op_pair.append(words)

    return op_pair
    

# Function identifies aspects in doc (according to aspects in dictionary) and returns sentiment score
def aspect_sentiment(doc):
    """Score the opinions attached to the aspects found in ``doc``.

    The doc is re-tokenized (stop words, punctuation and spaces removed) and
    searched for multi-word and single-word aspects. When any aspect
    remains after merging, each phrase from ``opinion_finder`` is scored
    with the VADER analyser and its 'compound' value collected.

    Args:
        doc: A parsed doc for one tweet.

    Returns:
        The single compound score, the mean of several scores, or ``0`` when
        no opinion phrase was scored.
    """
    stripped_doc = re_tokenize(doc)
    phrase_aspects = multi_word_aspects(stripped_doc)
    word_aspects = single_word_aspects(stripped_doc)

    # On overlap only the multi-word aspect is kept
    found_aspects = merge_aspects(word_aspects, phrase_aspects)

    compounds = []
    if found_aspects:
        compounds = [
            analyser.polarity_scores(opinion)['compound']
            for opinion in opinion_finder(doc)
        ]

    if not compounds:
        return 0
    if len(compounds) == 1:
        return compounds[0]

    running_total = 0
    for value in compounds:
        running_total = running_total + value
    return running_total / len(compounds)
    

# Function finds sentiment score for tweet based on its aspects or the sentence as a whole
def tweet_sentiment(doc):
    """Return the aspect-based score of ``doc``, or the whole-text score.

    ``aspect_sentiment`` yields 0 both when no aspect opinion was scored and
    when the aspect opinions came out neutral; in either case the score
    falls back to the VADER compound score of the full tweet text.

    Args:
        doc: A parsed doc for one tweet, exposing ``text``.

    Returns:
        The non-zero aspect score, otherwise the compound score of
        ``doc.text``.
    """
    aspect_score = aspect_sentiment(doc)
    if aspect_score == 0:
        # No usable aspect sentiment: default to sentence-level sentiment
        return analyser.polarity_scores(doc.text)['compound']
    return aspect_score
            
 
        
# Prints each tweet with its sentiment score and its entities
# Need to add aspects found on here
def senti_process(tweets):
    """Parse every tweet and print it with its score and selected entities.

    Args:
        tweets: Iterable of sequences whose first item is the tweet text.

    For each tweet the printed line holds the parsed doc, the value from
    ``tweet_sentiment`` and a list of ``[token text, entity type]`` pairs for
    tokens typed ORG, PRODUCT, SERVICE or SOFTWARE. Nothing is returned.
    """
    wanted_types = ("ORG", "PRODUCT", "SERVICE", "SOFTWARE")
    for tweet in tweets:
        doc = nlp(tweet[0])
        doc_ents = [
            [token.text, token.ent_type_]
            for token in doc
            if token.ent_type_ in wanted_types
        ]

        score = tweet_sentiment(doc)

        print(doc, score, doc_ents)
    

    
    
# Run the pipeline: load the tweet texts from 'infile', then print every
# tweet together with its sentiment score and entities
all_tweets = import_unprocessed_tweets(infile)
senti_process(all_tweets)

#TESTING SENTENCES
#The screen size is horrible. At least the sound quality is okay.
#The camera does such a bad job of taking pictures! But the battery life is good
#The update does not fix the issues with the pictures
#The camera does such a bad job of taking pictures! My Cannon does far better
#doc = nlp("The camera does such a bad job of taking pictures! But the battery life is good")
#score = analyser.polarity_scores("camera bad job")
#print(score)



Tweet import complete!
It makes me chuckle when articles claim that the versa can compete with the Watch. I've had both and the Versa is like 15 years behind in every aspect. 0.6369 [['Versa', 'ORG']]
This was pretty cool! Thank you for having me. And thank you to everyone who came !! Had so much fun 0.9319 []
I'm pretty sure I just discovered that predicted the Apple Pencil in 2013? 0.6705 [['Apple', 'ORG']]
Court rules man must be given access to husband's iCloud photos  0.0 [['iCloud', 'SERVICE']]
Apple over ear headphones may launch this year to complement Air Pods and give Bose competition. 0.0 [['Apple', 'ORG'], ['Air', 'ORG'], ['Pods', 'ORG']]
Hey Apple it's funny 0.4404 [['Apple', 'ORG']]
Ha it already broke FUCK YOU Apple -0.6841 [['Apple', 'ORG']]
Apple shares rally 0.296 [['Apple', 'ORG']]
iPhone 11 concept hype continues as latest video offers more realistic vision of triple camera unit 0.3134 [['iPhone', 'PRODUCT']]
First time traveling without a headphone jack. Best case 

Good time long AAPL ; before earnings next Tuesday. Current 158. 0.4404 [['AAPL', 'ORG']]
Who would like to Win; an ipadpro;? 1. Like our page 2. Retweet this post 3. Type 'done' in the comments. giveaway; competition; contest; free; prize; iphone; apple; gadgets; style; 0.9371 [['Type', 'ORG'], ['iphone', 'PRODUCT'], ['apple', 'ORG']]
Price: $27.10 FREE Shipping casedeals;iphonecase;smartphonecases;samsungcases;xiaomicases;apple;huaweicase;cases; 0.6166 []
Commissioned by A Great Day In Accra Series Shot Oni Phone; Ghana; Music; Africa; Hiplife; Apple; AG reat Day In Accra; 0.6249 [['Apple', 'ORG'], ['AG', 'ORG']]
BRAND NEW RELEASE Dinosaur Safari: Evolution Mac Version Visit Mac App Store now to download for FREE Click Copy below link for FREE download dinosaur; dino; jurassic; jurassicpark; jurassicworld; assassin; mac; iphone; ipad; hunt; world; 0.8429 [['Mac', 'PRODUCT'], ['Mac', 'PRODUCT'], ['FREE', 'ORG'], ['mac', 'PRODUCT'], ['iphone', 'PRODUCT'], ['ipad', 'PRODUCT']]
Mmm.... I

All Android phones are good but IOS sucks -0.3182 [['Android', 'SOFTWARE'], ['IOS', 'SOFTWARE']]
Excuse my funky ass iPhone but here is a snippet of DeAnt Amazing Prod By. I'm sick as well so this won't be released until I'm better I'm not recording a song sick. It's impossible. My ass already got asthma -0.6428 [['iPhone', 'PRODUCT'], ['DeAnt', 'ORG'], ['Amazing', 'ORG'], ['Prod', 'ORG']]
I need the battery shortcut on the control panel please 0.3182 []
I got rid of my i Phone 6 because when you update and a new phone is launched the updates mess with ur phone it freezes -0.2732 []
Hey 0.0 []
My phone can't connect to my service wtf thx -0.3182 []
I absolutely LOVE these FUNCL AI wireless smart headphones. Battery life for days... literally superior sound quality. If you are considering Air Pods or any ear buds get these. Half the price of Air Pods and better specs. I just ordered another pair in black! 0.9395 [['Air', 'ORG'], ['Pods', 'ORG'], ['Air', 'ORG'], ['Pods', 'ORG']]
My iPad 