In [5]:
import pandas as pd
import numpy as np
import spacy
import csv
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokenizer import Tokenizer
from spacy.pipeline import EntityRuler
from spacy.attrs import *
from spacy.symbols import *
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori , association_rules


# Global Variables
# Input CSV of tweets; import_unprocessed_tweets reads its 'text' column.
infile = 'apple_cleantweets_test.csv'
# spaCy pipeline used by find_aspect_terms and post_clean.
nlp = spacy.load('en_core_web_sm')
# File of EntityRuler patterns loaded into `ruler` below.
patternsfile = "entityruler_patterns.jsonl"
# CSV that candidate aspects are written to and later re-read from.
output = 'possible_aspects.csv'
# Extend the language defaults' stop words with a few extra entries.
# NOTE(review): whether `is_stop` on vocabulary entries that were already
# loaded picks up these additions depends on the spaCy version - verify.
nlp.Defaults.stop_words |= {"My","the","it", "a"}
# Load the rule patterns from disk and insert the ruler ahead of the "ner"
# component in the pipeline.
# NOTE(review): passing a component instance to add_pipe looks like the
# spaCy v2 API; confirm against the installed spaCy version.
ruler = EntityRuler(nlp).from_disk(patternsfile)
nlp.add_pipe(ruler, before="ner")


# Import tweet file
def import_unprocessed_tweets(infile):
    """Read the tweet text column from a CSV file.

    Args:
        infile: Path to a CSV file whose header row includes a 'text'
            column.

    Returns:
        A list with one single-element list per CSV row,
        ``[[text], [text], ...]``, in file order.

    Raises:
        KeyError: If a row has no 'text' column.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, newlines embedded in quoted tweet
    # text can be altered while reading.
    with open(infile, 'r', newline='') as lines:
        all_tweets = [[row['text']] for row in csv.DictReader(lines)]
    print("Tweet import complete!")
    return all_tweets
    

# Find possible aspects using the noun_chunks provided by spaCy. Returns one list of noun-chunk strings per tweet; nothing is written to file here.
def find_aspect_terms(tweet_list):
    """Collect the noun chunks of every tweet as candidate aspect terms.

    tweet_list is a list of lists whose first element is the tweet text.
    The result is a list parallel to tweet_list; each entry holds the text
    of the noun chunks found in that tweet by the module-level `nlp`
    pipeline.
    """
    return [
        [span.text for span in nlp(entry[0]).noun_chunks]
        for entry in tweet_list
    ]
    

# Helper Function 1: True when the token is not a stop word, punctuation, or whitespace
def not_stop_word(token):
    """Return True when the token is neither a stop word, punctuation nor whitespace.

    Reads the boolean flags `is_stop`, `is_punct` and `is_space` of the
    token, in that order.
    """
    is_noise = token.is_stop or token.is_punct or token.is_space
    return not is_noise


# Helper Function 2: Returns the token's entity type so callers can skip entity tokens (post_clean keeps a token only when this equals 0)
def not_ent(token):
    """Return the token's `ent_type` attribute unchanged.

    post_clean compares the result with 0 to decide whether to keep the
    token.
    """
    return token.ent_type


# Helper Function 3: Finds tokens that have the stated dependency tags 
def dep_option(token):
    """Return True when the token's `dep_` label is nsubj, dobj, pobj or nsubjpass."""
    wanted_labels = ("nsubj", "dobj", "pobj", "nsubjpass")
    return token.dep_ in wanted_labels


# Create list from aspect file            
def import_aspect_list(inputfile):
    with open(inputfile, mode='r') as infile:
        reader = csv.reader(infile)
        aspect_list = []
        for row in reader:
            aspect_list.append(row)
    return aspect_list


# Cleanup function: Post aspect generation clean-up to ensure noun_chunks comply with criteria
def post_clean(possible_candidate_list):
    """Filter noun-chunk candidates down to short, content-only phrases.

    possible_candidate_list is a list of rows, each a list of candidate
    strings. Every candidate is re-parsed with the module-level `nlp`
    pipeline; a token is kept when remove_pos and not_stop_word accept it
    and not_ent returns 0. Candidates left with 1 to 4 tokens are re-joined
    with single spaces; all others are dropped.

    Returns a list parallel to the input, one list of cleaned phrases per
    row (possibly empty).
    """
    cleaned_rows = []
    for candidates in possible_candidate_list:
        kept_phrases = []
        for candidate in candidates:
            surviving = [
                tok.text
                for tok in nlp(candidate)
                if remove_pos(tok) and not_stop_word(tok) and not_ent(tok) == 0
            ]
            # Keep only non-empty phrases of at most four tokens.
            if 0 < len(surviving) < 5:
                kept_phrases.append(' '.join(surviving))
        cleaned_rows.append(kept_phrases)
    return cleaned_rows


# Helper function 4: Removes tokens with stated pos tags from possible aspect list
def remove_pos(token):
    """Return True when the token's `pos_` tag is not one of the excluded tags."""
    excluded_tags = (
        "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "PART", "VERB",
    )
    return token.pos_ not in excluded_tags



# Final list of possible aspect terms
def candidate_aspect_list(listname, outfile):
    """Append candidate aspect rows to a CSV file and return them.

    Args:
        listname: Iterable of rows, each a list of aspect strings.
        outfile: Path of the CSV file. It is opened in append mode, so any
            existing content is kept and the new rows are added after it.

    Returns:
        The list of rows that were written, in order.
    """
    aspects_clean = []
    # newline='' leaves line endings to the csv writer, as the csv
    # documentation requires for files handed to csv.writer.
    with open(outfile, "a", newline='') as csv_file:
        # One writer for the whole file rather than a new one per row.
        writer = csv.writer(csv_file, delimiter=',')
        for item in listname:
            print(item)
            writer.writerow(item)
            aspects_clean.append(item)
    # The rows are collected above so the function has a defined result to
    # return once writing is finished.
    return aspects_clean
 
    
# Create frequent aspect using apriori algorithm    
def frequent_itemsets(aspect_list, min_support=0.016):
    """Mine frequent itemsets from candidate aspects with Apriori.

    Args:
        aspect_list: List of transactions, each a list of aspect strings.
        min_support: Minimum support threshold passed to `apriori`.
            Defaults to 0.016, the value that used to be hard-coded.

    Returns:
        The result of mlxtend's `apriori` called with `use_colnames=True`.
    """
    encoder = TransactionEncoder()
    # Encode the transactions as a boolean item-presence matrix.
    onehot = encoder.fit(aspect_list).transform(aspect_list)
    transactions = pd.DataFrame(onehot, columns=encoder.columns_)
    return apriori(transactions, min_support=min_support, use_colnames=True)



    
# RUN CODE IN THIS ORDER
# Each step consumes the result of the previous one, so the statements below
# must stay in this sequence.

# 1. Load the tweet text from the input CSV (one single-element list per row).
all_tweets = import_unprocessed_tweets(infile)
#print(all_tweets)
# 2. Extract the noun chunks of each tweet as raw aspect candidates.
aspect_candidates = find_aspect_terms(all_tweets)
#print(aspect_candidates)
# 3. Filter the candidates by POS tag, stop words and entity type.
clean_list = post_clean(aspect_candidates)
#print(clean_list)
# 4. Write the cleaned candidates to `output`. The file is opened in append
#    mode, so re-running adds rows to whatever is already there.
final_list = candidate_aspect_list(clean_list, output)
#print(final_list)

# 5. Re-read the candidates from `output` and mine frequent itemsets.
aspects = import_aspect_list(output) 
#print(clean_aspects('testing_this_aspects.csv'))
fis_gen =frequent_itemsets(aspects)
print(fis_gen)




    support    itemsets
0  0.023529  (FaceTime)
1  0.029412    (camera)
2  0.023529     (issue)
3  0.017647     (music)
4  0.017647      (pair)
5  0.017647     (price)
6  0.017647   (support)
