In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TransformerMixin

import os
from collections import defaultdict
import pickle
import spacy
import pandas as pd
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
from pprint import pprint as pprint

class CTFIDFVectorizer(TransformerMixin):
    """Class-based TF-IDF (c-TF-IDF) weighting of a per-class count matrix.

    ``fit`` learns one global weight per term from a count matrix,
    ``transform`` applies those weights and normalises every row, and
    ``extract_relevant_features`` returns the highest-weighted feature
    indices of every row.

    Parameters
    ----------
    norm : str, default='l1'
        Row normalisation forwarded to ``normalize`` in ``transform``.
    *args, **kwargs
        Accepted and ignored.
    """

    def __init__(self, norm='l1', *args, **kwargs):
        self.norm = norm

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights).

        Parameters
        ----------
        X : sparse matrix of shape (n_classes, n_features)
            Term counts, one row per class.
        n_samples : int
            Numerator of the idf ratio ``log(n_samples / term_count)``.

        Returns
        -------
        self
        """
        _, n_features = X.shape
        # ravel() always yields a 1-D vector; squeeze() would collapse a
        # single-feature matrix to a 0-d array and break len() below.
        df = np.asarray(X.sum(axis=0)).ravel()

        # Terms with a zero total count get weight 0 instead of
        # log(n_samples / 0) = inf, which would turn into NaN once such a
        # column is multiplied and normalised in transform().
        idf = np.zeros(n_features, dtype=float)
        seen = df > 0
        idf[seen] = np.log(n_samples / df[seen])

        assert len(idf) == n_features
        self.idf = idf

        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """Transform a count-based matrix to c-TF-IDF.

        Every column is scaled by its learned idf weight and every row is
        then normalised with ``self.norm``.
        """
        X = X.multiply(self.idf)
        X = normalize(X, axis=1, norm=self.norm, copy=False)
        return X

    def extract_relevant_features(self, X: sp.csr_matrix, n_top:int =10):
        """Return the indices of the ``n_top`` highest-weighted features per row.

        Assumes X has shape (n_classes, n_features).

        The indices of each row are ordered by ascending weight, so the
        strongest feature of a class is the last entry of its row.
        ``n_top <= 0`` yields an array of shape (n_classes, 0).

        Note: every row is densified one at a time; for a very large
        vocabulary this can still be memory hungry.
        """
        Xtfidf = self.transform(X)
        n_labels = Xtfidf.shape[0]

        # A slice of the form [-0:] would return *all* features, so the
        # "no features requested" case is handled explicitly.
        if n_top <= 0:
            return np.empty((n_labels, 0), dtype=np.intp)

        feats_per_class = [
            Xtfidf[label].toarray().ravel().argsort()[-n_top:]
            for label in range(n_labels)
        ]
        return np.array(feats_per_class)

In [3]:
# All data locations live under the current user's home directory.
# NOTE(review): the first path uses 'rfs-data' while the others use 'rfm-data' - confirm this is intended.
model_artifact_path = os.path.join(os.path.expanduser('~'), 'rfs-data/RFM_demo_tagging_big_dataset/')
text_vectorizer = os.path.join(os.path.expanduser('~'), 'rfm-data/count_vectorizer/')
parquet_path = os.path.join(os.path.expanduser('~'), 'rfm-data/full-reviews-cml-asins-with-PT-ITK/full_df.parquet')

# Read only the four columns used below and store the product type as a categorical.
df = pd.read_parquet(
    parquet_path,
    columns=['asin', 'review_title', 'product_type', 'text_fragment'],
).astype({"product_type": "category"})

# Number of distinct asins, followed by the (rows, columns) shape of the frame.
print(df["asin"].unique().size, df.shape)

102100 (25976194, 4)


In [4]:
df_path = os.path.join(os.path.expanduser('~'), 'rfm-data/full-reviews-cml-asins-with-PT-ITK/df_asin_reviewtitle_pt_review.csv')

# Placeholder for a CSV cache of `df`: both branches are currently no-ops,
# the intended calls are kept as comments.
if not os.path.exists(df_path):
    #df.to_csv(df_path)
    pass
else:
    #pd.read_csv(df_path)
    pass

In [5]:
# Keep only the first `top_N` rows so the following cells run in reasonable time.
top_N = 2_000_000
df = df.iloc[:top_N]

In [6]:
df.review_title.iloc[6]

'Bought for class but absolutely loved it!'

We can sort the columns of the selected n-grams from best to worst, according to the amount of data we have for each product type (PT).

In [7]:
# Product types ordered from the most to the least frequent in `df`.
values, counts = np.unique(df.product_type, return_counts=True)
ids_sorted_high_to_low = np.flip(np.argsort(counts))
soted_cols = np.take(values, ids_sorted_high_to_low)

# Retrieving relevant ngrams from reviews

In [8]:
n_top = 20

In [9]:
# Build one long document per product type by joining all of its text fragments.
docs = pd.DataFrame({'Document': df.text_fragment[0:top_N], 'Class': df.product_type[0:top_N]})
docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

# Drop the classes whose joined document is the empty string.
docs_per_class['nonempty'] = docs_per_class.Document.map(len) != 0
docs_per_class = docs_per_class.loc[docs_per_class['nonempty'], ['Class', 'Document']]
target_names = docs_per_class.Class.values

In [10]:
target_names

['ABIS_BOOK', 'ABIS_KITCHEN', 'ABIS_MUSIC', 'ADVENT_CALENDAR', 'AIR_GUN', ..., 'WHISK_UTENSIL', 'WILDLIFE_FEEDER', 'WRITING_BOARD', 'WRITING_INSTRUMENT', 'YO_YO']
Length: 188
Categories (980, object): ['ABDOMINAL_EXERCISER', 'ABIS_BABY_PRODUCT', 'ABIS_BOOK', 'ABIS_DRUGSTORE', ..., 'WREATH', 'WRITING_BOARD', 'WRITING_INSTRUMENT', 'YO_YO']

In [11]:
%%time
# Create c-TF-IDF

# Bigram counts; the token pattern keeps tokens that contain one of the
# characters - ' % + . " between word characters.
cvec_params = {"ngram_range":(2,2),
               "token_pattern": r"\w+[\-\'\%\+\.\"]?\w*[\-\'\%\+\.\"]?\w*"}

# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; calling fit() and then transform() tokenises the whole corpus twice.
count_vectorizer = CountVectorizer(**cvec_params)
count = count_vectorizer.fit_transform(docs_per_class.Document)

# n_samples is the number of individual documents, not the number of classes.
ctfidf_vec = CTFIDFVectorizer().fit(count, n_samples=len(docs))
ctfidf = ctfidf_vec.transform(count)

CPU times: user 3min 25s, sys: 16.2 s, total: 3min 41s
Wall time: 3min 46s


In [12]:
%%time 
# Map every class name to the vocabulary entries of its top-weighted features.
top_feats = ctfidf_vec.extract_relevant_features(count, n_top=n_top)
words = count_vectorizer.get_feature_names_out()
words_per_class = dict(zip(target_names, (words[feat_ids] for feat_ids in top_feats)))

CPU times: user 54.9 s, sys: 6.84 s, total: 1min 1s
Wall time: 1min 2s


In [13]:
# Show the selected n-grams of one class, picked by position in `target_names`.
j = 11
c_name = target_names[j]
print(f'class name = {c_name}')
print(f'top words = {words_per_class[c_name]}')

class name = ASTRINGENT_SUBSTANCE
top words = ['used it' 'i used' 'my pores' 'my belly' 'i use' 'using it' 'i have'
 'been using' 'using this' 'use it' 'no stretch' 'the smell' 'this toner'
 'on my' 'stretch marks.' 'my face' 'this oil' 'this product' 'my skin'
 'stretch marks']


In [14]:
# Show the selected n-grams of one class, picked by position in `target_names`.
j = 30
c_name = target_names[j]
print(f'class name = {c_name}')
print(f'top words = {words_per_class[c_name]}')

class name = BOTTLE_OPENER
top words = ['small to' 'it s' 'key chain' 'a keychain' 'it was' 'too small'
 'wars fan' 'it is' 'open bottles' 'smaller than' 'bottle cap' 'a bottle.'
 'as a' 'bottle opener.' 'the bottle' 'star wars' 'open a' 'to open'
 'a bottle' 'bottle opener']


In [15]:
pd.DataFrame(words_per_class)[soted_cols]

Unnamed: 0,ABIS_BOOK,TABLETOP_GAME,BOARD_GAME,TOYS_AND_GAMES,TOY_BUILDING_BLOCK,KITCHEN,BLANK_BOOK,PET_TOY,HEADPHONES,SKIN_MOISTURIZER,...,COOKIE_CUTTER,THREAD_CORD,SWEATSHIRT,UNINTERRUPTIBLE_POWER_SUPPLY,WASTE_BAG,FUNNEL,SKIN_FOUNDATION_CONCEALER,DISHWARE_PLACE_SETTING,LEASH,FIGURINE
0,as a,for the,br br,and the,with these,ice tea,book is,the dog,and they,dark circles,...,sharp star,ie got,jacket. two,mr. theil.,biodegradable makes,18 10,delivering was,collect nick-knacks,system nice,prices are
1,if you,i love,the board,in the,loves them,the microwave,gravity falls,have a,my ears,and i,...,shape fairly,quality butcher's,good jacket.,ever arriving.,so comparable,neck of,beige which,nick-knacks i,sometimes pulls,magnets are
2,this book.,it is,it is,bought this,stack them,to clean,the cover,tennis ball,for my,and it,...,sizes rather,of strings.,hood keeps,seller peter,composting at,wide mouth,product applies,perfectly registered.,undone easily.,son didn
3,to be,and i,this is,gift for,the cups,air popper,the journal,the stuffing,i have,using it,...,cutters just,handy.. and,head. better,libertarianism is,compost pick,10 stainless,sand beige,china set,leash uncouples.,provide. i
4,to read,if you,is a,easy to,these blocks,the pitcher,for a,loves this,quality is,my eyes,...,cutters have,laces dental,xxlarge and,going return,corn cobs,into jars,beige is,"christmas"". i",hips instead,set 34
5,i was,on the,to learn,it is,blocks are,this is,of the,our dog,the ear,the scent,...,cutters is,uses hang,flufy material,walden pond.,them leak,jars or,fingers dirty,dishes settings,around hips,board. but
6,i have,these cards,play with,is a,these are,it is,journal is,it s,headphones for,skin is,...,rockstar birthday,tie parchment,two small.,undeliverable' when,when heavier,a funnel,10 natural,wood i've,"the bungee""",pieces he
7,to the,in the,easy to,br br,my son,this peeler,on the,tennis balls,my son,dry skin,...,to store,always have,hoodie that,refund' process,leaking. definitely,funnel is,tan 5,added place,waist leashes,same. unfortunately
8,it was,is a,of the,a great,to play,i have,my son,dog is,br br,use it,...,the different,of cooking,hoodie just,for paypal,curbside compost,funnel that,makeup ever,the quality,leashes because,did receive.
9,and i,this is,year old,to play,to stack,the tea,this is,dogs love,headphones are,using this,...,to clean,use it,for sweat,arrives says,these are,funnel for,dirty delivering,on christmas,and lunges,magnet board.


# Retrieving relevant ngrams from review titles

In [16]:
n_top = 20

In [17]:
# Bigram counts; the token pattern keeps tokens that contain one of the
# characters - ' % + . " between word characters.
cvec_params = {"ngram_range":(2,2),
               "token_pattern": r"\w+[\-\'\%\+\.\"]?\w*[\-\'\%\+\.\"]?\w*"}

#cvec_params = {"ngram_range":(2,2),
#               "stop_words": 'english'}

# Join documents per label: one long document per product type, built from the review titles
docs = pd.DataFrame({'Document': df.review_title[0:top_N],
                     'Class': df.product_type[0:top_N]})

docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})
# Drop the classes whose joined document is the empty string
docs_per_class['nonempty'] = [len(x)!=0 for x in docs_per_class.Document]
docs_per_class = docs_per_class[docs_per_class['nonempty']][['Class','Document']]
target_names = docs_per_class.Class.values

# Create c-TF-IDF
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; calling fit() and then transform() tokenises the whole corpus twice.
count_vectorizer = CountVectorizer(**cvec_params)
count = count_vectorizer.fit_transform(docs_per_class.Document)
ctfidf_vec = CTFIDFVectorizer().fit(count, n_samples=len(docs))
ctfidf = ctfidf_vec.transform(count)

In [18]:
# Map every class name to the vocabulary entries of its top-weighted features.
top_feats = ctfidf_vec.extract_relevant_features(count, n_top=n_top)
words = count_vectorizer.get_feature_names_out()
words_per_class = dict(zip(target_names, (words[feat_ids] for feat_ids in top_feats)))

In [19]:
pd.DataFrame(words_per_class)[soted_cols]

Unnamed: 0,ABIS_BOOK,TABLETOP_GAME,BOARD_GAME,TOYS_AND_GAMES,TOY_BUILDING_BLOCK,KITCHEN,BLANK_BOOK,PET_TOY,HEADPHONES,SKIN_MOISTURIZER,...,COOKIE_CUTTER,THREAD_CORD,SWEATSHIRT,UNINTERRUPTIBLE_POWER_SUPPLY,WASTE_BAG,FUNNEL,SKIN_FOUNDATION_CONCEALER,DISHWARE_PLACE_SETTING,LEASH,FIGURINE
0,this is,easy to,great for,for my,good quality,good product,a great,t last,son loves,good moisturizer,...,cutters cute,twine just,small really,again good,slightly larger,for 21,flaps break,set the,flaps beautiful,flaps beautiful
1,is a,to play,to learn,toy for,fun for,popcorn popper,good quality,dogs favorite,book for,works great,...,from ateco,heavy twine,expected like,s my,pickup and,wide mouth,in one,smile on,flaps buy,flaps aren
2,the best,of the,for the,toy great,for babies,great popcorn,this journal,your dog,year old,for me,...,cutouts. good,time tie,really warm,seller again,both city,21 good,the wrong,the table,flaps baby,flaps are
3,great for,family game,fun family,stars five,love these,great tea,for a,the ball,this book,face cream,...,steel cookie,twine and,it. need,never buy,well sturdy,nice funnel,all in,puts a,flaps aren't,flaps and
4,the book,for the,stars five,fun and,great gift,the best,cute book,toy my,good headphones,body oil,...,cutters fun,cotton twine,comfortable love,can give,defective. works,junk from,sent the,through out,flaps aren,flaps amazing
5,to read,fun for,good game,perfect for,stacking cups,it works,great quality,this toy,sound for,it works,...,cutters great,all cotton,week liked,the service,home composting,end. junk,too heavy,the winter,flaps are,flaps adorable
6,a great,fun fun,family fun,for a,perfect for,and easy,loved it,love it,cute book,love this,...,rusty disappointed,uses around,it zipper,give zero,little trash,smaller end.,without being,in tact,flaps and,flaps aren't
7,a must,love this,fun fun,fun for,year old,works well,book great,loves this,book great,this product,...,ateco cut,things. true,hoodie. pockets,excellent interesting,these bags,versatile wide,easy transaction,these arrived,flaps amazing,too small
8,great read,stars five,great family,kids love,month old,ice tea,this book,five stars,headphones great,great for,...,of cutters,size many,pockets ripped,service if,ties 1,liquid funnel,one nice,face when,flaps be,not received
9,of the,great for,board game,of fun,favorite toy,air popper,love this,toy for,dog man,great moisturizer,...,disappointed storage,cathy's review,pockets seams,interesting would,bags leaky,china funnel,wrong color,table with,flaps boring,received this


Note that `'stars stars'` appears as ngram in `'NUTRITIONAL_SUPPLEMENT'`. 

This seems quite improvable. Does `'star wars'` actually appear in any `'NUTRITIONAL_SUPPLEMENT'` review title? No:

In [20]:
# One boolean per NUTRITIONAL_SUPPLEMENT title: does it contain 'star wars' (case-insensitive)?
flag_vec = [
    'star wars' in title.lower()
    for title in df.loc[df.product_type=='NUTRITIONAL_SUPPLEMENT', 'review_title'].values
]
np.sum(flag_vec)

0

Note that `'star wars'` actually appears in some PTs such as `'ABIS_BOOK'`

In [21]:
# One boolean per ABIS_BOOK title: does it contain 'star wars' (case-insensitive)?
flag_vec = [
    'star wars' in title.lower()
    for title in df.loc[df.product_type=='ABIS_BOOK', 'review_title'].values
]
np.sum(flag_vec)

404

This suggests that we can post-process the returned n-grams by simply checking whether they appear in the review titles of their product type.
Does doing so improve the quality?

In [22]:
df_key_themes_titles = pd.DataFrame(words_per_class)[soted_cols]

In [23]:
def check_ngram_in_database(df, pt, ngram):
    """Count the review titles of product type `pt` that contain `ngram`.

    The match is a plain, case-insensitive substring test (`ngram` is expected
    in lower case and is not interpreted as a regular expression). Missing
    titles are counted as non-matching instead of raising.

    Parameters
    ----------
    df : DataFrame with `product_type` and `review_title` columns.
    pt : product type whose titles are searched.
    ngram : lower-case string to look for.

    Returns
    -------
    int
        Number of matching titles; 0 when `pt` has no rows.
    """
    titles = df.loc[df.product_type == pt, 'review_title']
    hits = titles.str.lower().str.contains(ngram, regex=False, na=False)
    return int(hits.sum())

In [24]:
def filter_ngrams(df_themes, df):
    """Count, per product type, how many review titles contain each candidate n-gram.

    Despite its name this function does not remove anything: it only
    returns the support of every n-gram.

    Parameters
    ----------
    df_themes : DataFrame whose column names are product types and whose
        values are the (lower-case) candidate n-grams of that product type.
    df : DataFrame with `product_type` and `review_title` columns.

    Returns
    -------
    dict
        Maps each column of `df_themes` to a list of integer counts, aligned
        with the rows of that column. The match is a case-insensitive plain
        substring test; missing titles never match.
    """
    ngram_counts = {}
    for col in df_themes.columns:
        # The titles of a product type are selected and lower-cased once per
        # column rather than once per n-gram.
        titles = df.loc[df.product_type == col, 'review_title'].str.lower()
        ngram_counts[col] = [
            int(titles.str.contains(ngram, regex=False, na=False).sum())
            for ngram in df_themes[col]
        ]
    return ngram_counts

In [25]:
ngram_counts = filter_ngrams(df_key_themes_titles, df)

In [26]:
pd.DataFrame(ngram_counts).head()

Unnamed: 0,ABIS_BOOK,TABLETOP_GAME,BOARD_GAME,TOYS_AND_GAMES,TOY_BUILDING_BLOCK,KITCHEN,BLANK_BOOK,PET_TOY,HEADPHONES,SKIN_MOISTURIZER,...,COOKIE_CUTTER,THREAD_CORD,SWEATSHIRT,UNINTERRUPTIBLE_POWER_SUPPLY,WASTE_BAG,FUNNEL,SKIN_FOUNDATION_CONCEALER,DISHWARE_PLACE_SETTING,LEASH,FIGURINE
0,12103,242,282,140,139,115,104,136,74,34,...,0,0,0,0,1,0,0,1,0,0
1,20965,172,230,97,123,87,93,48,76,49,...,1,0,0,1,1,1,1,1,0,0
2,13542,275,280,0,109,78,59,72,77,72,...,0,0,1,1,1,0,1,1,0,0
3,14018,170,146,0,138,80,184,68,110,42,...,1,0,0,1,0,1,1,1,0,0
4,16333,268,0,124,171,168,103,9,37,44,...,0,1,0,1,0,1,1,1,0,0


In [27]:
import copy

def rerank_dfs(df_themes, ngram_counts):
    """Reorder the n-grams of every product type by decreasing support.

    Parameters
    ----------
    df_themes : DataFrame whose columns are product types and whose rows are
        candidate n-grams.
    ngram_counts : dict or DataFrame with the same columns, holding the
        support of the n-gram at the same position in `df_themes`.

    Returns
    -------
    (DataFrame, DataFrame)
        Copies of the themes and of the counts with every column sorted from
        the highest to the lowest count. N-grams with equal counts keep their
        original relative order. The inputs are left untouched.
    """
    df_themes = df_themes.copy(deep=True)
    # Explicit copy so that a DataFrame passed by the caller is never modified.
    ngram_counts = pd.DataFrame(ngram_counts).copy()

    for col in df_themes.columns:
        counts = ngram_counts[col].to_numpy()
        # A single stable sort per column: negating the counts gives a
        # descending order while ties stay in their original order.
        order = np.argsort(-counts, kind='stable')
        df_themes[col] = df_themes[col].to_numpy()[order]
        ngram_counts[col] = counts[order]

    return df_themes, ngram_counts

In [28]:
df_key_themes_titles

Unnamed: 0,ABIS_BOOK,TABLETOP_GAME,BOARD_GAME,TOYS_AND_GAMES,TOY_BUILDING_BLOCK,KITCHEN,BLANK_BOOK,PET_TOY,HEADPHONES,SKIN_MOISTURIZER,...,COOKIE_CUTTER,THREAD_CORD,SWEATSHIRT,UNINTERRUPTIBLE_POWER_SUPPLY,WASTE_BAG,FUNNEL,SKIN_FOUNDATION_CONCEALER,DISHWARE_PLACE_SETTING,LEASH,FIGURINE
0,this is,easy to,great for,for my,good quality,good product,a great,t last,son loves,good moisturizer,...,cutters cute,twine just,small really,again good,slightly larger,for 21,flaps break,set the,flaps beautiful,flaps beautiful
1,is a,to play,to learn,toy for,fun for,popcorn popper,good quality,dogs favorite,book for,works great,...,from ateco,heavy twine,expected like,s my,pickup and,wide mouth,in one,smile on,flaps buy,flaps aren
2,the best,of the,for the,toy great,for babies,great popcorn,this journal,your dog,year old,for me,...,cutouts. good,time tie,really warm,seller again,both city,21 good,the wrong,the table,flaps baby,flaps are
3,great for,family game,fun family,stars five,love these,great tea,for a,the ball,this book,face cream,...,steel cookie,twine and,it. need,never buy,well sturdy,nice funnel,all in,puts a,flaps aren't,flaps and
4,the book,for the,stars five,fun and,great gift,the best,cute book,toy my,good headphones,body oil,...,cutters fun,cotton twine,comfortable love,can give,defective. works,junk from,sent the,through out,flaps aren,flaps amazing
5,to read,fun for,good game,perfect for,stacking cups,it works,great quality,this toy,sound for,it works,...,cutters great,all cotton,week liked,the service,home composting,end. junk,too heavy,the winter,flaps are,flaps adorable
6,a great,fun fun,family fun,for a,perfect for,and easy,loved it,love it,cute book,love this,...,rusty disappointed,uses around,it zipper,give zero,little trash,smaller end.,without being,in tact,flaps and,flaps aren't
7,a must,love this,fun fun,fun for,year old,works well,book great,loves this,book great,this product,...,ateco cut,things. true,hoodie. pockets,excellent interesting,these bags,versatile wide,easy transaction,these arrived,flaps amazing,too small
8,great read,stars five,great family,kids love,month old,ice tea,this book,five stars,headphones great,great for,...,of cutters,size many,pockets ripped,service if,ties 1,liquid funnel,one nice,face when,flaps be,not received
9,of the,great for,board game,of fun,favorite toy,air popper,love this,toy for,dog man,great moisturizer,...,disappointed storage,cathy's review,pockets seams,interesting would,bags leaky,china funnel,wrong color,table with,flaps boring,received this


In [29]:
most_frequent_features_per_product_type, ngram_counts = rerank_dfs(df_key_themes_titles, ngram_counts)

In [30]:
most_frequent_features_per_product_type

Unnamed: 0,ABIS_BOOK,TABLETOP_GAME,BOARD_GAME,TOYS_AND_GAMES,TOY_BUILDING_BLOCK,KITCHEN,BLANK_BOOK,PET_TOY,HEADPHONES,SKIN_MOISTURIZER,...,COOKIE_CUTTER,THREAD_CORD,SWEATSHIRT,UNINTERRUPTIBLE_POWER_SUPPLY,WASTE_BAG,FUNNEL,SKIN_FOUNDATION_CONCEALER,DISHWARE_PLACE_SETTING,LEASH,FIGURINE
0,great book,five stars,great game,five stars,five stars,easy to,five stars,my dog,for the,five stars,...,cookie cutters,five stars,pockets seams,buy from,similar bags,funnel won't,being too,table with,comes apart,too small
1,five stars,fun game,five stars,great for,great for,to use,great book,dog loves,five stars,great product,...,from ateco,many uses,really warm,never buy,these bags,funnel for,all in,the winter,safe and,this item
2,this book,great game,fun game,love it,great toy,five stars,love it,loves it,the price,love it,...,steel cookie,all cotton,warm comfy,can give,slightly larger,funnel when,sent the,in tact,and effective,nice magnets
3,book for,love it,game for,great gift,toy for,works great,this book,five stars,great book,sensitive skin,...,my stars,uses around,shinny material,the service,bags used,funnel with,best makeup,face when,nice product,not received
4,love it,great for,family game,year old,great product,love it,for a,favorite toy,great sound,love this,...,nice edges,cathy's review,zipper did,star-ups ideas,trash bags,nice funnel,without being,china that,great quality,double pieces
5,good book,d d,this game,for a,loves them,tea maker,book for,love it,this book,great for,...,july party,cooking twine,seams tear,s my,pickup and,better last,covers without,smile on,hands free,received double
6,is a,love this,fun for,for kids,year old,great product,great for,dog toy,great for,good product,...,of cutters,didn't string,comfy hoodie.,for star-ups,both city,larger chunks,makeup ever,the table,flaps amazing,box is
7,love this,game for,to play,loves it,for my,iced tea,love this,great toy,year old,eye cream,...,stars nice,best twine,pockets ripped,undeliverable understanding,home composting,good funnel,in one,puts a,flaps be,is too
8,must read,of the,great for,great product,loves these,best peeler,beautiful journal,dogs love,sound quality,for sensitive,...,edges cut,cotton twine,hoodie. pockets,seller again,little trash,wide mouth,the wrong,through out,effective great,have not
9,great read,for the,for the,gift for,great gift,the best,great gift,t last,book for,my skin,...,playdoh durable,rolls you,it. need,brilanti books,sturdy bags,regular liquid,wrong color,beautiful china,product hands,received this


## Fragment Selector

Select the n-grams that are meaningful, to avoid non-useful information such as "love love" or "stars stars". Other fragments are incomplete, such as:
    
- "difficult to" could be inside:
    - "difficult to break" which would be a positive sentence.
    - "difficult to love" which would be a negative sentence.
        
Ideally we want to find n-grams that have "complete sentence meaning".

To do this we must be able to identify whether a string is a complete sentence or not.

We can build a model that learns to target if a phrase is valid or not.

In [31]:
import sys
sys.path.append('../01_rfm_code/')
sys.path.append('./')
import RFM
import valid_sentences, invalid_sentences
from RFM import build_POS_tuple, build_POS_prototypes, build_POS_tuple, POSVectorizer
import numpy as np

# Training data for the fragment selector: label 0 = incomplete fragment, label 1 = complete sentence.
incomplete_sentences = list(invalid_sentences.invalid_examples)
complete_sentences = list(valid_sentences.complete_sentences)

# Incomplete examples come first, so the labels follow the same order.
all_sentences = incomplete_sentences + complete_sentences
labels = np.array([0 for _ in incomplete_sentences] + [1 for _ in complete_sentences])

print(f'len(all_sentences)={len(all_sentences)}, len(labels)={len(labels)}')

len(all_sentences)=1230, len(labels)=1230


In [32]:
POSVectorizer(languages=['english'])

In [33]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression

cvec_params = dict(ngram_range=(2, 2),
                   token_pattern=r"\w+[\-\'\%\+\.\"]?\w*[\-\'\%\+\.\"]?\w*")

# Two feature blocks side by side: bigram counts and the output of POSVectorizer.
model_features = FeatureUnion(transformer_list=[
    ('cvec', CountVectorizer(**cvec_params)),
    ('posvec', POSVectorizer(languages=['english'])),
])

# The combined features feed a logistic regression that predicts
# 1 for a complete sentence and 0 for an incomplete fragment.
fragment_selector = Pipeline(steps=[
    ('features', model_features),
    ('logistic', LogisticRegression()),
])

fragment_selector.fit(all_sentences, labels)

In [98]:
fragment_selector.predict_proba(['great for cleaning'])

array([[0.13947665, 0.86052335]])

In [35]:
fragment_selector.predict(['great for cleaning'])

array([1])

In [36]:
pos_model = fragment_selector.steps[0][1].transformer_list[1][1]

In [37]:
pos_model.transform_pos_tags(["Good for 5 years old kids and up"])

[('ADJ', 'ADP', 'NUM', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'ADV')]

#### cleaned fragments per product_type

In [49]:
fppt = {} # features per product type

# Keep, for every product type, only the n-grams the fragment selector classifies as 1.
for product_type in tqdm(most_frequent_features_per_product_type):
    candidate_aspects = most_frequent_features_per_product_type[product_type]
    fppt[product_type] = [
        aspect
        for aspect in candidate_aspects
        if fragment_selector.predict([aspect]) == 1
    ]

  0%|          | 0/20 [00:00<?, ?it/s]

We can use the POS_prototypes to filter n-grams and accept only those that comply with the valid prototypes

In [87]:
%%time
from pathlib import Path
import json

fppt_path = "./data/out_diary_asin_tagger/fppt.json"
os.makedirs(os.path.dirname(fppt_path), exist_ok=True)

# Disk cache for `fppt`. Note that when the file already exists, the `fppt`
# currently in memory is replaced by the stored version.
if not os.path.exists(fppt_path):
    print(f'{fppt_path} does not exist, creating and storing to disk!')    
    with open(fppt_path, 'w') as f:
        json.dump(fppt, f)
        print(f'fppt stored to {fppt_path}')
else:
    print(f'{fppt_path} already exists! loading from disk...')
    with open(fppt_path) as f:
        fppt = json.load(f)
                
#min_ngrams_in_product_type = min([len(fppt[x]) for x in  fppt])
#fppt_prunned = {product_type:fppt[product_type][0:min_ngrams_in_product_type] for product_type in fppt}
#fppt_prunned_table = pd.DataFrame(fppt_prunned).T

./data/out_diary_asin_tagger/fppt.json already exists! loading from disk...
CPU times: user 1.26 ms, sys: 855 µs, total: 2.12 ms
Wall time: 1.65 ms


In [None]:
#cat data/out_diary_asin_tagger/fppt.json

In [None]:
fppt

## Fragment Aggregator


Aggregate relevant fragments into "fragment templates" which are sets of fragments with similar semantic meaning



In [96]:
#!open data

## Selecting Asins that verify a given property 

This section produces a dataset with the following schema

```
{
    "property_1": list_of_asins_1,
    "property_2": list_of_asins_2,
        .
        .
        .
    "property_n": list_of_asins_n
}
```


In [None]:
asins = df.asin.unique()

In [None]:
asin_pos = df.asin.searchsorted(asins[2])

In [None]:
df[df.asin==asins[2]].iloc[0].product_type

In [None]:
# Rows, review titles and product type of the first asin.
asin_slice = df.loc[df.asin == asins[0]]
asin_titles = asin_slice['review_title'].values
asin_pt = asin_slice['product_type'].iloc[0]

In [None]:
def label_asins(df, df_themes, asins, n_min_support = 3):
    """
    Labels an asin according to a property in `df_themes` if a minimum support `n_min_support` is achieved.

    Parameters
    ----------
    df : DataFrame with `asin`, `review_title` and `product_type` columns.
    df_themes : DataFrame whose columns are product types and whose values are
        lower-case candidate n-grams.
    asins : iterable of asins to label.
    n_min_support : minimum number of review titles of the asin that must
        contain an n-gram for it to be kept as a label.

    Returns
    -------
    (dict, dict)
        `asin_to_labels` maps each asin to a list of `(ngram, count)` tuples,
        `asin_to_pt` maps each asin to the product type of its first row.

    Raises
    ------
    IndexError
        If an asin has no rows in `df`.
    """
    asin_to_labels = {}
    asin_to_pt = {}

    asins = list(asins)

    # Gather the rows of all requested asins in one pass instead of scanning
    # the whole frame once per asin.
    wanted_rows = df[df.asin.isin(asins)]
    first_rows = wanted_rows.drop_duplicates(subset='asin')
    pt_by_asin = dict(zip(first_rows['asin'], first_rows['product_type']))
    titles_by_asin = wanted_rows.groupby('asin', sort=False)['review_title'].agg(list).to_dict()

    for asin in tqdm(asins):
        if asin not in pt_by_asin:
            raise IndexError(f'asin {asin!r} has no rows in df')
        asin_pt = pt_by_asin[asin]

        # The n-grams are lower-case, so the titles are lower-cased before
        # matching; missing (non-string) titles are skipped.
        asin_titles = [title.lower() for title in titles_by_asin[asin] if isinstance(title, str)]

        asin_to_labels[asin] = []
        asin_to_pt[asin] = asin_pt

        for ngram in df_themes[asin_pt]:
            ngram_counts = sum(ngram in text for text in asin_titles)
            if ngram_counts >= n_min_support:
                asin_to_labels[asin].append((ngram, ngram_counts))

    return asin_to_labels, asin_to_pt

In [None]:
%%time 
# `df_key_themes_titles_reranked` is not defined anywhere in this notebook; the
# re-ranked themes returned by `rerank_dfs` are stored in
# `most_frequent_features_per_product_type`.
asin_to_labels, asin_to_pt = label_asins(df, most_frequent_features_per_product_type, asins)

In [None]:
asin_to_labels['0307719774']

In [None]:
# Two-column frame: one row per asin with its list of (ngram, count) labels.
df_results = (
    pd.DataFrame([asin_to_labels])
    .T
    .reset_index()
    .rename(columns={'index':'asin', 0:'labels'})
)

In [None]:
df_results.to_csv('asin_labels.csv', columns=['asin', 'labels'])

In [None]:
pd.read_csv('asin_labels.csv', index_col=0)

In [None]:

batch_size = 500
n_jobs = 10

def get_batches(s, n, truncate=False):
    """Yield consecutive slices of `s` holding `n` items each.

    The remainder (fewer than `n` items) is yielded last, unless `truncate`
    is true, in which case it is dropped.
    """
    assert n > 0
    total = len(s)
    full_end = total - total % n  # end of the last complete batch
    for start in range(0, full_end, n):
        yield s[start:start + n]
    if full_end < total and not truncate:
        yield s[full_end:]


def label_asins(df, df_themes, asins, n_min_support = 3):
    """
    Labels an asin according to a property in `df_themes` if a minimum support `n_min_support` is achieved.

    Same contract as the progress-bar version defined earlier in the notebook
    (this definition replaces it once the cell runs), without the progress bar.

    Parameters
    ----------
    df : DataFrame with `asin`, `review_title` and `product_type` columns.
    df_themes : DataFrame whose columns are product types and whose values are
        lower-case candidate n-grams.
    asins : iterable of asins to label.
    n_min_support : minimum number of review titles of the asin that must
        contain an n-gram for it to be kept as a label.

    Returns
    -------
    (dict, dict)
        `asin_to_labels` maps each asin to a list of `(ngram, count)` tuples,
        `asin_to_pt` maps each asin to the product type of its first row.

    Raises
    ------
    IndexError
        If an asin has no rows in `df`.
    """
    asin_to_labels = {}
    asin_to_pt = {}

    asins = list(asins)

    # Gather the rows of all requested asins in one pass instead of scanning
    # the whole frame once per asin.
    wanted_rows = df[df.asin.isin(asins)]
    first_rows = wanted_rows.drop_duplicates(subset='asin')
    pt_by_asin = dict(zip(first_rows['asin'], first_rows['product_type']))
    titles_by_asin = wanted_rows.groupby('asin', sort=False)['review_title'].agg(list).to_dict()

    for asin in asins:
        if asin not in pt_by_asin:
            raise IndexError(f'asin {asin!r} has no rows in df')
        asin_pt = pt_by_asin[asin]

        # The n-grams are lower-case, so the titles are lower-cased before
        # matching; missing (non-string) titles are skipped.
        asin_titles = [title.lower() for title in titles_by_asin[asin] if isinstance(title, str)]

        asin_to_labels[asin] = []
        asin_to_pt[asin] = asin_pt

        for ngram in df_themes[asin_pt]:
            ngram_counts = sum(ngram in text for text in asin_titles)
            if ngram_counts >= n_min_support:
                asin_to_labels[asin].append((ngram, ngram_counts))

    return asin_to_labels, asin_to_pt
    
def label_asins_parallel(df, df_themes, asins, n_min_support = 3):
    """Run `label_asins` on batches of `asins` in parallel.

    The asins are split with `get_batches` into chunks of `batch_size` and
    dispatched over `n_jobs` workers (both are module-level settings).

    NOTE(review): `Parallel` and `delayed` are not imported anywhere in the
    visible notebook - presumably they come from joblib; confirm and add the
    import to the imports cell.

    Returns
    -------
    (list, list)
        `list_asin_to_labels` and `list_asin_to_pt`, each holding one dict per
        batch, in batch order.
    """
    results = Parallel(n_jobs=n_jobs)(
        delayed(label_asins)(df, df_themes, batch_asins, n_min_support)
        for batch_asins in get_batches(asins, batch_size)
    )

    # Every job returns an (asin_to_labels, asin_to_pt) pair; split the list of
    # pairs into two parallel lists (works for any number of batches).
    list_asin_to_labels = [batch_labels for batch_labels, _ in results]
    list_asin_to_pt = [batch_pt for _, batch_pt in results]

    return list_asin_to_labels, list_asin_to_pt