In [1]:
from pandas import DataFrame, concat, read_csv, Series
from time import sleep
import re
from nltk.tokenize import word_tokenize
import spacy
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk import pos_tag
from itertools import combinations
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas / spaCy warnings raised by the cells below.
warnings.filterwarnings('ignore')
# English stop words, used to filter tokens when building the word-frequency table.
stop_words = set(stopwords.words('english'))
# spaCy pipeline used by get_spacy_word_vector_similarity to compare two texts.
nlp = spacy.load('en_core_web_md')

In [30]:
comments_df = read_csv("../data/comments.csv")


def _content_tokens(comment):
    """Lower-case and tokenize one comment, keeping alphabetic tokens that are not stop words."""
    return [
        token
        for token in word_tokenize(str(comment).lower())
        if token.isalpha() and token not in stop_words
    ]


# Missing comments are replaced by "" so they contribute no tokens; str(NaN) would
# otherwise be tokenized into a bogus "nan" word and inflate its frequency.
tokens = comments_df["comment"].fillna("").map(_content_tokens)

# Flatten in a single pass. Summing a Series of lists re-copies the accumulated list
# for every row, which is quadratic in the total number of tokens.
token_freq = Series([token for row in tokens for token in row]).value_counts()

In [41]:
# Name the index and the values explicitly instead of relying on the Series being
# unnamed: DataFrame(series, columns=["frequency"]) selects by the Series' name, so a
# named result (value_counts names it "count" on pandas >= 2.0) would give an all-NaN column.
token_df = token_freq.rename_axis("token").reset_index(name="frequency")
# Tag each token on its own (no sentence context) and keep only the tag of the (word, tag) pair.
token_df["pos"] = token_df["token"].map(lambda x: pos_tag([x])[0][1])

In [43]:
# Write the token table to disk; index=False keeps the positional row index out of the CSV.
token_df.to_csv("../data/word_frequencies.csv", index=False)

1. Multi-select
    - What are you looking for in your cereal?
        - taste
        - flavor
        - nutrition
        - sweet
        - crunch
    - What else do you want your cereal to have?
        - marshmallow
        - honey
        - fruit

2. Single-select
    - Do you have any preference for the grain?
        - oats
        - wheat
        - rice
        - corn

### Cereal to feature association

1. Sentiment Analyzer

In [2]:
# Shared VADER analyzer instance, created once and reused by review_sentiment.
senti_analyzer = SentimentIntensityAnalyzer()

def review_sentiment(review: str):
    """Return the "compound" polarity score that the shared analyzer assigns to ``review``."""
    return senti_analyzer.polarity_scores(review)["compound"]

2. Similarity between two documents

In [3]:
def get_spacy_word_vector_similarity(text1: str, text2: str):
    """Parse both texts with the module-level ``nlp`` pipeline and return ``doc1.similarity(doc2)``."""
    first_doc = nlp(text1)
    second_doc = nlp(text2)
    return first_doc.similarity(second_doc)

3. Create all possible combinations of the features that may be selected by the user

In [4]:
# Selectable features, grouped by the question they belong to.
multi_features = ["taste", "flavor", "nutrition", "sweet", "crunch"]
grain_features = ["oats", "wheat", "rice", "corn"]
extra_features = ["marshmallow", "honey", "fruit"]

# Every feature gets a one-letter key, assigned consecutively from "A" across the three
# groups. The offsets are derived from the list lengths so that adding or removing a
# feature can never make two groups share a key.
multi_start = ord("A")
grain_start = multi_start + len(multi_features)
extra_start = grain_start + len(grain_features)

multi_select = {chr(code): name for code, name in enumerate(multi_features, start=multi_start)}
single_select_1 = {chr(code): name for code, name in enumerate(grain_features, start=grain_start)}
single_select_2 = {chr(code): name for code, name in enumerate(extra_features, start=extra_start)}

In [5]:
# Every subset of the multi-select keys, from the empty selection up to all of them,
# ordered by subset size.
multi_combinations = [
    subset
    for size in range(len(multi_select) + 1)
    for subset in combinations(multi_select, size)
]

multi_combinations[:10]

[(),
 ('A',),
 ('B',),
 ('C',),
 ('D',),
 ('E',),
 ('A', 'B'),
 ('A', 'C'),
 ('A', 'D'),
 ('A', 'E')]

4. Calculate similarity for all combinations

In [6]:
grain_keys = list(single_select_1)
extra_keys = list(single_select_2)

# What may be appended to a multi-select subset: one grain, one extra, one of each, or nothing.
suffixes = (
    [(grain,) for grain in grain_keys]
    + [(extra,) for extra in extra_keys]
    + [(grain, extra) for grain in grain_keys for extra in extra_keys]
    + [()]
)

# Pair every suffix with every multi-select subset, drop the single all-empty
# selection and store each combination as a sorted tuple of keys.
final_combinations = [
    tuple(sorted(subset + suffix))
    for suffix in suffixes
    for subset in multi_combinations
    if subset or suffix
]
len(final_combinations)

639

In [8]:
# Load the comments and the product table; the product table's "rating" column is
# renamed to "rank" so it does not clash with the per-comment "rating" in the merge.
comments_df = read_csv("../data/comments.csv")
product_df = read_csv("../data/product_rating.csv")
product_df = product_df.rename(columns={"rating": "rank"})

# One row per comment with its product's columns attached; comments are forced to str.
reviews_df = (
    comments_df
    .merge(product_df, on="product_id")
    .assign(comment=lambda frame: frame["comment"].map(str))
)
reviews_df.head()

Unnamed: 0,user,rating,comment,product_id,rank,name
0,Dino Bravo,5,Still remember the taste of it to this day. On...,4,1034,Addams Family Cereal
1,P. Crackers,5,Pretty much what everyone else is saying... th...,4,1034,Addams Family Cereal
2,Jeff H.,5,This was the,4,1034,Addams Family Cereal
3,Kaboodle Doodle,5,This was one of my favorites from the early 90's.,4,1034,Addams Family Cereal
4,K.Y.,5,I remember this cereal for some reason. I was ...,4,1034,Addams Family Cereal


In [108]:
# Score every comment with review_sentiment and append the result as a new column.
reviews_df = reviews_df.assign(sentiment_score=reviews_df["comment"].map(review_sentiment))
reviews_df.head()

Unnamed: 0,user,rating,comment,product_id,rank,name,sentiment_score
0,Dino Bravo,5,Still remember the taste of it to this day. On...,4,1034,Addams Family Cereal,0.6369
1,P. Crackers,5,Pretty much what everyone else is saying... th...,4,1034,Addams Family Cereal,0.937
2,Jeff H.,5,This was the,4,1034,Addams Family Cereal,0.0
3,Kaboodle Doodle,5,This was one of my favorites from the early 90's.,4,1034,Addams Family Cereal,0.4215
4,K.Y.,5,I remember this cereal for some reason. I was ...,4,1034,Addams Family Cereal,0.0


In [109]:
# One-off export of the merged reviews together with their sentiment scores.
# Presumably left commented out because the next cell reads this file, adds cached
# "*_sim" columns to it and writes it back; re-running the export would replace the
# file with a frame that lacks those columns.
# NOTE(review): on a fresh checkout ../data/reviews.csv must already exist (or this
# line must be run once) before the next cell can load it.
# reviews_df.to_csv("../data/reviews.csv", index=False)

In [11]:
# Resume from the cached file: "<keys>_sim" columns that already exist are skipped,
# so this cell is meant to be re-run until every combination has a column.
reviews_df = read_csv("../data/reviews.csv")
reviews_df["comment"] = reviews_df["comment"].map(str)

# One lookup table for every feature key; the three key ranges (A-E, F-I, J-L) do not overlap.
feature_names = {**multi_select, **single_select_1, **single_select_2}

# Each distinct comment is parsed by spaCy at most once per run instead of once per
# combination. Filled lazily so a run with nothing left to compute does no parsing.
comment_docs = dict()


def _comment_doc(text: str):
    """Return the spaCy Doc for ``text``, parsing it only on first use."""
    if text not in comment_docs:
        comment_docs[text] = nlp(text)
    return comment_docs[text]


count = 0
for comb in final_combinations:
    col_name = "".join(map(str, sorted(list(comb))))
    if col_name+"_sim" in reviews_df.columns:
        continue
    attributes = " ".join(feature_names[key] for key in comb)
    # The attribute text is the same for every row, so parse it once per combination.
    attributes_doc = nlp(attributes)
    reviews_df[col_name+"_sim"] = reviews_df["comment"].map(lambda x: _comment_doc(x).similarity(attributes_doc))
    # Checkpoint after every new column so an interrupted run loses at most one column.
    reviews_df.to_csv("../data/reviews.csv", index=False)
    count+=1
    # Stop after 31 new columns per execution to keep a single run short.
    if count > 30:
        break

In [12]:
# Reload the cached reviews (including all similarity columns) and show the column count.
reviews_df = read_csv("../data/reviews.csv").assign(
    comment=lambda frame: frame["comment"].map(str)
)
reviews_df.shape[1]

646

In [25]:
# Weight each similarity column by the comment's sentiment score; the product is stored
# under the bare key string (the "<keys>_sim" column name without its suffix).
sentiment = reviews_df["sentiment_score"]
for feature_keys in final_combinations:
    score_col = "".join(str(key) for key in sorted(feature_keys))
    reviews_df[score_col] = reviews_df[score_col + "_sim"] * sentiment

In [27]:
# Column names of the sentiment-weighted scores, one per feature combination.
score_columns = ["".join(map(str, sorted(comb))) for comb in final_combinations]

# Mean rating and mean weighted score per product.
per_product_means = reviews_df.groupby(["product_id", "name"])[["rating"] + score_columns].mean()

# Re-attach the product table (its "rating" renamed to "rank") and key the result by product_id.
product_df = read_csv("../data/product_rating.csv").rename(columns={"rating": "rank"})
final_scores = per_product_means.merge(product_df, on="product_id").set_index("product_id")
final_scores.to_json("../data/final_scores.json")

### Cereal to Cereal association

In [61]:
# Keep only the per-combination score columns: one feature vector (row) per product.
attribute_columns = ["".join(str(key) for key in sorted(comb)) for comb in final_combinations]
single_attribute_vector = final_scores[attribute_columns]

In [64]:
def cosine_similarity(vec_a: Series, vec_b: Series):
    """Cosine similarity of two label-indexed vectors.

    Both Series are sorted by index before the dot product, so two vectors that carry
    the same labels in a different order are compared component by component.

    Returns 0.0 when either vector has zero length; the cosine is undefined there and
    the plain formula would divide by zero and yield NaN.
    """
    a = vec_a.sort_index().to_numpy(dtype=float)
    b = vec_b.sort_index().to_numpy(dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Pairwise cosine similarity between the products' feature vectors.
# Assumes product ids are unique and that each one is a row label of single_attribute_vector.
product_ids = product_df["product_id"].values
n_products = len(product_ids)

# The diagonal stays 0 so a cereal is never reported as its own nearest neighbour.
similarity_matrix = np.zeros((n_products, n_products), dtype=float)
for row_pos in range(n_products):
    row_vector = single_attribute_vector.loc[product_ids[row_pos]]
    # Cosine similarity is symmetric: compute each unordered pair once and mirror it.
    for col_pos in range(row_pos + 1, n_products):
        pair_similarity = cosine_similarity(row_vector, single_attribute_vector.loc[product_ids[col_pos]])
        similarity_matrix[row_pos, col_pos] = pair_similarity
        similarity_matrix[col_pos, row_pos] = pair_similarity

# Numeric frame with product ids on both axes.
cereal_similarity_df = DataFrame(similarity_matrix, index=product_ids, columns=product_ids)

cereal_similarity_df.head()

Unnamed: 0,300,144,1143,216,53,175,286,197,72,222,...,1344,542,1462,100,159,202,200,1173,450,709
300,0.0,0.999897,0.999819,0.999909,0.999707,0.999676,0.999776,0.999888,0.999694,0.99982,...,0.999526,0.999842,0.999105,0.999517,0.999944,0.999701,0.999235,0.99983,0.999869,0.999707
144,0.999897,0.0,0.999727,0.999801,0.99984,0.999854,0.999731,0.99983,0.999788,0.999862,...,0.9996,0.999836,0.999037,0.99958,0.999859,0.999727,0.999171,0.999679,0.999796,0.999709
1143,0.999819,0.999727,0.0,0.999964,0.999851,0.999675,0.999981,0.999615,0.999893,0.99989,...,0.999837,0.999775,0.999716,0.999868,0.999843,0.999901,0.999777,0.999378,0.999924,0.999904
216,0.999909,0.999801,0.999964,0.0,0.999805,0.999627,0.999937,0.999693,0.999818,0.9999,...,0.999686,0.99975,0.999518,0.999755,0.999886,0.999874,0.999615,0.999602,0.999917,0.999826
53,0.999707,0.99984,0.999851,0.999805,0.0,0.999845,0.999888,0.999561,0.999976,0.999945,...,0.999884,0.999786,0.999539,0.999892,0.999727,0.999895,0.999645,0.999206,0.999806,0.99988


In [82]:
def maxNIndex(elements: Series, n: int):
    """Return the index labels of the ``n`` largest values of ``elements``, largest first."""
    ranked = elements.sort_values(ascending=False)
    return ranked.index[:n].values

In [117]:
top_n = 5
top_columns = ["top_" + str(rank + 1) for rank in range(top_n)]

# For every cereal (row), the product ids of its top_n most similar cereals, best first.
cereals_df = DataFrame(cereal_similarity_df.apply(lambda x: maxNIndex(x, top_n), axis=1), columns=["array"])
for i in range(top_n):
    # Spread the per-row array into one column per rank.
    cereals_df["top_" + str(i+1)] = cereals_df["array"].map(lambda x: x[i])
cereals_df = cereals_df.reset_index().rename(columns={"index":"product_id"})[["product_id"] + top_columns]
# Key each row by the cereal's name instead of its product id.
cereals_df = cereals_df.merge(product_df, on="product_id").set_index("name")[top_columns]

# Translate the neighbour ids to names through a derived lookup Series. product_df itself
# is left untouched: re-indexing it in place made this cell (and any cell that reads
# product_df["product_id"]) fail when run a second time.
product_names = product_df.set_index("product_id")["name"]
for column in top_columns:
    cereals_df[column] = cereals_df[column].map(product_names)

In [127]:
# Transpose so each cereal name becomes a column; with to_json's default orientation
# the file then maps cereal name -> {"top_1": ..., "top_2": ..., ...}.
cereals_df.T.to_json("../data/nearest_cereals.json")