In [None]:
import codecs
import math
import random
import re
import zlib

import numpy as np
import pandas as pd
import scipy
import scipy.stats
from scipy.stats import gamma, kstest
#import spacy
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
!pip install vaderSentiment
import vaderSentiment.vaderSentiment as vader
from sklearn.cluster import KMeans

In [None]:
import altair as alt
from detail.altairdf import altairDF
# Switch Altair to its "notebook" renderer (presumably so charts display inline).
# NOTE(review): the set of renderer names depends on the installed Altair version - confirm.
alt.renderers.enable("notebook")

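The Parquet files used in this notebook were created from the raw Kaggle CSV as shown below. The snippet writes the 10K-row sample; the notebook itself loads `data/amazon-reviews-50K.parquet`, so adjust the sample size and output filename accordingly to regenerate that file: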
```python
with open("data/amazon-reviews.csv") as f:
    data = pd.read_csv(f)
data = data.sample(10000).reset_index(drop=True)
data = data.drop(columns=["Id", "ProductId", "UserId", "ProfileName", "Time", "Summary"])
data["hscore"] = \
    data.apply(lambda row: (1+row["HelpfulnessNumerator"]) / (2+row["HelpfulnessDenominator"]), axis=1)
data = data.drop(columns=["HelpfulnessNumerator", "HelpfulnessDenominator"])
data = data.rename(columns={"Score":"score", "Text":"text"})
data = data[["score", "hscore", "text"]]
data.to_parquet("data/amazon-reviews-10K.parquet", compression="brotli")
```

In [None]:
def filterdf(df, pred):
    """Return the rows of ``df`` for which ``pred(row)`` is truthy.

    ``pred`` is called once per row with that row as a ``pandas.Series``.
    Rows are picked by position rather than by index label, so a frame whose
    index contains repeated labels is still filtered row by row (a label lookup
    would hand ``pred`` a DataFrame and select every row sharing the label).

    Args:
        df: DataFrame to filter.
        pred: Callable taking a row Series and returning a truthy/falsy value.

    Returns:
        A DataFrame with the matching rows in their original order.
    """
    keep = [pos for pos, (_, row) in enumerate(df.iterrows()) if pred(row)]
    return df.iloc[keep]
def showtxt(df, subset = ["text"]):
    return df.style \
             .applymap(lambda x: 'white-space:wrap', subset=subset) \
             .applymap(lambda x:'text-align:left', subset=subset)

In [None]:
# Load the pre-sampled reviews. reset_index(drop=True) guarantees a 0..n-1 index
# (a bare .reindex() with no arguments would leave the index untouched).
reviews = pd.read_parquet("data/amazon-reviews-50K.parquet").reset_index(drop=True)
showtxt(reviews.head(5))

In [None]:
#english = spacy.load('en_core_web_sm')
#def sentences(text):
#    return [str(s) for s in english(text).sents]

# Split point: a (possibly empty) run of spaces that directly follows '.', '!' or '?'.
sdelim = re.compile('(?<=[.!?]) *')
def sentences(text):
    """Split ``text`` after sentence-ending punctuation.

    Pieces of length 0 or 1 are discarded.
    """
    pieces = sdelim.split(text)
    return list(filter(lambda piece: len(piece) > 1, pieces))

# One shared VADER analyzer, reused for every sentence scored below.
sentiment = vader.SentimentIntensityAnalyzer()

def sentiment_compound(text):
    """Average the VADER ``compound`` score over the sentences of ``text``.

    Returns 0.0 when ``sentences(text)`` yields nothing to score.
    """
    total = 0
    count = 0
    for sentence in sentences(text):
        total += sentiment.polarity_scores(sentence)['compound']
        count += 1
    if count == 0:
        return 0.0
    return total / count

In [None]:
%%time
reviews["sentiment"] = reviews["text"].apply(sentiment_compound)

In [None]:
%%time
feats1 = reviews.copy().reindex()
feats1["feats"] = feats1.apply(lambda row: np.array([row["score"] / 5.0, row["sentiment"]]), axis=1)
feats1["feats"].sample(5)

In [None]:
# Unpack the two feature components into plain columns for plotting.
feats1["x: score"] = feats1["feats"].map(lambda vec: vec[0])
feats1["y: sentiment"] = feats1["feats"].map(lambda vec: vec[1])
# Plot a fixed-seed sample, capped at the frame size, so the chart is reproducible
# and sample() cannot fail on a frame with fewer than 2000 rows. Only the plotted
# columns are handed to Altair, so review text and feature vectors stay out of the chart data.
(
    alt.Chart(
        feats1[["x: score", "y: sentiment", "score"]]
        .sample(n=min(2000, len(feats1)), random_state=0)
    )
    .encode(x="x: score", y="y: sentiment", color="score")
    .mark_point()
    .interactive()
)

In [None]:
%%time
data = np.array(list(feats1["feats"]))
clustering = KMeans(n_clusters=10).fit(data)

In [None]:
# Label every row with the cluster id predicted by the fitted KMeans model.
# `clustering` is reassigned by later sections of this notebook, so this cell must
# run right after the fit on feats1 above.
feats1["pred"] = clustering.predict(np.array(list(feats1["feats"])))

In [None]:
# String labels so the cluster id is coloured as a category rather than a number.
feats1["pstr"] = feats1["pred"].map(str)
# Fixed-seed sample capped at the frame size (reproducible, and safe for frames
# under 2000 rows); only the plotted columns are passed to Altair.
(
    alt.Chart(
        feats1[["x: score", "y: sentiment", "pstr"]]
        .sample(n=min(2000, len(feats1)), random_state=0)
    )
    .encode(x="x: score", y="y: sentiment", color="pstr")
    .mark_point()
    .interactive()
)

In [None]:
# Distance from each row's feature vector to the centre of its assigned cluster.
# Iterating the two columns directly computes the same per-row norm without the
# per-row Series that DataFrame.apply(axis=1) would construct.
feats1["pdist"] = [
    np.linalg.norm(vec - clustering.cluster_centers_[label])
    for vec, label in zip(feats1["feats"], feats1["pred"])
]
# Fixed seed so the preview shows the same rows on every run.
feats1["pdist"].sample(5, random_state=0)

In [None]:
# The 25 rows farthest from their cluster centre, largest distance first.
anomalies = (
    feats1[["pdist", "sentiment", "score", "text"]]
    .sort_values(by="pdist", ascending=False)
    .head(25)
)
showtxt(anomalies)

In [None]:
def shingles(k):
    """Build a function that returns every length-``k`` window of a sequence.

    The returned function slices its argument at each start offset from 0 to
    ``len(doc) - k`` inclusive, so a ``doc`` shorter than ``k`` yields ``[]``.
    """
    def kshingles(doc):
        windows = []
        for start in range(len(doc) - k + 1):
            windows.append(doc[start:start + k])
        return windows
    return kshingles

htmlbr = re.compile('<br />')
whitesp = re.compile('\\s+')
def cleantxt(txt):
    """Normalise review text.

    ``<br />`` tags become spaces, every run of whitespace collapses to a single
    space, and the result is lower-cased.
    """
    without_breaks = htmlbr.sub(' ', txt)
    return whitesp.sub(' ', without_breaks).lower()

def hashing_frequency(vecsize, h, norm = 1.0):
    """Build a hashed term-frequency vectoriser.

    Args:
        vecsize: Length of the vectors produced.
        h: Function mapping a term to an integer; the term is counted in
            bucket ``h(term) % vecsize``.
        norm: Euclidean length that non-zero output vectors are scaled to.

    Returns:
        A function ``hf(words)`` that accepts an iterable of terms or a single
        space-delimited string and returns a ``float32`` array of length
        ``vecsize``. Empty terms are ignored; input with no terms gives zeros.
    """
    def hf(words):
        # isinstance (rather than an exact type match) also recognises str
        # subclasses such as numpy.str_, which would otherwise be iterated
        # character by character.
        if isinstance(words, str):
            # handle both lists of words and space-delimited strings
            words = words.split(" ")
        hsig = np.zeros(vecsize, dtype=np.float32)
        for term in words:
            if len(term) > 0:
                hsig[h(term) % vecsize] += 1.0
        # Dividing by (length / norm) rescales the vector to Euclidean length `norm`.
        z = np.linalg.norm(hsig) / norm
        if z > 0.0:
            hsig /= z
        return hsig
    return hf

In [None]:
%%time
sh4 = shingles(4)
hsig = hashing_frequency(512, hash, norm = 1)
feats2 = reviews.copy()
feats2["feats"] = feats2["text"].apply(lambda txt: hsig(sh4(cleantxt(txt))))
feats2["feats"].sample(3)

In [None]:
import sklearn.decomposition

def append_pca_columns(df, featcol, pcacols=["x", "y"]):
    """Return a copy of ``df`` with PCA projections of ``featcol`` appended.

    Args:
        df: Frame whose ``featcol`` cells are expected to be equal-length
            numeric vectors (they are stacked into one 2-D array).
        featcol: Name of the column holding those vectors.
        pcacols: Names for the output columns; one principal component is
            computed per name (two with the default). Existing columns with
            these names are replaced.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    pcacols = list(pcacols)
    data = np.array(list(df[featcol]))
    projected = sklearn.decomposition.PCA(len(pcacols)).fit_transform(data)
    out = df.drop(columns=pcacols, errors='ignore')
    # Assign component columns positionally. Concatenating a frame that carries a
    # fresh 0..n-1 index would be label-aligned against df's index and scatter
    # NaNs through the result whenever df is not indexed 0..n-1.
    for position, name in enumerate(pcacols):
        out[name] = projected[:, position]
    return out

def pca_features(df, icol, ocol, dimensions=2):
    """Store a PCA projection of column ``icol`` in column ``ocol`` of ``df``.

    The vectors in ``df[icol]`` are stacked, reduced to ``dimensions``
    components, and written back one projected vector per row. ``df`` is
    modified in place and also returned.
    """
    matrix = np.array(df[icol].tolist())
    reducer = sklearn.decomposition.PCA(dimensions)
    df[ocol] = list(reducer.fit_transform(matrix))
    return df

In [None]:
# Project the hashed shingle vectors to two PCA components for plotting.
feats2 = append_pca_columns(feats2, "feats")
# Fixed-seed sample capped at the frame size (reproducible, and safe for frames
# under 2000 rows); only the plotted columns are passed to Altair, so review text
# and the 512-wide feature vectors stay out of the chart data.
(
    alt.Chart(
        feats2[["x", "y", "score"]].sample(n=min(2000, len(feats2)), random_state=0)
    )
    .encode(x="x", y="y", color="score")
    .mark_point()
    .interactive()
)

In [None]:
%%time
data = np.array(list(feats2["feats"]))
clustering = KMeans(n_clusters=10).fit(data)

In [None]:
# Cluster id per row, plus a string copy so it is coloured as a category.
feats2["pred"] = clustering.predict(np.array(list(feats2["feats"])))
feats2["pstr"] = feats2["pred"].map(str)
# Fixed-seed sample capped at the frame size (reproducible, and safe for frames
# under 2000 rows); only the plotted columns are passed to Altair.
(
    alt.Chart(
        feats2[["x", "y", "pstr"]].sample(n=min(2000, len(feats2)), random_state=0)
    )
    .encode(x="x", y="y", color="pstr")
    .mark_point()
    .interactive()
)

In [None]:
# Distance from each row's feature vector to the centre of its assigned cluster.
# Iterating the two columns directly computes the same per-row norm without the
# per-row Series that DataFrame.apply(axis=1) would construct.
feats2["pdist"] = [
    np.linalg.norm(vec - clustering.cluster_centers_[label])
    for vec, label in zip(feats2["feats"], feats2["pred"])
]
# Fixed seed so the preview shows the same rows on every run.
feats2["pdist"].sample(5, random_state=0)

In [None]:
# The 25 rows farthest from their cluster centre, largest distance first.
anomalies = (
    feats2[["pdist", "score", "sentiment", "text"]]
    .sort_values(by="pdist", ascending=False)
    .head(25)
)
showtxt(anomalies)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer

# Width of the hashed word-count vectors.
HVSIZE = 1000
# The token pattern only accepts tokens that start with an ASCII letter and are
# at least two characters long.
vectorizer = HashingVectorizer(
    token_pattern='(?u)\\b[A-Za-z]\\w+\\b',
    n_features=HVSIZE,
    alternate_sign=False,
)
# One sparse row per review, computed from the cleaned text.
hvcounts = vectorizer.fit_transform(reviews["text"].map(cleantxt))

In [None]:
def normarray(v):
    """Flatten a sparse row to a dense 1-D array of unit Euclidean length.

    Args:
        v: Object with a ``toarray()`` method, e.g. one row of a SciPy sparse matrix.

    Returns:
        A 1-D floating-point array holding every element of ``v``, divided by
        its Euclidean norm. An all-zero input is returned as zeros.
    """
    # reshape(-1) flattens whatever width v has, so the function no longer
    # depends on a module-level size constant matching the input.
    r = v.toarray().reshape(-1)
    if not np.issubdtype(r.dtype, np.inexact):
        # In-place true division is rejected for integer arrays; promote first.
        r = r.astype(np.float64)
    z = np.linalg.norm(r)
    if z > 0.0:
        r /= z
    return r

# Dense, unit-length copy of every hashed row, stored alongside the reviews.
feats3 = reviews.copy()
feats3["feats"] = list(map(normarray, hvcounts))

In [None]:
# Project the hashed word-count vectors to two PCA components for plotting.
feats3 = append_pca_columns(feats3, "feats")
# Fixed-seed sample capped at the frame size (reproducible, and safe for frames
# under 2000 rows); only the plotted columns are passed to Altair, so review text
# and the wide feature vectors stay out of the chart data.
(
    alt.Chart(
        feats3[["x", "y", "score"]].sample(n=min(2000, len(feats3)), random_state=0)
    )
    .encode(x="x", y="y", color="score")
    .mark_point()
    .interactive()
)

In [None]:
%%time
data = np.array(list(feats3["feats"]))
clustering = KMeans(n_clusters=10).fit(data)

In [None]:
# Cluster id per row, plus a string copy so it is coloured as a category.
feats3["pred"] = clustering.predict(np.array(list(feats3["feats"])))
feats3["pstr"] = feats3["pred"].map(str)
# Fixed-seed sample capped at the frame size (reproducible, and safe for frames
# under 2000 rows); only the plotted columns are passed to Altair.
(
    alt.Chart(
        feats3[["x", "y", "pstr"]].sample(n=min(2000, len(feats3)), random_state=0)
    )
    .encode(x="x", y="y", color="pstr")
    .mark_point()
    .interactive()
)

In [None]:
# Distance from each row's feature vector to the centre of its assigned cluster.
# Iterating the two columns directly computes the same per-row norm without the
# per-row Series that DataFrame.apply(axis=1) would construct.
feats3["pdist"] = [
    np.linalg.norm(vec - clustering.cluster_centers_[label])
    for vec, label in zip(feats3["feats"], feats3["pred"])
]
# Fixed seed so the preview shows the same rows on every run.
feats3["pdist"].sample(5, random_state=0)

In [None]:
# The 25 rows farthest from their cluster centre, largest distance first.
anomalies = (
    feats3[["pdist", "score", "sentiment", "text"]]
    .sort_values(by="pdist", ascending=False)
    .head(25)
)
showtxt(anomalies)