<a href="https://colab.research.google.com/github/franciscojuarez653/create-your-own-adventure/blob/master/nlp_nonsense_at.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas



In [0]:
import sys, os, re, json
from collections import Counter, OrderedDict
import itertools
from numpy import *
import pandas as pd
# based on http://cs229.stanford.edu/proj2014/Ian%20Tenney,%20A%20General-Purpose%20Sentence-Level%20Nonsense%20Detector.pdf
# and https://github.com/iftenney/nlp-nonsense

In [0]:
# this isn't needed for this demo. but it seemed useful if you wanted to extend the data importing features
class Dataset(object):
    """Dataset object to encapsulate training or test set."""

    df_master = None    # Master featureset
    df_pos = None       # POS distributional
    df_pos_norm = None  # POS distributional, normed
    df_ppos = None      # POS positional

    train = None    # SplitDataset
    test = None     # SplitDataset

    int_to_label = None     # map y -> label
    col_to_feature = None   # map j -> f_name

    def __init__(self, filename):
        records = []
        with open(filename) as fd:
            for line in fd:
                records.append(json.loads(line))

        df = pd.DataFrame.from_records(records, index='__ID__')
        # df['text'] = df.word.map(lambda l: " ".join(l))
        # Handle nested JSON
        df['__LABEL__'] = df['__LABEL__'].map(parse_labels)

        # Disambiguate and binarize labels
        df["__LABELS__"] = df["__LABEL__"]
        df["__LABEL__"] = df["__LABEL__"].map(lambda l: l[0] if len(set(l)) == 1 else None)
        df["__LABEL_BIN__"] = df["__LABEL__"].map(binarize_label)

        # Count sentences and unambiguous labels
        nunamb = len(df[df.__LABEL__.notnull()])
        print( "%d unambiguous labels" % nunamb)
        nsentence = len(df[df.__LABEL_BIN__ == "-SENTENCE-"])
        print( "%d sentences (%.02f%%)" % (nsentence, 100*nsentence/(1.0*nunamb)) )

        # Make basic features
        df = make_basic_features(df)

        self.df_master = df

        print( df.shape)
        # for c in df.columns:
        #     print c

    def make_pos_features(self):
        # Distributional
        pdf = get_pos_counts(self.df_master)

        # L1-normalized distributional
        pdf_norm = pdf.divide(pdf.sum(axis=0))

        # Positional (begin,end token indicators)
        ppdf = get_pos_positionals(self.df_master)

        self.df_pos = pdf
        self.df_pos_norm = pdf_norm
        self.df_ppos = ppdf

    def to_sklearn(self, level=3, splitat=9000, label_col="__LABEL_BIN__"):
        data = self.df_master
        if level >= 2: # merge in normed POS distributional
            data = data.merge(self.df_pos_norm, how='outer',
                              left_index=True, right_index=True)
        if level >= 3:
            data = data.merge(self.df_ppos, how='outer',
                              left_index=True, right_index=True)

        # Skip nulls
        # label_col = "__LABEL_BIN__"
        data = data[data[label_col].notnull()]
        Xy_idx = data.index

        X, y, int_to_label, col_to_feature = dataframe_to_xy(data,
                                                            r"f_.*",
                                                            label_col=label_col)
        print ("X: " + str(X.shape))
        print ("y: " + str(int_to_label))
        print ("Features: " + ", ".join(col_to_feature.values()))

        self.train = SplitDataset(self, X[:splitat], y[:splitat], Xy_idx[:splitat])
        self.test = SplitDataset(self, X[splitat:], y[splitat:], Xy_idx[splitat:])

        self.int_to_label = int_to_label
        self.col_to_feature = col_to_feature

In [0]:
def make_basic_features(df):
    """Compute basic features."""

    df['f_nchars'] = df['__TEXT__'].map(len)
    df['f_nwords'] = df['word'].map(len)

    punct_counter = lambda s: sum(1 for c in s
                                  if (not c.isalnum())
                                      and not c in
                                        [" ", "\t"])
    df['f_npunct'] = df['__TEXT__'].map(punct_counter)
    df['f_rpunct'] = df['f_npunct'] / df['f_nchars']

    df['f_ndigit'] = df['__TEXT__'].map(lambda s: sum(1 for c in s
                                  if c.isdigit()))
    df['f_rdigit'] = df['f_ndigit'] / df['f_nchars']

    upper_counter = lambda s: sum(1 for c in s if c.isupper())
    df['f_nupper'] = df['__TEXT__'].map(upper_counter)
    df['f_rupper'] = df['f_nupper'] / df['f_nchars']

    # fraction named entities recognized (ner) -- 'O' is not recognized
    df['f_nner'] = df['ner'].map(lambda ts: sum(1 for t in ts
                                              if t != 'O'))
    df['f_rner'] = df['f_nner'] / df['f_nwords']

    # Check standard sentence pattern:
    # if starts with capital, ends with .?!
    def check_sentence_pattern(s):
        ss = s.strip(r"""`"'""").strip()
        return s[0].isupper() and (s[-1] in '.?!\n')
    df['f_sentence_pattern'] = df['__TEXT__'].map(check_sentence_pattern)

    # Normalize any LM features
    # by dividing logscore by number of words
    lm_cols = {c:re.sub("_lmscore_", "_lmscore_norm_",c)
               for c in df.columns if c.startswith("f_lmscore")}
    for c,cnew in lm_cols.items():
        df[cnew] = df[c] / df['f_nwords']

    return df
    
    

In [9]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving amazon_alexa.tsv to amazon_alexa.tsv
User uploaded file "amazon_alexa.tsv" with length 514752 bytes


In [10]:
df = pd.read_csv("./amazon_alexa.tsv", sep="\t")
raw = list(df["verified_reviews"])
raw

['Love my Echo!',
 'Loved it!',
 'Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.',
 'I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.',
 'Music',
 'I received the echo as a gift. I needed another Bluetooth or something to play music easily accessible, and found this smart speaker. Can’t wait to see what else it can do.',
 'Without having a cellphone, I cannot use many of her features. I have an iPad but do not see that of any use.  It IS a great alarm.  If u r almost deaf, you can hear her alarm in the bedroom from out in the living room, so that is reason enough to keep her.It is fun to ask random questions to hear her response.  She does not seem to be very smartbon politics yet.',
 "I think this is the 5th one I've

In [16]:
!pip install pandas
!pip install spacy



In [19]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 100.7MB/s 
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25l- \ | done
[?25hSuccessfully installed en-core-web-sm-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [0]:
import spacy
import en_core_web_sm # or en_core_web_lg if need tokenization.
nlp = en_core_web_sm.load()


In [0]:
#x = nlp(raw[-1] + 'fun splot gkesl furd..')
#print(x,list(x.ents),x.sentiment,list(x.noun_chunks))
#print([i.pos_ for i in x])
#print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)
#print(dir(x))

data = []
for idx,text in enumerate(raw):
    doc = nlp(text)
    row = {"__TEXT__": text}
    row['ner'] = [i.pos_ for i in doc]
    row['sentiment'] = doc.sentiment
    row['word'] = [i.text for i in doc]
    data.append(row)
    
df = pd.DataFrame(data)
#df = pd.DataFrame([{"__TEXT__":i, "word": i.split(), 'ner': docs.get(i,[])} for i in raw])

In [0]:
df = make_basic_features(df)

In [24]:
print(df)

                                               __TEXT__  \
0                                         Love my Echo!   
1                                             Loved it!   
2     Sometimes while playing a game, you can answer...   
3     I have had a lot of fun with this thing. My 4 ...   
4                                                 Music   
5     I received the echo as a gift. I needed anothe...   
6     Without having a cellphone, I cannot use many ...   
7     I think this is the 5th one I've purchased. I'...   
8                                           looks great   
9     Love it! I’ve listened to songs I haven’t hear...   
10    I sent it to my 85 year old Dad, and he talks ...   
11    I love it! Learning knew things with it eveyda...   
12    I purchased this for my mother who is having k...   
13                                   Love, Love, Love!!   
14                             Just what I expected....   
15                            I love it, wife hates it. 

In [25]:
"""next: Apply rules
1 Baseline sentence heuristic: first letter is Capitalized, and line ends with one of .?! (1 feature).
2 Number of characters, words, punctuation, digits, and named entities (from Stanford CoreNLP NER tagger), and normalized versions by text length (10 features).
3 Part-of-speech distributional tags: (# / # words) for each Penn treebank tag (45 features).
4 Indicators for the part of speech tag of the first and last token in the text (45x2 = 90 features).
5 Language model raw score (s lm = log p(text)) and normalized score (s¯lm = s lm / # words) (2 features).
"""
cleaned = {}
dirty = {}
bad = 0
for idx,row in df.iterrows():
    if row['f_sentence_pattern'] and (row['f_npunct'] + row['f_nwords']) > 5 and row['f_nner'] > 0:
        cleaned[idx] = row['__TEXT__']
    else:
        dirty[idx] = row['__TEXT__']
        bad += 1
print(bad,len(df))
from pprint import pprint as pp

1219 3150


In [0]:
import sys, os, re, json
from collections import Counter, OrderedDict
import itertools
from numpy import *
import pandas as pd

def parse_labels(ls):
    # Handle badly-serialized labels from java code
    try: return tuple(json.loads(ls))
    except ValueError as e: return (ls,)

def binarize_label(l):
    if l == None: return l
    if "SENTENCE" in l: return "-SENTENCE-"
    else: return "-OTHER-"

from sklearn import preprocessing
class SplitDataset(object):
  master = None   # master Dataset object
  Xy_idx = None   # index into df_*
  X = None        # data vectors
  y = None        # labels
  transformer = None  # preprocessing transformer

  def __init__(self, master, X, y, Xy_idx):
    self.master = master
    self.X = X
    self.y = y
    self.Xy_idx = Xy_idx

  def preprocess(self, transformer=None):
    if transformer != None:
      self.transformer = transformer
      self.X = self.transformer.transform(self.X)
    else: # standard preprocessing
      self.transformer = preprocessing.StandardScaler()
      self.X = self.transformer.fit_transform(self.X)

In [26]:
pp(cleaned)

{2: 'Sometimes while playing a game, you can answer a question correctly but '
    'Alexa says you got it wrong and answers the same as you.  I like being '
    'able to turn lights on and off while away from home.',
 3: 'I have had a lot of fun with this thing. My 4 yr old learns about '
    'dinosaurs, i control the lights and play games like categories. Has nice '
    'sound when playing music as well.',
 5: 'I received the echo as a gift. I needed another Bluetooth or something to '
    'play music easily accessible, and found this smart speaker. Can’t wait to '
    'see what else it can do.',
 6: 'Without having a cellphone, I cannot use many of her features. I have an '
    'iPad but do not see that of any use.  It IS a great alarm.  If u r almost '
    'deaf, you can hear her alarm in the bedroom from out in the living room, '
    'so that is reason enough to keep her.It is fun to ask random questions to '
    'hear her response.  She does not seem to be very smartbon politics y

In [27]:
pp(dirty)

{0: 'Love my Echo!',
 1: 'Loved it!',
 4: 'Music',
 8: 'looks great',
 11: 'I love it! Learning knew things with it eveyday! Still figuring out how '
     "everything works but so far it's been easy to use and understand. She "
     'does make me laugh at times',
 18: 'We love the size of the 2nd generation echo. Still needs a little '
     'improvement on sound',
 19: 'I liked the original Echo. This is the same but shorter and with greater '
     "fabric/color choices. I miss the volume ring on top, now it's just the "
     'plus/minus buttons. Not a big deal but the ring w as comforting. :) '
     'Other than that, well I do like the use of a standard USB charger /port '
     'instead of the previous round pin. Other than that, I guess it sounds '
     'the same, seems to work the same, still answers to Alexa/Echo/Computer. '
     "So what's not to like? :)",
 23: 'I love it. It plays my sleep sounds immediately when I ask',
 25: 'Amazing product',
 29: 'Just like the other one',
 3

In [38]:
datafile = "./amazon_alexa.tsv"
# datafile = "data/mturk-prod-lm.json.tagged.json"
# datafile = "data/mturk-prod.json.tagged.json" # master
dataset = Dataset(datafile)
len(dataset.df_master)
import itertools
# ls = collections.Counter(itertools.chain.from_iterable(dataset.df_master.__LABELS__))
ls = collections.Counter(dataset.df_master.__LABEL__)
norm = 1.0*sum(v for k,v in ls.items() if not k == None)
ls = {k:v/norm for k,v in ls.items() if not k == None}

    
dataset.make_pos_features()
dataset.to_sklearn(level=3, splitat=4000)

dataset.train.preprocess()
dataset.test.preprocess(dataset.train.transformer)

# source = dataset.train
source = cleaned.test

# Baseline heuristic
hidx = [i for (i,f) in dataset.col_to_feature.items() if f == 'f_sentence_pattern'][0]
ypred = source.X[:,hidx] >= 0
y = source.y
res = classifier.standard_scorefunc(y, ypred)
display(res)

R_baseline = res['rec']
P_baseline = res['pre']

JSONDecodeError: ignored