# Phase 2: Break It

In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# install checklist
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

#!python3 -m spacy download en_core_web_sm
# install spacy
import spacy

import json

In [2]:
# Path to the interim training CSV; the loading cell reads column 0 as the
# review text and column 1 as an integer label.
TRAIN = '../data/interim/train.csv'

In [3]:
# Read the raw training reviews (lines=True: one JSON record per line) and
# show the sentiment column.
data_train = pd.read_json('../data/raw/music_reviews_train.json.gz', lines=True)
data_train['sentiment']

0        positive
1        negative
2        negative
3        positive
4        positive
           ...   
99995    positive
99996    negative
99997    positive
99998    negative
99999    negative
Name: sentiment, Length: 100000, dtype: object

In [4]:
# String sentiment labels from the raw JSON, as a plain list.
# They are written to the final output by row index, so they are presumably in
# the same order as the rows of TRAIN — TODO confirm.
labels = data_train['sentiment'].tolist()

In [5]:
input_data = []  # review texts (column 0 of TRAIN)
label = []       # integer labels (column 1 of TRAIN)
n = 100 # set how many training examples to use.
# The csv module expects files opened with newline=''; without it, quoted
# fields that contain line breaks (common in free-text reviews) can be split
# or altered depending on the platform.
with open(TRAIN, mode='r', newline='') as file:
    csvFile = csv.reader(file)
    # Pairing with range(n) stops after n rows without pulling an extra row
    # from the reader.
    for _, lines in zip(range(n), csvFile):
        input_data.append(lines[0])
        label.append(int(lines[1]))
c = len(input_data)  # number of rows actually read (at most n)

In [6]:
def tkn(sentence):
    """Return the tokens found in a sentence, in order of appearance.

    A token is a single quote character (' or "), a run of ASCII letters, or a
    run of the punctuation characters . ? ! : ' ". Anything else (digits,
    whitespace, other symbols) is skipped.
    """
    token_pattern = '[\'\"]|[A-Za-z]+|[.?!:\'\"]+'
    return re.findall(token_pattern, sentence)
    
def splitsies(para):
    """Break a paragraph into tokenized pieces.

    The text is cut at every '.', '?', '!' or ':' and each piece is tokenized
    with tkn(); pieces that yield no tokens are dropped.

    Returns a list of token lists, one per non-empty piece.
    """
    pieces = re.split('[.?!:]', para)
    tokenized = (tkn(piece) for piece in pieces)
    return [tokens for tokens in tokenized if len(tokens) > 0]
    
#splitsies(input_data[0])

In [7]:
# Sanity check: print the first loaded review with its integer label, then
# show the type of the container holding the reviews.
print(f'Sentence 1:\n{input_data[0]}\nLabel: {label[0]}')
type(input_data)

Sentence 1:
Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!
Label: 1


list

In [8]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the 'punkt' data package used by sent_tokenize.
nltk.download('punkt')

# One list of sentences per review.
token_text = [sent_tokenize(review) for review in input_data]
token_text[0]

[nltk_data] Downloading package punkt to /home/dmdequin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Gotta listen to this!',
 'So creative!',
 'Love his music - the words, the message!',
 'Some of my favorite songs on this CD.',
 'I should have bought it years ago!']

In [9]:
# Load spaCy's small English pipeline; it is used further down to turn the
# perturbed text into Doc objects.
nlp = spacy.load('en_core_web_sm')

# Add Typos

In [10]:
# Perturb each whole review with add_typos. Each entry of ret.data pairs the
# original text with its perturbed copy (see the displayed output).
ret = Perturb.perturb(input_data, Perturb.add_typos)
ret.data[0]

['Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!',
 'Gotta listen to this! So creative!  oLve his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!']

In [11]:
# Keep only the perturbed copy (index 1) of each [original, perturbed] pair.
# At this stage each review carries a single typo for the whole paragraph.
typo = [pair[1] for pair in ret.data]
typo[0]

'Gotta listen to this! So creative!  oLve his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!'

In [12]:
# Pad any "sentence" of at most one character with ".." so the typo step that
# follows does not error on it. The sentence lists are updated in place.
for sentences in token_text:
    for position, sentence in enumerate(sentences):
        if len(sentence) <= 1:
            sentences[position] = sentence + '..'

token_text[6]

['Five Stars I love all of his music!', '!..']

In [13]:
# Typo every sentence in every paragraph.
# Seed NumPy's global RNG first: checklist's Perturb.add_typos draws its swap
# positions from np.random, so without a fixed seed every run of the notebook
# writes a different perturbed dataset.
TYPO_SEED = 42
np.random.seed(TYPO_SEED)

typoed = []  # one list of typo-perturbed sentences per review
lala = []    # the same sentences flattened into a single list
for sentences in token_text:
    ret = Perturb.perturb(sentences, Perturb.add_typos)
    # Each entry of ret.data is [original, perturbed]; keep the perturbed one.
    typos = [pair[1] for pair in ret.data]
    lala.extend(typos)
    typoed.append(typos)
typoed[6]

['Five Stars I love all of his msuic!', '.!.']

In [14]:
# Convert back to list of paragraphs WOWWWW
tp = []
for i in range(len(typoed)):
    para = ""
    for j in range(len(typoed[i])):
        para = para + typoed[i][j] + " "
    tp.append(para)
#tp

# POS tag data

https://spacy.io/usage/processing-pipelines

When you call nlp on a text, spaCy will tokenize it and then call each component on the Doc, in order. It then returns the processed Doc that you can work with.

In [15]:
# Run the spaCy pipeline over the typo-perturbed paragraphs; the resulting Doc
# objects are what the checklist punctuation helpers are called on next.
pdata = list(nlp.pipe(tp))
#for doc in nlp.pipe(input_data):
    # Do something with the doc here
#    print([(ent.text, ent.label_) for ent in doc.ents])

# Remove End Punctuation

In [16]:
# Preview on a single Doc: the parsed review (left) next to the text returned
# by strip_punctuation (right).
pdata[0], Perturb.strip_punctuation(pdata[0])

(Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago! ,
 'Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago')

In [17]:
# Apply checklist's punctuation perturbation to every parsed review and keep
# the entry at index 1 of each result (index 0 is the original text).
# NOTE(review): for reviews that do not end in punctuation this variant appears
# to have a '.' appended rather than anything removed — confirm that is intended.
ret = Perturb.perturb(pdata, Perturb.punctuation)
no_punct = [variants[1] for variants in ret.data]
no_punct[0]

'Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago'

# Negation

In [18]:
# Starting point for the negation step: the first review after the typo and
# punctuation perturbations.
no_punct[0]

'Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago'

In [21]:
# Re-parse the punctuation-perturbed strings with spaCy; remove_negation is
# called on these Docs in the next step.
# NOTE: this rebinds `pdata`, replacing the Docs that were built from `tp`.
pdata = list(nlp.pipe(no_punct))
pdata[0]

Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago

In [22]:
# Try to remove a negation from every parsed review.
# remove_negation returns None when it finds nothing to change; in that case
# the original Doc is kept. `count` records how many reviews were actually
# changed (displayed in the next cell).
# Negation doesn't really change the sentiment labels, so they remain the same
nega = []
count = 0
# Iterate over pdata itself rather than a hard-coded range(100), so the loop
# follows whatever number of examples was loaded.
for doc in pdata:
    ret = Perturb.remove_negation(doc)
    if ret is None:  # nothing changed
        nega.append(doc)  # keep the original review
    else:
        nega.append(ret)  # keep the negation-stripped text
        count += 1
len(nega)

100

In [23]:
# Number of reviews for which remove_negation returned a changed text.
count

45

In [24]:
# Show only the first few results instead of dumping every review into the
# notebook output. Unchanged reviews display as spaCy Docs, changed ones as
# quoted strings.
nega[:8]

[Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago,
 Shame Shame This tape can hardly be understood and it was listed for slae as "very good". It' sVERY BAD,
 'Buy theC D. byu the MP3. Buy the C.D oD buy the MP3 album. Download is longer avaliable. But you fin dthat out until after you have purchased it',
 Five Stars I love Dallas Holms music and vioce! ThankY ou!  Iwill be attending all his concerts in heaven, forever,
 Five Stars Great emmories of my early years in Christ .,
 Sweet Derams... I have been listenin gto this album set my ENTIRE life (30 years!) I remember hearing this CD on repeat every night as a toddler, and i tbrought me comfort to sleep, as I always had to have some music or a fan to rest. Well worth it for relaxaiton,
 Five Stars I love all of his msuic,
 Love Talbot music very inspiring and since this was a ... Love Talbot music very inspirign and since this was a gift

In [25]:
"""# negate every sent in every para
neg = []
for i in range(100):
    ppdata = []
    
    # break down into sentences
    sentenced = sent_tokenize(str(no_punct[i]))
    pdata = list(nlp.pipe(sentenced))
    para = []
    # for each sentence in para
    for sent in pdata:
        ret = Perturb.remove_negation(sent)
        if ret == None:  # if nothing changes
            para.append(sent) # append original sentence
        else:
            para.append(ret[1]) # append negated sentence
    ppdata.append(para)
    neg.append(ppdata)
neg"""

'# negate every sent in every para\nneg = []\nfor i in range(100):\n    ppdata = []\n    \n    # break down into sentences\n    sentenced = sent_tokenize(str(no_punct[i]))\n    pdata = list(nlp.pipe(sentenced))\n    para = []\n    # for each sentence in para\n    for sent in pdata:\n        ret = Perturb.remove_negation(sent)\n        if ret == None:  # if nothing changes\n            para.append(sent) # append original sentence\n        else:\n            para.append(ret[1]) # append negated sentence\n    ppdata.append(para)\n    neg.append(ppdata)\nneg'

In [26]:
"""ret = Perturb.perturb(pdata, Perturb.remove_negation)
negated = []
for i in ret.data:
    negated.append(i[1])
negated[0]
len(negated)
pdata[0]"""

'ret = Perturb.perturb(pdata, Perturb.remove_negation)\nnegated = []\nfor i in ret.data:\n    negated.append(i[1])\nnegated[0]\nlen(negated)\npdata[0]'

# Make Final Output

In [30]:
# Build one record per loaded review: the fully perturbed text, the string
# sentiment label at the same index, and a fixed tag naming the perturbations.
output = [
    {
        'reviewText': str(nega[idx]),
        'sentiment': labels[idx],
        'category': "{'typos', 'punct', 'negation'}",
    }
    for idx in range(len(input_data))
]
print(output[0])

{'reviewText': 'Gott alisten to this! So creatiev! Love ihs music - the words, the message! Some of my favorite songs on thsi CD. Is hould have bought it years ago', 'sentiment': 'positive', 'category': "{'typos', 'punct', 'negation'}"}


In [31]:
#with open('../data/predictions/sanna_dump.json', 'w') as file:
#    json.dump(output , fout)

In [32]:
# Serialise every record as one JSON object per line and write them all out.
test_json = []
for record in output:
    test_json.append(json.dumps(record) + '\n')

with open('../data/predictions/dee_dump.json', 'w') as file:
    file.write(''.join(test_json))