# ABOUT:
- this notebook prepares raw data for annotation
    - each sample of raw data contains one sentence that may or may not contain negation
    - e.g Rick is not here. 
        - contains negation "not"
- these raw data will be loaded into HayStack Annotation Tool for annotation

In [1]:
import os
import pandas as pd

### read negation sentences:
- these sentences all have negation and are derived from:
    - https://www.learngrammar.net/a/examples-of-negation-using-negative-adjectives-adverbs


In [2]:
with open(r"C:\Users\tanch\Documents\NTU\NTU Year 3\Sem 1\CZ4045 Natural Language Processing\Assignment 1\local\data\negation.txt",encoding = "utf8") as f:
    lines = f.readlines()

In [3]:
docs = pd.DataFrame({"document_text":lines})
docs.head(10)

Unnamed: 0,document_text
0,Rick is not here.\n
1,Peter has no books.\n
2,Sam has never been there.\n
3,John did nothing for this project.\n
4,Neither I nor you attended the program.\n
5,None of us liked the movie.\n
6,Pam has rarely cooked any food.\n
7,Richard is buying unnecessary things.\n
8,Rock is not sure about it.\n
9,Patrick has no knowledge about it.\n


### read review data

In [4]:
# read data
import json
data_path = 'C:\\Users\\tanch\\Documents\\NTU\\NTU Year 3\\Sem 1\\CZ4045 Natural Language Processing\\Assignment 1\\local\\data\\reviewSelected100\\reviewSelected100.json'
reviews = []
with open(data_path,"r") as f:
    for l in f.readlines():
        reviews.append(json.loads(l))

In [5]:
reviews_df = pd.DataFrame(reviews)
reviews_df[["review_id","text"]].head()

Unnamed: 0,review_id,text
0,8aoJJdKEO3ypoZNszpPu7Q,We had my Mother's Birthday Party here on 10/2...
1,J5NOCLdhuhor7USRhtYZ8w,Good Korean grill near Eaton Centre. The marin...
2,PXiLWAYRt3xnHaJ8MB4rzw,Was recommended to try this place by few peopl...
3,VrLarvxZYJm74yAqtpe9PQ,Ambience: Would not expect something this nice...
4,C1CUpidlVFprUCkApqzCmA,Absolutely the WORST pool company that I have ...


# Text Segmentation:
- the reviews are too long; we will convert them to sentences using spacy
- we will assume each sentence has 0 or 1 negation only

In [6]:
# spacy for decomposing to sentences
from spacy.lang.en import English
nlp = English()
nlp.add_pipe("sentencizer")
def decompose_to_sentences(text):
    return [s.text.strip() for s in nlp(text).sents]

In [7]:
review_sentences = []
n = 100
for review in reviews_df.sample(n).text:
    review_sentences.extend(decompose_to_sentences(review))

In [8]:
docs = pd.concat([docs,pd.DataFrame({"document_text":review_sentences})]).reset_index(drop=True)

In [9]:
docs.head(20)

Unnamed: 0,document_text
0,Rick is not here.\n
1,Peter has no books.\n
2,Sam has never been there.\n
3,John did nothing for this project.\n
4,Neither I nor you attended the program.\n
5,None of us liked the movie.\n
6,Pam has rarely cooked any food.\n
7,Richard is buying unnecessary things.\n
8,Rock is not sure about it.\n
9,Patrick has no knowledge about it.\n


In [12]:
docs.to_csv(r"C:\Users\tanch\Documents\NTU\NTU Year 3\Sem 1\CZ4045 Natural Language Processing\Assignment 1\local\data\docs.csv",index_label = "document_identifier")