In [15]:
import spacy
import pandas as pd
import json
import csv

## 1. Load Raw Dataset

In [2]:
# Parse the raw review dump into memory. Later cells index the result as
# text["reviews"], so the file is expected to hold an object with a "reviews" list.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the load does not
# depend on the platform's default codec.
with open('reviews.json', encoding='utf-8') as f:
    text = json.load(f)

In [3]:
# Peek at the first record to see which fields each review carries.
text["reviews"][0]

{'review_id': 'Q1sbwvVQXV2734tPgoKj4Q',
 'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
 'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
 'stars': 1.0,
 'useful': 6,
 'funny': 1,
 'cool': 0,
 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
 'date': '2013-05-07 04:34:36'}

In [4]:
# Build a single-column frame from the review records, keeping only the
# free-text "text" field of each record.
text_df = pd.DataFrame(
    data=text["reviews"],
    columns=["text"],
)

In [5]:
# Preview the first rows of the text-only frame.
text_df.head()

Unnamed: 0,text
0,Total bill for this horrible service? Over $8G...
1,I *adore* Travis at the Hard Rock's new Kelly ...
2,I have to say that this office really has it t...
3,Went in for a lunch. Steak sandwich was delici...
4,Today was my second out of three sessions I ha...


## 2. Tokenize Raw Data

In [7]:
# Load spaCy's "en_core_web_sm" pipeline; the model package must already be
# installed in the kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Tokenize a subset of the reviews into lists of token strings.
#
# * Despite its name, the result is a pandas Series (one list of tokens per review).
# * `.loc[:2000]` is label-based and inclusive on both ends, so it selects 2001
#   rows (labels 0 through 2000), not 2000.
# * `" ".join(review.split())` collapses every run of whitespace (including
#   newlines) into a single space before tokenizing.
# * Only `token.text` is used, so `nlp.make_doc` (tokenizer only) replaces the full
#   `nlp(...)` call; the tagger/parser/NER components do not alter token boundaries,
#   and skipping them makes this cell far cheaper to re-run.
word_pieces_df = text_df["text"].loc[:2000].apply(
    lambda review: [token.text for token in nlp.make_doc(" ".join(review.split()))]
)

In [12]:
# Display the tokenized reviews (one list of token strings per review).
word_pieces_df

0       [Total, bill, for, this, horrible, service, ?,...
1       [I, *, adore, *, Travis, at, the, Hard, Rock, ...
2       [I, have, to, say, that, this, office, really,...
3       [Went, in, for, a, lunch, ., Steak, sandwich, ...
4       [Today, was, my, second, out, of, three, sessi...
                              ...                        
1996    [It, 's, getting, three, stars, because, of, t...
1997    [I, tried, this, place, once, for, their, poke...
1998    [Been, going, here, the, last, couple, Sundays...
1999    [This, is, hands, down, one, of, the, best, Ja...
2000    [Based, on, the, nam, eighties, place, I, woul...
Name: text, Length: 2001, dtype: object

## 3. Write NER-Format Dataset

In [17]:
# Write the tokens to reviews.csv, one row per token, as (Id, Word, Tag):
#   Id   - position of the review within word_pieces_df (0-based, from enumerate)
#   Word - the token text
#   Tag  - the constant placeholder "O" (presumably the "outside"/no-entity label
#          of an IOB-style tagging scheme; real tags would be filled in later)
#
# newline='' is required by the csv module so that it controls line endings
# itself; without it, extra blank rows appear on platforms that translate "\n".
# The encoding is pinned to UTF-8 so non-ASCII tokens are written the same way
# regardless of the platform's default codec.
with open('reviews.csv', mode='w', newline='', encoding='utf-8') as f:
    review = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    review.writerow(["Id", "Word", "Tag"])
    for i, wp in enumerate(word_pieces_df):
        # Emit all tokens of this review in one call instead of row by row.
        review.writerows([i, w, "O"] for w in wp)