## Natural Language Processing Template

***

## Project Description

The goal of this task is to explore the Yelp data set to get a sense of what the data look like and what their characteristics are.

### Import Libraries

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import json
import random
import re
import string
from string import punctuation


#sets the default autosave frequency in seconds
%autosave 60

# import wordcloud
# from wordcloud import WordCloud, ImageColorGenerator

# SPACY
# import spacy
# from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
# from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
# from nltk.probability import FreqDist
# from nltk.util import pad_sequence
# from nltk.util import bigrams
# from nltk.util import ngrams
# from nltk.util import everygrams
# from nltk.lm.preprocessing import pad_both_ends
# from nltk.lm.preprocessing import flatten
# from nltk.lm import MLE
# from nltk.lm.preprocessing import padded_everygram_pipeline

import textblob
from textblob import TextBlob

# PyCaret
#from pycaret.nlp import *


# Pandas display: never truncate columns, use a very wide console, and
# render floats with two decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 10000),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)
#pd.set_option('display.max_rows',None)

# Seed both the stdlib and NumPy generators so sampling is repeatable.
random.seed(0)
np.random.seed(0)

# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


### Extract Data from JSON

In [None]:
# business = pd.read_json("yelp_academic_dataset_business.json", lines=True)
# business

In [None]:
# buss2 = business.sample(n=1000, random_state=0)
# buss2

In [None]:
# buss2.reset_index(inplace=True, drop=True)

In [None]:
# buss2

In [None]:
#buss2.to_csv("business.csv", index=False)

In [None]:
#checkin = pd.read_json("yelp_academic_dataset_checkin.json", lines=True)

In [None]:
#checkin

In [None]:
# checkin2 = checkin.sample(n=1000, random_state=0)
# checkin2

In [None]:
#checkin2.reset_index(inplace=True, drop=True)

In [None]:
#checkin2

In [None]:
#checkin2.to_csv("checkin.csv", index=False)

In [None]:
#review = pd.read_json("yelp_academic_dataset_review.json", lines=True)

In [None]:
#review

In [None]:
# review2 = review.sample(n=1000, random_state=0)
# review2

In [None]:
#review2.reset_index(inplace=True, drop=True)

In [None]:
#review2

In [None]:
#review2.to_csv("review.csv", index=False)

In [None]:
#tip = pd.read_json("yelp_academic_dataset_tip.json", lines=True)

In [None]:
#tip

In [None]:
# tip2 = tip.sample(n=1000, random_state=0)
# tip2

In [None]:
#tip2.reset_index(inplace=True, drop=True)

In [None]:
#tip2

In [None]:
#tip2.to_csv("tip.csv", index=False)

In [None]:
#user = pd.read_json("yelp_academic_dataset_user.json", lines=True)

In [None]:
#user

In [None]:
# user2 = user.sample(n=1000, random_state=0)
# user2

In [None]:
#user2.reset_index(inplace=True, drop=True)

In [None]:
#user2

In [None]:
#user2.to_csv("user.csv", index=False)

### Data Exploration

In [2]:
# Load review.csv into a DataFrame (presumably the 1,000-row sample written by the
# commented-out extraction cell above -- TODO confirm).
df = pd.read_csv("review.csv")

In [3]:
df.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 7, 'useful': 10, 'cool': 8}",du6KeE54IFbPiXpU3LOd1g,nSFIRge1aJCMcwUqlaF-0g,5,2010-08-01,I have a secret and it's a blockbuster!\n\nI k...,review,De5urGVv1kkiKVJ9bvlQ6w
1,"{'funny': 1, 'useful': 3, 'cool': 0}",uZHynuKHcdM-VSa5yUHrtw,OkAX8ATtZIJvpoOHFIFeQg,5,2009-09-09,I don't understand why there aren't much revie...,review,3KAfAz5xzjop5RmyYLW4Sg
2,"{'funny': 0, 'useful': 0, 'cool': 0}",SEDJTWEzMdqp7UsS1W3KXw,LphhvJCa9sLopZfO6CbpTw,3,2013-11-17,Had the ham and cheese and it was good but def...,review,DBoebGeuz91QAP3tSFYs6w
3,"{'funny': 0, 'useful': 0, 'cool': 0}",5O4x5KS4hpQj0q21-Uh0WQ,iALLfzVXz47QlWUOf3yT-g,5,2012-05-18,"My review concerns the store, in general - Not...",review,U5-VseaV5NWnuh5AW4acJg
4,"{'funny': 1, 'useful': 2, 'cool': 1}",0WRo9NFL10av3eQVMR4Egg,TGJDMSIyeJJvtsNhWfGBzQ,3,2013-10-07,"The ""check in"" area is very confusing. Signs d...",review,6LM_Klmp3hOP0JmsMCKRqQ


In [None]:
df.info()

In [None]:
df.describe(include='all')

In [4]:
# Work on an independent single-column frame holding only the review text,
# so later edits do not touch `df`.
df2 = df.loc[:, ["text"]].copy()

In [5]:
df2.head()

Unnamed: 0,text
0,I have a secret and it's a blockbuster!\n\nI k...
1,I don't understand why there aren't much revie...
2,Had the ham and cheese and it was good but def...
3,"My review concerns the store, in general - Not..."
4,"The ""check in"" area is very confusing. Signs d..."


In [None]:
# Character count of each review. The vectorised .str accessor replaces apply(len)
# and yields NaN for a missing review instead of raising TypeError.
df2["length"] = df2["text"].str.len()

In [None]:
df2

In [None]:
# Whitespace-delimited word count per review: .str.split() with no pattern splits on
# runs of whitespace just like str.split(), and .str.len() counts the resulting tokens.
# Missing reviews yield NaN instead of raising.
df2["word_count"] = df2["text"].str.split().str.len()

In [None]:
df2.head()

In [None]:
df2.describe()

### Cleaning Data

In [6]:
# Translation table that deletes every ASCII punctuation character; built once
# so each call on a string is a single pass instead of a per-character Python loop.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def punctuation_removal(messy_str):
    """Return ``messy_str`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    whitespace, digits, letters and non-ASCII symbols are kept unchanged.

    Parameters
    ----------
    messy_str : str
        Text to clean. Other iterables of strings are still accepted and are
        filtered element by element, as before.

    Returns
    -------
    str
        The input with punctuation characters deleted.

    Raises
    ------
    TypeError
        If ``messy_str`` is neither a string nor an iterable (e.g. NaN).
    """
    if isinstance(messy_str, str):
        return messy_str.translate(_PUNCTUATION_TABLE)
    # Non-str input: keep the original element-wise filtering semantics.
    return ''.join(char for char in messy_str if char not in string.punctuation)

In [7]:
# Strip punctuation from every review
df2['text'] = df2['text'].map(punctuation_removal)

In [8]:
# Lowercase characters (vectorised; a missing review stays NaN instead of raising)
df2["text"] = df2["text"].str.lower()

In [9]:
# Remove digit sequences (e.g. "5", "2013") from the text.
# The pattern is a raw string so "\d" is not an invalid Python string escape, and
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal by
# default, which would leave every digit in place.
df2["text"] = df2["text"].str.replace(r"\d+", "", regex=True)

  df2["text"] = df2["text"].str.replace("\d+","")


In [18]:
# Remove stopwords
# NOTE(review): this cell only builds the stopword list; nothing is removed from
# df2["text"] because the filtering line at the bottom is commented out.
stop = stopwords.words("english")  # NLTK's English stopword list
stop_words = []  # never filled or read in the visible cells


# Disabled draft of the filtering step. As written it passes .apply() a generator
# expression rather than a callable, and it refers to `text_tokens`, which is not
# defined in any visible cell.
#df2["text"] = df2["text"].apply(word for word in text_tokens if not word in stopwords.words() )

TypeError: expected string or bytes-like object

In [None]:
# def remove_special_characters(text):
#     pat = r'[^a-zA-Z0-9]'
#     return re.sub(pat, ' ', text)

In [11]:
df2.head()

Unnamed: 0,text
0,i have a secret and its a blockbuster\n\ni kno...
1,i dont understand why there arent much reviews...
2,had the ham and cheese and it was good but def...
3,my review concerns the store in general not t...
4,the check in area is very confusing signs dont...


In [None]:
# Tokenize Text: turn each cleaned review into a list of word tokens
df2["tokenize_text"] = df2["text"].map(nltk.word_tokenize)

In [None]:
df2

In [10]:
# Write the current df2 to test.csv; index=False leaves the row index out of the file.
df2.to_csv("test.csv", index=False)