In [0]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

In [0]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [0]:
# Load the London points-of-interest table and the reviews table from the
# CSV exports expected in the notebook's working directory.
poi_csv = 'tourpedia_London_poi.csv'
reviews_csv = 'tourpedia_London_reviews.csv'
places_df = pd.read_csv(poi_csv)
reviews_df = pd.read_csv(reviews_csv)

In [164]:
# Preview the first rows of the raw reviews table.
reviews_df.head()

Unnamed: 0,language,text,time,id,originalId,details
0,en,Awesome food,2011-05-06,35306,4ac518bef964a52005a320e3,http://tour-pedia.org/api/getPlaceDetails?id=3...
1,en,Awesome food,2011-05-06,35306,4ac518bef964a52005a320e3,http://tour-pedia.org/api/getPlaceDetails?id=3...
2,en,Why is this marked as a medical center? Haha i...,2011-12-29,35306,4ac518bef964a52005a320e3,http://tour-pedia.org/api/getPlaceDetails?id=3...
3,ca,Its a pub / restaurant,2012-08-03,35306,4ac518bef964a52005a320e3,http://tour-pedia.org/api/getPlaceDetails?id=3...
4,en,Sadly they got the order badly wrong. They mad...,2013-09-24,35306,4ac518bef964a52005a320e3,http://tour-pedia.org/api/getPlaceDetails?id=3...


In [165]:
# Summarise the columns: dtypes and non-null counts per column.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10934 entries, 0 to 10933
Data columns (total 6 columns):
language      10934 non-null object
text          10934 non-null object
time          10934 non-null object
id            10934 non-null int64
originalId    10934 non-null object
details       10934 non-null object
dtypes: int64(1), object(5)
memory usage: 512.6+ KB


In [166]:
# Column labels of the reviews table (same Index that DataFrame.keys() returns).
reviews_df.columns

Index(['language', 'text', 'time', 'id', 'originalId', 'details'], dtype='object')

## Remove all non-English review rows (keep only rows where language=en)

In [167]:
# Distinct values of the language column before any filtering.
# In the recorded run this includes many non-'en' codes and a literal 'False' entry.
reviews_df['language'].unique()

array(['en', 'ca', 'nl', 'es', 'ja', 'it', 'pt', 'ru', 'af', 'ar', 'ro',
       'fr', 'no', 'ko', 'da', 'lt', 'lv', 'de', 'tr', 'pl', 'vi', 'hr',
       'et', 'eu', 'id', 'so', 'fa', 'False', 'tl', 'bg', 'sv', 'sq',
       'sw', 'cs', 'fi', 'sl', 'hu', 'th', 'zh-cn', 'sk', 'el'],
      dtype=object)

In [168]:
# (rows, columns) before filtering, for comparison with the post-filter shape.
reviews_df.shape

(10934, 6)

In [169]:
# Keep only the English-language reviews.
# The explicit .copy() makes the filtered result an independent DataFrame, so
# the later column assignments (reviews_df['clean_reviews'] = ...) do not
# trigger SettingWithCopyWarning on a slice of the unfiltered frame.
is_english = reviews_df['language'] == 'en'
reviews_df = reviews_df[is_english].copy()
reviews_df.shape

(10127, 6)

In [170]:
# Sanity check after filtering: 'en' should be the only language left.
reviews_df['language'].unique()

array(['en'], dtype=object)

## Convert text to lowercase

In [171]:
# Lower-casing demo on a single sentence; `sent` is reused by the later
# tokenization and stop-word demo cells.
sent = 'I wish I could fly free high in the sky'.lower()
sent

'i wish i could fly free high in the sky'

In [0]:
# Lower-case every review into a new 'clean_reviews' column using pandas'
# vectorised string accessor instead of a per-row Python lambda. Unlike the
# lambda, .str.lower() passes missing values through as NaN rather than
# raising AttributeError.
reviews_df['clean_reviews'] = reviews_df['text'].str.lower()

In [173]:
# Peek at the lower-cased reviews.
reviews_df['clean_reviews'].head()

0                                         awesome food
1                                         awesome food
2    why is this marked as a medical center? haha i...
4    sadly they got the order badly wrong. they mad...
5    did u know the author of the windows in the wi...
Name: clean_reviews, dtype: object

## Remove numbers from sentence

In [174]:
# Digit-removal demo: keep every character that is not a digit.
# Only the digits are dropped, so the spaces around a removed number remain.
text = "There was 200 people standing right next to me at 2pm."
''.join(filter(lambda ch: not ch.isdigit(), text))

'There was  people standing right next to me at pm.'

## Tokenization (Convert sentences into word tokens)

In [175]:
# Split the lower-cased demo sentence into word tokens with NLTK.
# NOTE: this rebinds `sent` from a string to a list of tokens, so re-running
# this cell on its own would pass a list to word_tokenize; re-run the
# lower-casing demo cell first.
sent = word_tokenize(sent)
sent

['i', 'wish', 'i', 'could', 'fly', 'free', 'high', 'in', 'the', 'sky']

In [176]:
# Demo: a \w+ pattern drops punctuation but keeps digits attached to
# neighbouring letters (see '12937Onward' in the output).
tokenizer = RegexpTokenizer(pattern=r'\w+')
tokenizer.tokenize('Eighty-seven miles to go, yet.$$  12937Onward!')

['Eighty', 'seven', 'miles', 'to', 'go', 'yet', '12937Onward']

In [177]:
# Demo: restricting the pattern to ASCII letters also strips the digits,
# leaving only alphabetic tokens.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')
tokenizer.tokenize('Eighty-seven miles to go, yet.$$  %%) ~`12937Onward!')


['Eighty', 'seven', 'miles', 'to', 'go', 'yet', 'Onward']

In [0]:
# Turn each lower-cased review into a list of alphabetic tokens; punctuation,
# special characters and digits never match the pattern and are discarded.
# NOTE: the column is overwritten in place, so re-running this cell would feed
# token lists (not strings) to the tokenizer.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')
reviews_df['clean_reviews'] = reviews_df['clean_reviews'].apply(tokenizer.tokenize)

In [179]:
# Peek at the tokenised reviews (each entry is now a list of words).
reviews_df['clean_reviews'].head()

0                                      [awesome, food]
1                                      [awesome, food]
2    [why, is, this, marked, as, a, medical, center...
4    [sadly, they, got, the, order, badly, wrong, t...
5    [did, u, know, the, author, of, the, windows, ...
Name: clean_reviews, dtype: object

## Remove Stopwords

In [180]:
# NLTK's English stop-word list, extended with the extra token 'u'.
stop_words = stopwords.words('english') + ['u']
# Show the first few entries.
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [181]:
# Stop-word removal demo on the tokenised sentence: keep only the tokens that
# are not in stop_words.
sent = list(filter(lambda word: word not in stop_words, sent))
sent

['wish', 'could', 'fly', 'free', 'high', 'sky']

In [0]:
# Drop stop words from every tokenised review.
# Membership is tested against a set built once from stop_words, so each
# lookup is O(1) instead of a linear scan of the list for every token; the
# surviving tokens and their order are unchanged.
stop_word_set = set(stop_words)
reviews_df['clean_reviews'] = reviews_df['clean_reviews'].apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

In [183]:
# Peek at the reviews after stop-word removal.
reviews_df['clean_reviews'].head()

0                                      [awesome, food]
1                                      [awesome, food]
2          [marked, medical, center, haha, restaurant]
4    [sadly, got, order, badly, wrong, made, offeri...
5    [know, author, windows, willows, secretary, ba...
Name: clean_reviews, dtype: object

## Lemmatization (Let's not use it for this project!)

In [0]:
# Create a WordNet-based lemmatizer. This is a demo only: per the section
# heading, lemmatization is not applied to the reviews in this project.
lemmatizer = WordNetLemmatizer()
# Lemmatize a single word (the recorded output is 'stripe').
lemmatizer.lemmatize("stripes")

'stripe'

https://www.machinelearningplus.com/nlp/lemmatization-examples-python/