### Semantic Analysis of Yelp Businesses' Tips
We will be using a json file that's part of Yelp's distributed user reviews and tips on businesses.
This file contains over 900k user tips for businesses across mostly the U.S(Canada as well).
Each tip has an associated user_id, business_id, timestamp, compliment_count from other users.
GOAL: Build a model that generalizes well the sentiment behind each tip/review
We can leverage tips collected from users to enrich our analysis and recommend what's liked and not liked by users

In [1]:
import os
import nltk
import pandas as pd
import contractions
import string

from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import warnings
from unidecode import unidecode

# Ignore spurious warnings about URL-looking markup
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

pd.set_option('display.max_colwidth', None) 

dataset_path = 'yelp_dataset'

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

# Get English stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/safarifgisa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/safarifgisa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/safarifgisa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
from html.parser import HTMLParser

In [3]:
class TxtStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)
        
    # Normalize special accented characters and 
    # convert them into regular ASCII characters so as to standardize the text across all documents
    def normalize_accented_characters(text):
        text = unicodedata.normalize('NFKD', text.decode('utf-8')).encode('ascii', 'ignore')
        return text
        
def strip_html(text):
    html_stripper = TxtStripper()
    html_stripper.feed(text)
    return html_stripper.get_data() 

In [4]:
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
    normalized_corpus = []
    for index, text in enumerate(corpus):
        text = normalize_accented_characters(text)
        text = html_parser.unescape(text)
        text = strip_html(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
            text = remove_special_characters(text)
            text = remove_stopwords(text)
            if only_text_chars:
                text = keep_text_characters(text)
            if tokenize:
                text = tokenize_text(text)
                normalized_corpus.append(text)
            else:
                normalized_corpus.append(text)
    return normalized_cor

### Load user tips

In [5]:
tips_df = pd.read_json(f'{dataset_path}/yelp_academic_dataset_tip.json', lines=True)
tips_df.shape

(908915, 5)

In [6]:
tips_df.head()

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban sandwiches,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


In [7]:
# 1. Remove HTML tags
def strip_html_tags_and_whitespace(text):
    soup = BeautifulSoup(text, "html.parser")
    if bool(soup.find()):
        [s.extract() for s in soup(['iframe', 'script'])]
        stripped_text = soup.get_text()
        stripped_text = re.sub(r'\r\n?|\n', '\n', stripped_text)
    else:
        stripped_text = text
    return stripped_text

tips_cleaned_df = tips_df.copy()
tips_cleaned_df['tip_cleaned'] = tips_cleaned_df.text.apply(lambda t: strip_html_tags_and_whitespace(t))

## Text processing
### Remove multiple whitespace
We remove all leading and trailing whitespace and replace multiple whitespace with just a single whitespace. Replace '\t', '\n', '\r' with one whitespace.

In [8]:
# 2. Trim and collapse multiple whitespaces
tips_cleaned_df['tip_cleaned'] = tips_cleaned_df.tip_cleaned.str.replace(r"\s+", " ", regex=True)
tips_cleaned_df['tip_cleaned'] = tips_cleaned_df['tip_cleaned'].str.strip()

In [9]:
tips_cleaned_df[tips_cleaned_df.text.str.contains('\n')].head(5)

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_cleaned
79,bsNS8tvMDn9ntB8OP3rNsw,i1A9_CvPb0SZ_5nTddCEgA,"Great hoagies, great bread! Cutlets....meh\n\nMy go to sandwich: A Cut Above Italian w/ oil/vinegar, l/t/o, hot peppers, sweet peppers, & pickles",2014-03-13 16:39:55,0,"Great hoagies, great bread! Cutlets....meh My go to sandwich: A Cut Above Italian w/ oil/vinegar, l/t/o, hot peppers, sweet peppers, & pickles"
106,2GqQGWRmasX8qDuG7CDiWg,pTMi7h7JMtwSsijdx_BPDg,Probably the best pizza in the area.\nOn Wednesday 4-8 the large round pies are half price,2017-06-22 12:04:47,0,Probably the best pizza in the area. On Wednesday 4-8 the large round pies are half price
112,9r0RoGmf7211qmiLZsgYeg,WMkiheTT-8kRslImVLWMVw,"The all local beers is a def plus!\nI had the garlic shrimp with garlic bread and my wife had the duck salad. The portions are decent and the duck was excellent, nice crispy but not burnt skin and the meat was perfect not overdone. My shrimp was good, a bit over seasoned, however not too much, nice size pieces and really good with the sauce. I would def go back, it's worth a look!",2013-06-20 23:50:52,0,"The all local beers is a def plus! I had the garlic shrimp with garlic bread and my wife had the duck salad. The portions are decent and the duck was excellent, nice crispy but not burnt skin and the meat was perfect not overdone. My shrimp was good, a bit over seasoned, however not too much, nice size pieces and really good with the sauce. I would def go back, it's worth a look!"
175,gYeiI_21LzFWidjHbNkiyQ,ikONTzFKwachQtFtoMGeIQ,Pint and pizza slice for\n$5.95 is the way to go!,2015-05-23 23:52:44,0,Pint and pizza slice for $5.95 is the way to go!
191,kKPbWlBjpSL3Qzvtn_AaPw,j4kYliTkKf7k-oDqVgyXYA,Mandrakes & Tree-nuts\nCollege Football Championship\nBig Lights of Texas\n\n#NOLA_Haiku haiku\n#OSUvsORE #Buckeyes #NCAA\n#NationalChampionship\n#THEOhioStateUniversity\n#ATTStadium #OSUBuckeyes,2015-01-13 03:44:02,0,Mandrakes & Tree-nuts College Football Championship Big Lights of Texas #NOLA_Haiku haiku #OSUvsORE #Buckeyes #NCAA #NationalChampionship #THEOhioStateUniversity #ATTStadium #OSUBuckeyes


#### Replaced accented characters
This step is recommended when using traditional ML NLP models, where as Deep Leearning models or embeddings would be fine without this step. We'll use it for now! (e.g "café" → "cafe")

In [10]:
# 3. Normalize accented characters
tips_cleaned_df.tip_cleaned = tips_cleaned_df.tip_cleaned.apply(unidecode)

#### 4. Contraction Expansion
The next step is to expand contractions, meaning words like 'He's' -> 'He is', 'Don't' -> 'do not'. We need consistency and uniformity in our tokens which helps during lemmatization and interpreting negations when we get to extracting meaning and topics from user reviews.

In [11]:
tips_cleaned_df['tip_cleaned'] = tips_cleaned_df['tip_cleaned'].apply(lambda tip: contractions.fix(tip))
tips_cleaned_df.head(1)

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_cleaned
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0,Avengers time with the ladies.


#### Lower casing
Next step is to lower all text to help with tokenization and other downstream processes

In [12]:
# Lower all tips text
tips_cleaned_df['tip_cleaned'] = tips_cleaned_df['tip_cleaned'].str.lower()
tips_cleaned_df.head()

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_cleaned
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0,avengers time with the ladies.
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban sandwiches,2013-02-05 18:35:10,0,they have lots of good deserts and tasty cuban sandwiches
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0,it is open even when you think it is not
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0,very decent fried chicken
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0,appetizers.. platter special for lunch


#### Tokenization
I am interested in more nuanced, or sentence theme extraction, thus keeping sentence tokens. Word tokenization is performed on each sentence as opposed to the whole tip. Extracting words from user reviews is what we call Tokenization. 

In [13]:
# Word tokenizer
tips_cleaned_df['sent_tokens'] = tips_cleaned_df.tip_cleaned.apply(lambda t: sent_tokenize(t))
tips_cleaned_df['sent_word_tokens'] = tips_cleaned_df.sent_tokens.apply(lambda sents: [word_tokenize(sent) for sent in sents])

tips_cleaned_df.head(20)

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_cleaned,sent_tokens,sent_word_tokens
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0,avengers time with the ladies.,[avengers time with the ladies.],"[[avengers, time, with, the, ladies, .]]"
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban sandwiches,2013-02-05 18:35:10,0,they have lots of good deserts and tasty cuban sandwiches,[they have lots of good deserts and tasty cuban sandwiches],"[[they, have, lots, of, good, deserts, and, tasty, cuban, sandwiches]]"
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0,it is open even when you think it is not,[it is open even when you think it is not],"[[it, is, open, even, when, you, think, it, is, not]]"
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0,very decent fried chicken,[very decent fried chicken],"[[very, decent, fried, chicken]]"
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0,appetizers.. platter special for lunch,[appetizers.. platter special for lunch],"[[appetizers, .., platter, special, for, lunch]]"
5,trf3Qcz8qvCDKXiTgjUcEg,7Rm9Ba50bw23KTA8RedZYg,"Chili Cup + Single Cheeseburger with onion, pickle, and relish + Vanilla Coca-Cola...so far.",2012-03-13 04:00:52,0,"chili cup + single cheeseburger with onion, pickle, and relish + vanilla coca-cola...so far.","[chili cup + single cheeseburger with onion, pickle, and relish + vanilla coca-cola...so far.]","[[chili, cup, +, single, cheeseburger, with, onion, ,, pickle, ,, and, relish, +, vanilla, coca-cola, ..., so, far, .]]"
6,SMGAlRjyfuYu-c-22zIyOg,kH-0iXqkL7b8UXNpguBMKg,"Saturday, Dec 7th 2013, ride Patco's Silver Sleigh w/ Santa & his elves on a decorated train into Center City. Trains leave from Lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. Great for kids!",2013-12-03 23:42:15,0,"saturday, dec 7th 2013, ride patco's silver sleigh w/ santa & his elves on a decorated train into center city. trains leave from lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. great for kids!","[saturday, dec 7th 2013, ride patco's silver sleigh w/ santa & his elves on a decorated train into center city., trains leave from lindenwold at 10am, 11:15am, & 12:30pm, and make all stops., great for kids!]","[[saturday, ,, dec, 7th, 2013, ,, ride, patco, 's, silver, sleigh, w/, santa, &, his, elves, on, a, decorated, train, into, center, city, .], [trains, leave, from, lindenwold, at, 10am, ,, 11:15am, ,, &, 12:30pm, ,, and, make, all, stops, .], [great, for, kids, !]]"
7,YVBB9g23nuVJ0u44zK0pSA,jtri188kuhe_AuEOJ51U_A,This is probably the best place in the cool Springs area to watch a game and eat,2016-11-22 22:14:58,0,this is probably the best place in the cool springs area to watch a game and eat,[this is probably the best place in the cool springs area to watch a game and eat],"[[this, is, probably, the, best, place, in, the, cool, springs, area, to, watch, a, game, and, eat]]"
8,VL12EhEdT4OWqGq0nIqkzw,xODBZmX4EmlVvbqtKN7YKg,Tacos,2012-07-27 01:48:24,0,tacos,[tacos],[[tacos]]
9,4ay-fdVks5WMerYL_htkGQ,pICJRcyqW1cF96Q3XhLSbw,Starbucks substitute in boring downtown Tampa. Ugh. Never again!,2012-06-09 22:57:04,0,starbucks substitute in boring downtown tampa. ugh. never again!,"[starbucks substitute in boring downtown tampa., ugh., never again!]","[[starbucks, substitute, in, boring, downtown, tampa, .], [ugh, .], [never, again, !]]"


#### Lemmatize tokens
Next step, is to compute lemmas of the clean tokens we created from each user review. Lemmatization finds the base form, as opposed to stemming which finds the root form of a word. The meaning of a word is important to semantic analysis, therefore, we use POS tags to enhance lemmatization by identifying the nature of the word.

In [16]:
# Map POS tags (nltk) to WordNet POS tags
def map_to_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        # Default
        return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

def lemmatize_sent_word_tagged_tokens(sent_word_tokens):
    lemmatized_sentences = []
    for sent in sent_word_tokens:
        pos_tags = pos_tag(sent)
        lemmas = (lemmatizer.lemmatize(token, map_to_wordnet_pos(token)) for token, tag in pos_tags)
        lemmatized_sentences.append(list(lemmas))
    return lemmatized_sentences

# Lemmatize sent_word_tokens
tips_cleaned_df['lemmas'] = tips_cleaned_df.sent_word_tokens.apply(lemmatize_sent_word_tagged_tokens)

In [17]:
tips_cleaned_df.head()

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_cleaned,sent_tokens,sent_word_tokens,lemmas
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0,avengers time with the ladies.,[avengers time with the ladies.],"[[avengers, time, with, the, ladies, .]]","[[avenger, time, with, the, lady, .]]"
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban sandwiches,2013-02-05 18:35:10,0,they have lots of good deserts and tasty cuban sandwiches,[they have lots of good deserts and tasty cuban sandwiches],"[[they, have, lots, of, good, deserts, and, tasty, cuban, sandwiches]]","[[they, have, lot, of, good, desert, and, tasty, cuban, sandwich]]"
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0,it is open even when you think it is not,[it is open even when you think it is not],"[[it, is, open, even, when, you, think, it, is, not]]","[[it, is, open, even, when, you, think, it, is, not]]"
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0,very decent fried chicken,[very decent fried chicken],"[[very, decent, fried, chicken]]","[[very, decent, fried, chicken]]"
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0,appetizers.. platter special for lunch,[appetizers.. platter special for lunch],"[[appetizers, .., platter, special, for, lunch]]","[[appetizer, .., platter, special, for, lunch]]"


#### Stopword removal
Lemmas produced from the step above, contain unimportant words and punctuation that isn't important to our analysis. Therefore, we need to remove careful all words in the set of stopwords from nltk. However, negative is an important subset that we need.

In [22]:
# keep these negations
negations = {'not', 'no', 'non', 'never', 'none', 'nobody', 'nothing', 'neither'}
stop_words = set(stopwords.words('english')) - negations

# Remove non alphabetic lemmas
tips_cleaned_df['lemmas_no_stop'] = tips_cleaned_df.lemmas.apply(
    lambda lemmas_sents: [[lemma for lemma in lemmas_sent if lemma not in stop_words and lemma.isalpha()] for lemmas_sent in lemmas_sents])

In [24]:
tips_cleaned_df.head(30)

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_cleaned,sent_tokens,sent_word_tokens,lemmas,lemmas_no_stop
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0,avengers time with the ladies.,[avengers time with the ladies.],"[[avengers, time, with, the, ladies, .]]","[[avenger, time, with, the, lady, .]]","[[avenger, time, lady]]"
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban sandwiches,2013-02-05 18:35:10,0,they have lots of good deserts and tasty cuban sandwiches,[they have lots of good deserts and tasty cuban sandwiches],"[[they, have, lots, of, good, deserts, and, tasty, cuban, sandwiches]]","[[they, have, lot, of, good, desert, and, tasty, cuban, sandwich]]","[[lot, good, desert, tasty, cuban, sandwich]]"
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0,it is open even when you think it is not,[it is open even when you think it is not],"[[it, is, open, even, when, you, think, it, is, not]]","[[it, is, open, even, when, you, think, it, is, not]]","[[open, even, think, not]]"
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0,very decent fried chicken,[very decent fried chicken],"[[very, decent, fried, chicken]]","[[very, decent, fried, chicken]]","[[decent, fried, chicken]]"
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0,appetizers.. platter special for lunch,[appetizers.. platter special for lunch],"[[appetizers, .., platter, special, for, lunch]]","[[appetizer, .., platter, special, for, lunch]]","[[appetizer, platter, special, lunch]]"
5,trf3Qcz8qvCDKXiTgjUcEg,7Rm9Ba50bw23KTA8RedZYg,"Chili Cup + Single Cheeseburger with onion, pickle, and relish + Vanilla Coca-Cola...so far.",2012-03-13 04:00:52,0,"chili cup + single cheeseburger with onion, pickle, and relish + vanilla coca-cola...so far.","[chili cup + single cheeseburger with onion, pickle, and relish + vanilla coca-cola...so far.]","[[chili, cup, +, single, cheeseburger, with, onion, ,, pickle, ,, and, relish, +, vanilla, coca-cola, ..., so, far, .]]","[[chili, cup, +, single, cheeseburger, with, onion, ,, pickle, ,, and, relish, +, vanilla, coca-cola, ..., so, far, .]]","[[chili, cup, single, cheeseburger, onion, pickle, relish, vanilla, far]]"
6,SMGAlRjyfuYu-c-22zIyOg,kH-0iXqkL7b8UXNpguBMKg,"Saturday, Dec 7th 2013, ride Patco's Silver Sleigh w/ Santa & his elves on a decorated train into Center City. Trains leave from Lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. Great for kids!",2013-12-03 23:42:15,0,"saturday, dec 7th 2013, ride patco's silver sleigh w/ santa & his elves on a decorated train into center city. trains leave from lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. great for kids!","[saturday, dec 7th 2013, ride patco's silver sleigh w/ santa & his elves on a decorated train into center city., trains leave from lindenwold at 10am, 11:15am, & 12:30pm, and make all stops., great for kids!]","[[saturday, ,, dec, 7th, 2013, ,, ride, patco, 's, silver, sleigh, w/, santa, &, his, elves, on, a, decorated, train, into, center, city, .], [trains, leave, from, lindenwold, at, 10am, ,, 11:15am, ,, &, 12:30pm, ,, and, make, all, stops, .], [great, for, kids, !]]","[[saturday, ,, dec, 7th, 2013, ,, ride, patco, 's, silver, sleigh, w/, santa, &, his, elf, on, a, decorated, train, into, center, city, .], [train, leave, from, lindenwold, at, 10am, ,, 11:15am, ,, &, 12:30pm, ,, and, make, all, stop, .], [great, for, kid, !]]","[[saturday, dec, ride, patco, silver, sleigh, santa, elf, decorated, train, center, city], [train, leave, lindenwold, make, stop], [great, kid]]"
7,YVBB9g23nuVJ0u44zK0pSA,jtri188kuhe_AuEOJ51U_A,This is probably the best place in the cool Springs area to watch a game and eat,2016-11-22 22:14:58,0,this is probably the best place in the cool springs area to watch a game and eat,[this is probably the best place in the cool springs area to watch a game and eat],"[[this, is, probably, the, best, place, in, the, cool, springs, area, to, watch, a, game, and, eat]]","[[this, is, probably, the, best, place, in, the, cool, spring, area, to, watch, a, game, and, eat]]","[[probably, best, place, cool, spring, area, watch, game, eat]]"
8,VL12EhEdT4OWqGq0nIqkzw,xODBZmX4EmlVvbqtKN7YKg,Tacos,2012-07-27 01:48:24,0,tacos,[tacos],[[tacos]],[[taco]],[[taco]]
9,4ay-fdVks5WMerYL_htkGQ,pICJRcyqW1cF96Q3XhLSbw,Starbucks substitute in boring downtown Tampa. Ugh. Never again!,2012-06-09 22:57:04,0,starbucks substitute in boring downtown tampa. ugh. never again!,"[starbucks substitute in boring downtown tampa., ugh., never again!]","[[starbucks, substitute, in, boring, downtown, tampa, .], [ugh, .], [never, again, !]]","[[starbucks, substitute, in, boring, downtown, tampa, .], [ugh, .], [never, again, !]]","[[starbucks, substitute, boring, downtown, tampa], [ugh], [never]]"


#### Rename text column
Less confusing than 'text' we use in the user reviews file.

In [None]:
# Rename text column
tips_renamed_df = tips_df.rename(columns={'text':'tip'})
tips_renamed_df.shape

#### Find duplicates

In [None]:
duplicates = tips_renamed_df.duplicated()
tips_renamed_df[duplicates].shape

#### Drop duplicates and keep first occurrence

In [None]:
tips_no_dups_df = tips_renamed_df.drop_duplicates(keep='first').copy()
tips_no_dups_df.shape

In [None]:
duplicates = tips_no_dups_df.duplicated()
tips_no_dups_df[duplicates].shape

#### Calculate Tip length

In [None]:
# Tips length analysis
tips_no_dups_df["tip_length"] = tips_no_dups_df.tip.apply(lambda t: len(t))
tips_no_dups_df[['tip', 'tip_length']].sort_values('tip_length', ascending=False).head()

In [None]:
tips_lemmatized_df.columns