# Baseline model

## Load dataset

In [1]:
import pandas as pd
import csv
import numpy as np

In [25]:
# Load data from CSV file
reviews = pd.read_csv('./data/reviews.csv', 
                      names=[
                          'business_id',
                          'review_id',
                          'user_id',
                          'latitude',
                          'longitude',
                          'region',
                          'name',
                          'postal_code',
                          'city',
                          'state',
                          'neighborhood',
                          'text',
                      ],
                      dtype={
                          'business_id': str,
                          'review_id': str,
                          'text': str,
                          'user_id': str,
                          'city': str,
                          'latitude': np.float32,
                          'longitude': np.float32,
                          'region': str,
                          'name': str,
                          'neighborhood': str,
                          'postal_code': str,
                          'state': str
                      },
                      header=None, 
                      encoding='utf-8',
                      nrows=100000,
#                       skiprows=3000000,
                      sep='|',
                      quoting=csv.QUOTE_MINIMAL,
                      error_bad_lines=False
                     )
print(reviews.shape)

(100000, 12)


In [26]:
reviews.dropna(subset=['text'], inplace=True)
print(reviews.shape)

(100000, 12)


In [213]:
import re
import string
import nltk
from nltk.corpus import stopwords

def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [214]:
## apply the above function to df['text']
reviews['clean_text'] = reviews['text'].map(lambda x: clean_text(x))

## Training a bag of words classifier, with regions as labels

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [7]:
# training on just one chunk initially
vect = CountVectorizer(stop_words='english', lowercase=True)
counts = vect.fit_transform(reviews['clean_text'])
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(counts)

# train a model predicting the state from the text
labels_train, labels_test, features_train, features_test = train_test_split(reviews['region'], tfidf, test_size=0.20, random_state=42)
trained_model = LogisticRegression().fit(features_train, labels_train)

In [9]:
predictions = trained_model.predict(features_test)
accuracy = metrics.accuracy_score(labels_test, predictions)
print(accuracy)

0.78913394567


## Exploring baseline models


#### Observation 1: 78% accuracy seems alarmingly high to me, for such a simple model.

In [19]:
print(labels_test.value_counts() / len(labels_test) * 100)

west         67.005835
canada       14.884074
northeast     9.215546
south         6.045030
midwest       2.848514
pnw           0.001000
Name: region, dtype: float64


Aha, so, maybe that explains that. The vast majority of the reviews are from the west. The majority-case classifier would perform at 67% accuracy.

In [38]:
confusion = pd.DataFrame(metrics.confusion_matrix(labels_test, predictions,
                        labels=['west', 'canada', 'northeast', 'south', 'midwest', 'pnw']), 
                        columns=['west', 'canada', 'northeast', 'south', 'midwest', 'pnw'],
                        index=['west', 'canada', 'northeast', 'south', 'midwest', 'pnw'])

def normalize(row):
    return row / row.sum()

confusion = confusion.apply(normalize, axis=0)
confusion

Unnamed: 0,west,canada,northeast,south,midwest,pnw
west,0.779625,0.100364,0.07033,0.054514,0.041237,
canada,0.07209,0.823877,0.027715,0.012266,0.009573,
northeast,0.07247,0.039843,0.846684,0.029302,0.013991,
south,0.052474,0.019448,0.033162,0.896763,0.005155,
midwest,0.023329,0.016467,0.022108,0.007155,0.930044,
pnw,1.2e-05,0.0,0.0,0.0,0.0,


But not so fast! Looking at the confusion matrix, it seems this model performs well in identifying all classes! The lowest accuracy for any one class is actually the majority class, at 77%. The model correctly classifies other classes with better than 80% accuracy. This is pretty suprising to me.

Perhaps we can take a look at the top words used by the model to predict each class, and see if those give us any indication as to why the model is performing so well:

In [93]:
trained_model.classes_

array(['canada', 'midwest', 'northeast', 'south', 'west'], dtype=object)

In [94]:
idx_to_word_map = {v: k for k, v in vect.vocabulary_.items()}

canada_coef = pd.Series(data=trained_model.coef_[0])
canada_top_words_idx = canada_coef.sort_values(ascending=False)[0:50].index.values
canada_top_words = [idx_to_word_map[idx] for idx in canada_top_words_idx]

midwest_coef = pd.Series(data=trained_model.coef_[1])
midwest_top_words_idx = midwest_coef.sort_values(ascending=False)[0:50].index.values
midwest_top_words = [idx_to_word_map[idx] for idx in midwest_top_words_idx]

northeast_coef = pd.Series(data=trained_model.coef_[2])
northeast_top_words_idx = northeast_coef.sort_values(ascending=False)[0:50].index.values
northeast_top_words = [idx_to_word_map[idx] for idx in northeast_top_words_idx]

south_coef = pd.Series(data=trained_model.coef_[3])
south_top_words_idx = south_coef.sort_values(ascending=False)[0:50].index.values
south_top_words = [idx_to_word_map[idx] for idx in south_top_words_idx]

west_coef = pd.Series(data=trained_model.coef_[4])
west_top_words_idx = west_coef.sort_values(ascending=False)[0:50].index.values
west_top_words = [idx_to_word_map[idx] for idx in west_top_words_idx]

In [95]:
def clip(string):
    return string[:10] + '...' if len(string) > 13 else string

print('Canada        | Midwest       | Northeast     | South         |  West  ')
print('-----------------------------------------------------------------------')
for i in range(len(canada_top_words)):
    print('{0} | {1} | {2} | {3} | {4}'.format(
        clip(str(canada_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(midwest_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(northeast_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(south_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(west_top_words[i].encode('utf-8'))).ljust(13)
    ))

Canada        | Midwest       | Northeast     | South         |  West  
-----------------------------------------------------------------------
b'toronto'    | b'madison'    | b'pittsburgh' | b'charlotte'  | b'vegas'     
b'montreal'   | b'champaign'  | b'cleveland'  | b'clt'        | b'phoenix'   
b'flavour'    | b'wisconsin'  | b'ohio'       | b'ballantyne' | b'scottsdale'
b'favourite'  | b'urbana'     | b'lakewood'   | b'concord'    | b'valley'    
b'neighbou... | b'middleton'  | b'cle'        | b'noda'       | b'arizona'   
b'canada'     | b'prairie'    | b'burgh'      | b'uptown'     | b'tempe'     
b'flavours'   | b'curds'      | b'shadyside'  | b'matthews'   | b'chandler'  
b'gta'        | b'capitol'    | b'pgh'        | b'huntersv... | b'casino'    
b'flavourful' | b'illinois'   | b'oakland'    | b'gastonia'   | b'mesa'      
b'yonge'      | b'campus'     | b'tremont'    | b'midwood'    | b'henderson' 
b'colour'     | b'badger'     | b'lawrence... | b'southpark'  | b'bellagio' 

#### Observation 2: This gives us a good idea of what is going on. The model does a good job of keying off of "giveaway" words that strongly indicate the location. These words include proper nouns like city names, landmarks, and companies. They also include nicknames or abbreviations for major landmarks, and even a sports team 'cavs'.  But not all is lost. Especially in the Canadian column, we can see some words that could be considered indicative of dialect, like 'flavour' and 'centre'. 

#### Observation 3: Maybe the model thinks these things are great indicators b/c they don't appear in other classes, but in reality they just don't occur that much at all. Should the model more heavily weight common words? Maybe tf-idf is a terrible call here.

#### Observation 4: This model might really be overfitting - we should try it against another data set, and/or try applying regularization

## Retrain baseline without proper nouns

In [4]:
import re
import string
import nltk
from nltk.corpus import stopwords

In [5]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=<>]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [6]:
%%time

sents = []
for i, review in enumerate(reviews['text'].values):
    sents.append(nltk.word_tokenize(review))
    if i % 10000 == 0:
        print('Completed {0} rows'.format(str(i)))

Completed 0 rows
Completed 10000 rows
Completed 20000 rows
Completed 30000 rows
Completed 40000 rows
Completed 50000 rows
Completed 60000 rows
Completed 70000 rows
Completed 80000 rows
Completed 90000 rows
CPU times: user 1min 53s, sys: 1.08 s, total: 1min 54s
Wall time: 1min 54s


In [14]:
tags = []
for i in range(0, len(sents), 10000):
    end = i+10000
    if end > len(sents):
        end = len(sents)
    batch = sents[i:end]
    tags.extend(nltk.pos_tag_sents(batch))
    print('Completed {0} rows'.format(str(i+10000)))

Completed 10000 rows
Completed 20000 rows
Completed 30000 rows
Completed 40000 rows
Completed 50000 rows
Completed 60000 rows
Completed 70000 rows
Completed 80000 rows
Completed 90000 rows
Completed 100000 rows


In [21]:
cleaned_sents = []
stops = set(stopwords.words("english"))
for i, review in enumerate(tags):
    
    ## Remove stop words
    review = [w for w in review if not w[0].lower() in stops and len(w[0]) >= 3]
    
    ## Replace proper nouns with <NNP>
    review = [w[0] if not (w[1] in ['NNP', 'NNPS']) else '<NNP>' for w in review]
    
    cleaned_sents.append(clean_text(" ".join(review)))
    
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [30]:
reviews = reviews.join(pd.DataFrame(cleaned_sents, columns=['clean_text']))
reviews.head()

Unnamed: 0,business_id,review_id,user_id,latitude,longitude,region,name,postal_code,city,state,neighborhood,text,clean_text
0,0W4lkclzZThpx3V65bVgig,v0i_UHJMo_hPBq9bxWvW4w,bv2nCi5Qv5vroFiqKGopiw,45.516373,-73.577538,canada,Schwartz's,H2W 1X9,Montréal,QC,Plateau-Mont-Royal,"Love the staff, love the meat, love the place....",Love staff love meat love place <NNP> long lin...
0,eZDXz_RylvdD0tHEA8I0NA,T2cqOo7zPjaPtxdHFeZn8w,CKRfBUqQGaVCYTKN5kDrzw,35.140625,-80.73764,south,New Zealand Cafe,28270,Charlotte,NC,,Good sushi for good prices! I tried the Dancin...,Love staff love meat love place <NNP> long lin...
1,AEx2SYEUJmTxVVB18LlCwA,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,45.523335,-73.594856,canada,Wilensky's,H2T 2M1,Montréal,QC,Plateau-Mont-Royal,Super simple place but amazing nonetheless. It...,<NNP> simple place amazing nonetheless around ...
1,Rjkda__H64ILwIvaccVyLg,Da9HY5-ZKdBbStAs-Ju4YA,CKRfBUqQGaVCYTKN5kDrzw,35.212841,-80.858803,south,Blaze Fast-Fire'd Pizza,28203,Charlotte,NC,South End,Marvelous pizza! I had a make your own pizza w...,<NNP> simple place amazing nonetheless around ...
2,VR6GpWIda3SfvPC-lg9H3w,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,45.4729,-73.588318,canada,Tuck Shop,H4C 1S7,Montréal,QC,Sud-Ouest,Small unassuming place that changes their menu...,Small unassuming place changes menu every ofte...


In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [33]:
# training on just one chunk initially
vect = CountVectorizer(stop_words='english', lowercase=True)
counts = vect.fit_transform(reviews['clean_text'])
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(counts)

# train a model predicting the state from the text
labels_train, labels_test, features_train, features_test = train_test_split(reviews['region'], tfidf, test_size=0.20, random_state=42)
trained_model = LogisticRegression().fit(features_train, labels_train)

In [34]:
predictions = trained_model.predict(features_test)
accuracy = metrics.accuracy_score(labels_test, predictions)
print(accuracy)

0.6701


In [35]:
print(labels_test.value_counts() / len(labels_test) * 100)

west         66.945
canada       14.995
northeast     8.880
south         6.215
midwest       2.965
Name: region, dtype: float64


In [37]:
idx_to_word_map = {v: k for k, v in vect.vocabulary_.items()}

canada_coef = pd.Series(data=trained_model.coef_[0])
canada_top_words_idx = canada_coef.sort_values(ascending=False)[0:50].index.values
canada_top_words = [idx_to_word_map[idx] for idx in canada_top_words_idx]

midwest_coef = pd.Series(data=trained_model.coef_[1])
midwest_top_words_idx = midwest_coef.sort_values(ascending=False)[0:50].index.values
midwest_top_words = [idx_to_word_map[idx] for idx in midwest_top_words_idx]

northeast_coef = pd.Series(data=trained_model.coef_[2])
northeast_top_words_idx = northeast_coef.sort_values(ascending=False)[0:50].index.values
northeast_top_words = [idx_to_word_map[idx] for idx in northeast_top_words_idx]

south_coef = pd.Series(data=trained_model.coef_[3])
south_top_words_idx = south_coef.sort_values(ascending=False)[0:50].index.values
south_top_words = [idx_to_word_map[idx] for idx in south_top_words_idx]

west_coef = pd.Series(data=trained_model.coef_[4])
west_top_words_idx = west_coef.sort_values(ascending=False)[0:50].index.values
west_top_words = [idx_to_word_map[idx] for idx in west_top_words_idx]

In [38]:
def clip(string):
    return string[:10] + '...' if len(string) > 13 else string

print('Canada        | Midwest       | Northeast     | South         |  West  ')
print('-----------------------------------------------------------------------')
for i in range(len(canada_top_words)):
    print('{0} | {1} | {2} | {3} | {4}'.format(
        clip(str(canada_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(midwest_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(northeast_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(south_top_words[i].encode('utf-8'))).ljust(13),
        clip(str(west_top_words[i].encode('utf-8'))).ljust(13)
    ))

Canada        | Midwest       | Northeast     | South         |  West  
-----------------------------------------------------------------------
b'repeat'     | b'stop'       | b'emails'     | b'descript... | b'pissed'    
b'describes'  | b'vegan'      | b'vermicelli' | b'gluten'     | b'hat'       
b'decide'     | b'insurance'  | b'liars'      | b'cent'       | b'companion' 
b'le'         | b'easily'     | b'easier'     | b'parking'    | b'lacked'    
b'fingers'    | b'friend'     | b'oxtail'     | b'husbands'   | b'billing'   
b'flies'      | b'delivered'  | b'correct'    | b'sold'       | b'overpowe...
b'shwarma'    | b'door'       | b'program'    | b'upgraded'   | b'entrees'   
b'photo'      | b'accident... | b'hve'        | b'sea'        | b'empanadas' 
b'ian'        | b'walking'    | b'mid'        | b'event'      | b'checks'    
b'lait'       | b'kidding'    | b'odd'        | b'greeted'    | b'scheduling'
b'wan'        | b'yummy'      | b'hell'       | b'uptown'     | b'pregnant' 