## Restaurant Reviews Project

Guiding questions: who writes reviews, in what context, and what would a business want to learn from them?

In [103]:
import pandas as pd
from pandas.io.json import json_normalize

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize

%matplotlib inline

pd.options.display.max_colwidth = 10000

In [104]:
np.random.seed(3)

In [7]:
import requests
import json
import time
import os

class GooglePlaces(object):
    """Small client for the Google Places web service.

    Wraps two endpoints: Nearby Search (with paging) and Place Details.
    Every request is authenticated with the API key given at construction.
    """

    # Seconds to wait for any single HTTP response; without a timeout a
    # stalled connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 10

    # Seconds to pause after each Nearby Search request, before a returned
    # next_page_token is used for the following page.
    PAGE_DELAY = 2

    def __init__(self, apiKey):
        """Store the API key that is sent with every request."""
        super(GooglePlaces, self).__init__()
        self.apiKey = apiKey

    def search_places_by_coordinate(self, location, radius, types):
        """Return all Nearby Search results around a coordinate.

        # Arguments
            location: str, "lat,lng" of the search centre.
            radius: search radius, passed through to the API unchanged.
            types: place type to restrict the search to; sent as the
                singular `type` query parameter.

        # Returns
            list of the place dicts collected from the 'results' entry of
            every page, in the order the pages were returned.
        """
        endpoint_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
        params = {
            'location': location,
            'radius': radius,
            # Nearby Search documents this filter as `type`; the plural
            # `types` spelling is deprecated in Google's documentation.
            'type': types,
            'key': self.apiKey
        }
        places = []
        while True:
            res = requests.get(endpoint_url, params=params, timeout=self.REQUEST_TIMEOUT)
            results = json.loads(res.content)
            places.extend(results['results'])
            time.sleep(self.PAGE_DELAY)
            if "next_page_token" not in results:
                return places
            # Plain string token (the original assigned a 1-tuple because of
            # a stray trailing comma).
            params['pagetoken'] = results['next_page_token']

    def get_place_details(self, place_id, fields):
        """Fetch Place Details for one place.

        # Arguments
            place_id: str, identifier taken from a search result.
            fields: iterable of str, field names to request; joined with
                commas into a single `fields` parameter.

        # Returns
            dict, the decoded JSON response body.
        """
        endpoint_url = "https://maps.googleapis.com/maps/api/place/details/json"
        params = {
            'placeid': place_id,
            'fields': ",".join(fields),
            'key': self.apiKey
        }
        res = requests.get(endpoint_url, params=params, timeout=self.REQUEST_TIMEOUT)
        return json.loads(res.content)

# Collects the raw review data: one Nearby Search around a fixed coordinate,
# then one Place Details lookup per place found.
if __name__ == '__main__':
    # The key is read from the environment so it never appears in the notebook.
    api = GooglePlaces(os.environ.get("API_KEY"))
    places = api.search_places_by_coordinate("47.603230,-122.330280", "8000", "restaurant")
    # Place Details fields requested for every place.
    fields = ['name', 'formatted_address', 'geometry', 'price_level', 'user_ratings_total', 'website', 'rating', 'review']
    data = []
    for place in places:
        details = api.get_place_details(place['place_id'], fields)
        # Keep only the 'result' payload of each response. (The original line
        # carried a stray backtick and broken indentation, a syntax error.)
        data.append(details['result'])

In [8]:
data[0].keys()

dict_keys(['formatted_address', 'geometry', 'name', 'price_level', 'rating', 'reviews', 'user_ratings_total', 'website'])

In [None]:
data[0]['reviews'][0]['text']

In [9]:
# tries approach with json_normalize
df_raw = json_normalize(data, 'reviews', ['formatted_address',
                                          'name', 'geometry', 'price_level', 'rating', 'user_ratings_total', 'website'], 
                        errors='ignore', record_prefix='_')

In [10]:
df_raw

Unnamed: 0,_author_name,_author_url,_language,_profile_photo_url,_rating,_relative_time_description,_text,_time,formatted_address,name,geometry,price_level,rating,user_ratings_total,website
0,Julian Rich,https://www.google.com/maps/contrib/104271241910234326973/reviews,en,https://lh4.googleusercontent.com/-S47rKBYBBiM/AAAAAAAAAAI/AAAAAAAAAEg/BhSr5GRmLQY/s128-c0x00000000-cc-rp-mo/photo.jpg,4,2 weeks ago,"A very good hotel in the perfect location. I had a nice view and everything I wanted was easily accessible. The room was spacious and properly furnished. I wish it was a little bit cheaper though, specially the in room dining which is super expensive. I also especially loved the lobby. It is simply gorgeous and takes my breath away.",1557514052,"411 University St, Seattle, WA 98101, USA",Fairmont Olympic Hotel - Seattle,"{'location': {'lat': 47.60808309999999, 'lng': -122.3340015}, 'viewport': {'northeast': {'lat': 47.6094411802915, 'lng': -122.3326718197085}, 'southwest': {'lat': 47.6067432197085, 'lng': -122.3353697802915}}}",4.0,4.6,2317,https://www.fairmont.com/seattle/
1,Randy Maxwell,https://www.google.com/maps/contrib/114988574785000074437/reviews,en,https://lh3.googleusercontent.com/-VfUwinLHk6A/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rdAJu0zepLgN7aZfN93ayNizUwl_Q/s128-c0x00000000-cc-rp-mo-ba2/photo.jpg,5,a month ago,"We visited this Fairmont for two nights while in Seattle. We always try to stay at Fairmont properties while traveling. The Fairmont Olympic is a beautiful Hotel with Rich history. There is always one person at every Fairmont that makes you feel Special and right at home, for us it was Vincent. From the moment I we met Vincent he made us feel like family. Vincent's customer service is perfect in every way, and simply put he is the.reason we will return. I would highly recommend this hotel when in Seattle, and if a big fellow with a warm smile asks you if there is anything you need, you have found Vincent and you will be pleased you did.",1554082831,"411 University St, Seattle, WA 98101, USA",Fairmont Olympic Hotel - Seattle,"{'location': {'lat': 47.60808309999999, 'lng': -122.3340015}, 'viewport': {'northeast': {'lat': 47.6094411802915, 'lng': -122.3326718197085}, 'southwest': {'lat': 47.6067432197085, 'lng': -122.3353697802915}}}",4.0,4.6,2317,https://www.fairmont.com/seattle/
2,KKD City,https://www.google.com/maps/contrib/117174992497074442220/reviews,en,https://lh6.googleusercontent.com/-IXdE2U_LVFY/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rcF6atUHEuy39k8spD9hf0yCRNNGw/s128-c0x00000000-cc-rp-mo/photo.jpg,5,a week ago,"We were in town for an Awards Ceremony and were lucky to stay at the most wonderful Fairmont. Same consistent service, amenities and service as the SF Fairmont. Elegant and fabulous location. Not a single complaint - all compliments.",1558404135,"411 University St, Seattle, WA 98101, USA",Fairmont Olympic Hotel - Seattle,"{'location': {'lat': 47.60808309999999, 'lng': -122.3340015}, 'viewport': {'northeast': {'lat': 47.6094411802915, 'lng': -122.3326718197085}, 'southwest': {'lat': 47.6067432197085, 'lng': -122.3353697802915}}}",4.0,4.6,2317,https://www.fairmont.com/seattle/
3,Tom Elwood,https://www.google.com/maps/contrib/113614708637666662631/reviews,en,https://lh6.googleusercontent.com/-ut2l1lYkHfs/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rd6MaSvJF6I4uiFxqWBPwPomEClpw/s128-c0x00000000-cc-rp-mo-ba4/photo.jpg,5,a month ago,"If you really want to indulge yourself...this is the place. Sumptuous comfortable bed, nicely decorated spacious room with comfortable furniture. Marble shower stall. Great aroma from the bath products. An incredibly comfortable and welcoming lobby, classic architecture, high ceilings... Staff that go above and beyond to remember your name hours after you meet them, and obviously care a lot about the guest experience.",1555017539,"411 University St, Seattle, WA 98101, USA",Fairmont Olympic Hotel - Seattle,"{'location': {'lat': 47.60808309999999, 'lng': -122.3340015}, 'viewport': {'northeast': {'lat': 47.6094411802915, 'lng': -122.3326718197085}, 'southwest': {'lat': 47.6067432197085, 'lng': -122.3353697802915}}}",4.0,4.6,2317,https://www.fairmont.com/seattle/
4,Nimil Parikh,https://www.google.com/maps/contrib/118142597414616640008/reviews,en,https://lh5.googleusercontent.com/-hcOXL_wZHK0/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rerYCorpBHl1ZGmd46md7tuAEi0yg/s128-c0x00000000-cc-rp-mo-ba3/photo.jpg,5,a month ago,"Hotel was very nice. Staff was courteous. Amenities were good. Food was not bad for in room dining.. only thing which i think can improve is the lighting inside the hotel.. Seattle isn't very bright outside, Having a dull environment inside hotel and in rooms isn't helping",1555595375,"411 University St, Seattle, WA 98101, USA",Fairmont Olympic Hotel - Seattle,"{'location': {'lat': 47.60808309999999, 'lng': -122.3340015}, 'viewport': {'northeast': {'lat': 47.6094411802915, 'lng': -122.3326718197085}, 'southwest': {'lat': 47.6067432197085, 'lng': -122.3353697802915}}}",4.0,4.6,2317,https://www.fairmont.com/seattle/
5,Shannon Wolfe,https://www.google.com/maps/contrib/106170866296048598276/reviews,en,https://lh5.googleusercontent.com/-WgKJvtl_Ehs/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rcsZtogqQ8d7otiCSIaBVp8KFEOVg/s128-c0x00000000-cc-rp-mo-ba2/photo.jpg,5,a month ago,"Absolute love!! They were the best!! Room was great. I love that they have a whole list of ""ij case you forgot"" items that you can get for free or borrow. Even though I only came and went for work and then again for evening activities they all knew my name! Wine hour is also hard to beat. And they're dog friendly?! That's amazing!! Will totally look for Kimpton again.",1553984925,"1100 5th Ave, Seattle, WA 98101, USA",Kimpton Hotel Vintage Seattle,"{'location': {'lat': 47.607657, 'lng': -122.3323451}, 'viewport': {'northeast': {'lat': 47.6089635802915, 'lng': -122.3311902197085}, 'southwest': {'lat': 47.6062656197085, 'lng': -122.3338881802915}}}",4.0,4.4,577,https://www.hotelvintage-seattle.com/?cm_mmc=GoogleMaps-_-cp-_-US-_-VPK
6,Ryan Mirl,https://www.google.com/maps/contrib/115042237181003411605/reviews,en,https://lh3.googleusercontent.com/-_fW8ardP8aU/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3rd-4xkKiP3E4yCJrDmnJAIVflmALw/s128-c0x00000000-cc-rp-mo-ba5/photo.jpg,5,2 months ago,What a wonderful place to stay in Seattle. Great location and amazing staff. Thomas was incredibly helpful and made sure we were taken care of during our stay. The room was clean with comfortable beds and fun wine themed decor. Definitely recommend for business or pleasure.,1551753734,"1100 5th Ave, Seattle, WA 98101, USA",Kimpton Hotel Vintage Seattle,"{'location': {'lat': 47.607657, 'lng': -122.3323451}, 'viewport': {'northeast': {'lat': 47.6089635802915, 'lng': -122.3311902197085}, 'southwest': {'lat': 47.6062656197085, 'lng': -122.3338881802915}}}",4.0,4.4,577,https://www.hotelvintage-seattle.com/?cm_mmc=GoogleMaps-_-cp-_-US-_-VPK
7,Eugénie Mendy,https://www.google.com/maps/contrib/105873096122710530329/reviews,en,https://lh3.googleusercontent.com/-ZiWVpZDAOlQ/AAAAAAAAAAI/AAAAAAAAAuY/IKRM3BxP1q4/s128-c0x00000000-cc-rp-mo-ba3/photo.jpg,4,2 months ago,"The hotel is well located, it's in the downtown area. Walking distance from all the main attractions. The ambiance was nice: Wine tasting in the lobby. The staff was courteous and friendly. Two things to be improved : hotel parking is a bit pricy ($42 + taxes ) and the room soundproofing. Overall it was great!",1552171823,"1100 5th Ave, Seattle, WA 98101, USA",Kimpton Hotel Vintage Seattle,"{'location': {'lat': 47.607657, 'lng': -122.3323451}, 'viewport': {'northeast': {'lat': 47.6089635802915, 'lng': -122.3311902197085}, 'southwest': {'lat': 47.6062656197085, 'lng': -122.3338881802915}}}",4.0,4.4,577,https://www.hotelvintage-seattle.com/?cm_mmc=GoogleMaps-_-cp-_-US-_-VPK
8,Carolann Ouellette,https://www.google.com/maps/contrib/113672661614341785288/reviews,en,https://lh5.googleusercontent.com/-7-hiyYNfYN0/AAAAAAAAAAI/AAAAAAAAAAA/ACHi3re_wmyUWMAbeK4yEONGNNjesPYKcg/s128-c0x00000000-cc-rp-mo/photo.jpg,5,2 months ago,"If anybody is trying to find good deals for hotels visit HotelBuIly. com booked a hotel on there and they had the best rates anywhere!\n\nGreat hotel. We expected a little bit more than we were promised on the phone, but the people working there quickly fixed our problem. The view is really nice considering you’re in the middle of downtown. The bathrooms are a little small, but the hotel rooms themselves is great.",1552819136,"1100 5th Ave, Seattle, WA 98101, USA",Kimpton Hotel Vintage Seattle,"{'location': {'lat': 47.607657, 'lng': -122.3323451}, 'viewport': {'northeast': {'lat': 47.6089635802915, 'lng': -122.3311902197085}, 'southwest': {'lat': 47.6062656197085, 'lng': -122.3338881802915}}}",4.0,4.4,577,https://www.hotelvintage-seattle.com/?cm_mmc=GoogleMaps-_-cp-_-US-_-VPK
9,ranten21,https://www.google.com/maps/contrib/104120971965045132020/reviews,en,https://lh5.googleusercontent.com/-saeuHMTG2Fk/AAAAAAAAAAI/AAAAAAAAARw/ADJiotvE6og/s128-c0x00000000-cc-rp-mo-ba3/photo.jpg,5,2 months ago,This was a fantastic place to say. I was always welcomed. Everyone was friendly. Have to kind of figure out where the entrance is because it's not too obvious. Clean hotel. Great products. Spacious rooms. My favorite touch was having an umbrella since i didnt pack mine.,1552351872,"1100 5th Ave, Seattle, WA 98101, USA",Kimpton Hotel Vintage Seattle,"{'location': {'lat': 47.607657, 'lng': -122.3323451}, 'viewport': {'northeast': {'lat': 47.6089635802915, 'lng': -122.3311902197085}, 'southwest': {'lat': 47.6062656197085, 'lng': -122.3338881802915}}}",4.0,4.4,577,https://www.hotelvintage-seattle.com/?cm_mmc=GoogleMaps-_-cp-_-US-_-VPK


In [105]:
# pickle / unpickle raw dataframe
# Reloads the raw reviews frame from 'df_raw.pkl', replacing any df_raw built
# in the cells above. The write is left commented out; un-comment it to
# refresh the file from a newly built frame.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that this notebook itself produced.
# df_raw.to_pickle('df_raw.pkl')
df_raw = pd.read_pickle('df_raw.pkl')

In [None]:
#TODO (Lee) - what to do with date-time nature of review data
#TODO (Lee) - distribution of ratings - histogram
#TODO (Lee) - some of these ratings from hotels are mixed between hotels and restaurants - multiple tags?

In [106]:
# subsets dataframe
df = df_raw[['_text', '_rating']].copy()

In [107]:
df.head(5)

Unnamed: 0,_text,_rating
0,"A very good hotel in the perfect location. I had a nice view and everything I wanted was easily accessible. The room was spacious and properly furnished. I wish it was a little bit cheaper though, specially the in room dining which is super expensive. I also especially loved the lobby. It is simply gorgeous and takes my breath away.",4
1,"We visited this Fairmont for two nights while in Seattle. We always try to stay at Fairmont properties while traveling. The Fairmont Olympic is a beautiful Hotel with Rich history. There is always one person at every Fairmont that makes you feel Special and right at home, for us it was Vincent. From the moment I we met Vincent he made us feel like family. Vincent's customer service is perfect in every way, and simply put he is the.reason we will return. I would highly recommend this hotel when in Seattle, and if a big fellow with a warm smile asks you if there is anything you need, you have found Vincent and you will be pleased you did.",5
2,"We were in town for an Awards Ceremony and were lucky to stay at the most wonderful Fairmont. Same consistent service, amenities and service as the SF Fairmont. Elegant and fabulous location. Not a single complaint - all compliments.",5
3,"If you really want to indulge yourself...this is the place. Sumptuous comfortable bed, nicely decorated spacious room with comfortable furniture. Marble shower stall. Great aroma from the bath products. An incredibly comfortable and welcoming lobby, classic architecture, high ceilings... Staff that go above and beyond to remember your name hours after you meet them, and obviously care a lot about the guest experience.",5
4,"Hotel was very nice. Staff was courteous. Amenities were good. Food was not bad for in room dining.. only thing which i think can improve is the lighting inside the hotel.. Seattle isn't very bright outside, Having a dull environment inside hotel and in rooms isn't helping",5


In [27]:
#### partition into train and test sets

In [70]:
#### shuffle data and split
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# partition the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    df['_text'],
    df['_rating'],
    test_size=0.2,
    random_state=42,
)

#### tokenization

#### N-gram vectors option
Represents review texts as n-gram vectors using a bag-of-words approach, which discards information about word order and grammar. This representation is passed to models that do not take ordering into account, such as logistic regression, multi-layer perceptrons, gradient boosting machines, and support vector machines.

In [29]:
def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    1 text = 1 tf-idf vector over the vocabulary learned from the training
    texts, reduced to the highest-scoring features.

    Reads the module-level settings NGRAM_RANGE, TOKEN_MODE,
    MIN_DOCUMENT_FREQUENCY and TOP_K, which must be defined before the call.

    # Arguments
        train_texts: iterable of str, training text strings.
        train_labels: array-like, training labels (one per training text).
        val_texts: iterable of str, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts, as float32
            matrices with identical columns.
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,
            # tf-idf weights are fractional, so the vectorizer needs a float
            # dtype. The previous 'int32' is not a valid choice; scikit-learn
            # replaces it with float64 and emits a warning.
            'dtype': np.float64,
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    # NOTE(review): unlike the standalone vectorizer cell, no 'stop_words'
    # setting is passed here - confirm whether that difference is intended.
    vectorizer = TfidfVectorizer(**kwargs)

    # learn vocabulary from training texts and vectorize training texts
    x_train = vectorizer.fit_transform(train_texts)

    # vectorize validation texts. No leakage at this step: the vocabulary and
    # idf weights were fitted on the training texts only, and the validation
    # texts are merely transformed with them.
    x_val = vectorizer.transform(val_texts)

    # select top 'k' of the vectorized features; the selector is likewise
    # fitted on the training data only.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

In [44]:
# Whole-corpus inputs taken straight from df (not the train/test partition):
# the review texts as a pandas Series and the ratings as a NumPy array.
training_texts, training_labels = df['_text'], df['_rating'].values

In [49]:
# Vectorization settings shared by the tf-idf cells of this notebook.

NGRAM_RANGE = (1, 1)        # (min_n, max_n) n-gram sizes used when tokenizing text
TOP_K = 20_000              # cap on the number of features kept (top 20K)
STOP_WORDS = 'english'      # stop-word setting
TOKEN_MODE = 'word'         # split text into word n-grams
MIN_DOCUMENT_FREQUENCY = 2  # minimum document/corpus frequency for a token to be included

In [50]:
# NOTE(review): no earlier cell defines `stop_words`, so on a fresh kernel
# (Restart & Run All) this lookup raises NameError.
stop_words

<module 'sklearn.feature_extraction.stop_words' from '/anaconda3/lib/python3.6/site-packages/sklearn/feature_extraction/stop_words.py'>

In [51]:
# Print scikit-learn's built-in English stop-word list.
# ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text, its
# public location; the sklearn.feature_extraction.stop_words module used
# before was deprecated and later removed from scikit-learn.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(ENGLISH_STOP_WORDS)

frozenset({'beforehand', 'everything', 'twelve', 'while', 'itself', 'less', 'hereupon', 'seeming', 'whom', 'done', 'another', 'somehow', 'someone', 'several', 'thereafter', 'well', 'others', 'still', 'namely', 'or', 'through', 'cry', 'every', 'thru', 'etc', 'eight', 'nowhere', 'upon', 'because', 'her', 'latterly', 'most', 'nothing', 'thick', 'do', 'whither', 'fire', 'five', 'should', 'would', 'almost', 'up', 'nobody', 'something', 'either', 'may', 'your', 'beyond', 'at', 'call', 'him', 'within', 'an', 'own', 'moreover', 'over', 'his', 'all', 'full', 'in', 'for', 'us', 'fifty', 'mine', 'am', 'even', 'hereby', 'these', 'anyhow', 'everywhere', 'had', 'latter', 'whether', 'whereby', 'back', 'de', 'least', 'therein', 'six', 'thereupon', 'from', 'herein', 'although', 'must', 'otherwise', 'sometimes', 'already', 'get', 'also', 'mostly', 'after', 'therefore', 'further', 'me', 'hers', 'when', 'might', 'put', 'without', 'about', 'next', 'give', 'each', 'anywhere', 'such', 'below', 'fill', 'hundr

In [71]:
ngram_vectorize(train_features, train_labels, test_features)

(<240x1102 sparse matrix of type '<class 'numpy.float32'>'
 	with 10240 stored elements in Compressed Sparse Row format>,
 <60x1102 sparse matrix of type '<class 'numpy.float32'>'
 	with 2081 stored elements in Compressed Sparse Row format>)

#### vectorization - separate process via sklearn

In [72]:
# Standalone tf-idf vectorizer built from the shared vectorization settings;
# this one also applies the STOP_WORDS setting.
kwargs = {
    'ngram_range': NGRAM_RANGE,
    'stop_words': STOP_WORDS,
    # tf-idf weights are fractional, so a float dtype is required. The previous
    # 'int32' is not a valid choice; scikit-learn replaces it with float64 and
    # emits a warning, so float64 keeps the resulting values unchanged.
    'dtype': np.float64,
    'strip_accents': 'unicode',
    'decode_error': 'replace',
    'analyzer': TOKEN_MODE,
    'min_df': MIN_DOCUMENT_FREQUENCY,
}
vectorizer = TfidfVectorizer(**kwargs)

In [73]:
# learn vocabulary from training texts and vectorize training texts
x_train = vectorizer.fit_transform(train_features)

In [55]:
# X = vectorizer.fit_transform(df_raw['_text'])

In [74]:
# Shows only the container type of the vectorized training matrix.
# NOTE(review): the earlier remark that "14164 elements appears to be the total
# number of duplicated words (tokens)" does not correspond to anything this
# cell displays; it presumably referred to the stored-element count in a
# sparse-matrix repr from a previous run - confirm or drop.
type(x_train)

scipy.sparse.csr.csr_matrix

In [75]:
print(vectorizer.get_feature_names())

['00', '10', '11', '12', '15', '20', '30', '50', 'able', 'absolutely', 'accommodate', 'accommodating', 'acres', 'actually', 'added', 'additional', 'admit', 'advance', 'advantages', 'advertised', 'ahead', 'alaskan', 'alcohol', 'alley', 'allow', 'amazing', 'amazingly', 'ambiance', 'ambience', 'amenities', 'angeles', 'anniversary', 'apart', 'appetizer', 'appetizers', 'approached', 'apps', 'area', 'areas', 'aren', 'arrival', 'arrive', 'arrived', 'ask', 'asked', 'asking', 'ate', 'atmosphere', 'attentive', 'attractions', 'authentic', 'available', 'average', 'avoid', 'away', 'awesome', 'background', 'bad', 'bags', 'baked', 'bar', 'bartender', 'basically', 'bathroom', 'bathrooms', 'bathtub', 'beautiful', 'bed', 'beds', 'beef', 'beer', 'beers', 'beginning', 'benedict', 'best', 'better', 'big', 'biggest', 'birthday', 'biscuit', 'biscuits', 'bit', 'bite', 'black', 'bloody', 'blue', 'boats', 'boot', 'booth', 'booths', 'bought', 'bowling', 'boy', 'boyfriend', 'bread', 'breakfast', 'bright', 'bring'

In [76]:
len(vectorizer.vocabulary_)

910

In [78]:
# 240 documents (reviews), and 910 features
print(x_train.shape)

(240, 910)


In [91]:
print(x_train[0])

  (0, 596)	0.05142042538485723
  (0, 52)	0.11428738500780977
  (0, 299)	0.10290657726610454
  (0, 105)	0.6174394635966272
  (0, 75)	0.09509699809248778
  (0, 129)	0.1356220120618337
  (0, 118)	0.12837829056455213
  (0, 128)	0.25675658112910427
  (0, 159)	0.12837829056455213
  (0, 574)	0.1356220120618337
  (0, 133)	0.1079593864947271
  (0, 437)	0.2712440241236674
  (0, 526)	0.2712440241236674
  (0, 867)	0.10071566499744555
  (0, 152)	0.1356220120618337
  (0, 528)	0.11092511703235808
  (0, 617)	0.08914482158462776
  (0, 358)	0.1533777711039062
  (0, 709)	0.12275962365959439
  (0, 575)	0.11816883852963964
  (0, 288)	0.11428738500780977
  (0, 14)	0.12837829056455213
  (0, 460)	0.13715819446699226
  (0, 552)	0.1356220120618337
  (0, 337)	0.22185023406471616
  (0, 743)	0.11816883852963964
  (0, 192)	0.11428738500780977
  (0, 554)	0.12837829056455213
  (0, 636)	0.1079593864947271
  (0, 798)	0.12837829056455213


In [90]:
print(x_train[1])

  (0, 237)	0.18066202143332546
  (0, 444)	0.21439541235984724
  (0, 473)	0.3260010025907996
  (0, 897)	0.20866326585800396
  (0, 252)	0.18716059315762656
  (0, 100)	0.24812880328636905
  (0, 131)	0.24812880328636905
  (0, 491)	0.15894256888319386
  (0, 783)	0.18066202143332546
  (0, 832)	0.16516383503782883
  (0, 53)	0.22839603457218652
  (0, 322)	0.24184818226525334
  (0, 643)	0.1373010458866267
  (0, 266)	0.19076720552733106
  (0, 262)	0.2372690770113033
  (0, 683)	0.20353568608478148
  (0, 728)	0.2621294254987083
  (0, 360)	0.18066202143332546
  (0, 365)	0.2621294254987083
  (0, 213)	0.129267110368878
  (0, 714)	0.09553371944235622
  (0, 413)	0.2621294254987083


### Tokenization

In [None]:
# Sentence-tokenization demo on a short sample: split the text into
# sentences and print the resulting list.
text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

tokenized_text = sent_tokenize(text)
print(tokenized_text)

In [None]:
# Packt option: chain count vectorization, tf-idf re-weighting and a
# classifier into one pipeline, then fit it.
# NOTE(review): Pipeline, CountVectorizer, TfidfTransformer, LR and
# twenty_train are not imported or defined in the cells shown here - confirm
# where they are meant to come from before running this cell.
text_lr_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])
text_lr_clf = text_lr_clf.fit(twenty_train.data, twenty_train.target)


In [None]:
class TextStats(BaseEstimator, TransformerMixin):
    """Turn each document into a dict of simple statistics for DictVectorizer."""

    def fit(self, x, y=None):
        # Nothing is learned from the data; fitting returns the transformer as is.
        return self

    def transform(self, posts):
        """Return one dict per text: its character length and its count of '.' characters."""
        stats = []
        for text in posts:
            stats.append({
                'length': len(text),
                'num_sentences': text.count('.'),
            })
        return stats

In [None]:
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Split each usenet post into its subject line and its body.

    Takes a sequence of strings and returns a two-column object array:
    column 0 holds the subject, column 1 holds the body.
    """
    def fit(self, x, y=None):
        # Nothing is learned from the data; fitting returns the transformer as is.
        return self

    def transform(self, posts):
        subject_prefix = 'Subject:'
        # object-dtype array, one row per post: [subject, body]
        extracted = np.empty(shape=(len(posts), 2), dtype=object)
        for row, post in enumerate(posts):
            # The first blank line separates the header block from the body.
            header_block, _, body = post.partition('\n\n')
            extracted[row, 1] = strip_newsgroup_quoting(strip_newsgroup_footer(body))

            # Subject is the remainder of the first header line that starts
            # with the prefix, or '' when no such line exists.
            extracted[row, 0] = next(
                (
                    header_line[len(subject_prefix):]
                    for header_line in header_block.split('\n')
                    if header_line.startswith(subject_prefix)
                ),
                '',
            )

        return extracted

In [None]:
# NOTE(review): train, test and several estimator classes used below are not
# defined or imported in the cells shown here - confirm before running.
pipeline = Pipeline([
    # Extract the subject & body
    ('subjectbody', SubjectBodyExtractor()),

    # Use ColumnTransformer to combine two feature sets built from the body
    ('union', ColumnTransformer(
        [

            # Pipeline for standard bag-of-words model for body (second column)
            ('review_bow', Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ]), 1),

            # Pipeline for pulling ad hoc features from post's body
            ('body_stats', Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ]), 1),
        ],

        # weight components in ColumnTransformer. Keys must equal the
        # transformer names above: the previous 'subject' and 'body_bow' keys
        # matched no transformer, so the bag-of-words weight was not applied.
        transformer_weights={
            'review_bow': 0.5,
            'body_stats': 1.0,
        }
    )),

    # Use a SVC classifier on the combined features
    ('svc', LinearSVC()),
], verbose=True)

pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
# classification_report expects (y_true, y_pred); the reversed order swaps
# precision and recall in the printed report.
print(classification_report(test.target, y))

In [None]:
### 3rd option - pandas and sklearn

In [93]:
model = LogisticRegression()

In [95]:
model.fit(x_train, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [98]:
x_test = vectorizer.transform(test_features)

In [99]:
model.predict(x_test)

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])

In [101]:
# this is the issue with the review data
# The held-out labels shown below are mostly 5-star ratings with only a few
# low ones, and the predictions in the previous cell are all 5 - presumably
# the class imbalance this remark refers to.
test_labels

203    5
266    5
152    5
9      5
233    5
226    5
196    4
109    5
5      5
175    4
237    5
57     5
218    4
45     4
182    5
221    5
289    5
211    5
148    5
165    5
78     5
113    1
249    5
250    5
104    5
42     5
281    5
295    5
157    4
238    5
17     5
164    5
33     5
24     5
215    5
119    3
7      4
90     4
46     5
73     4
93     5
76     2
286    5
60     5
77     4
63     5
234    4
229    5
111    4
231    5
180    5
144    5
239    4
75     5
297    4
278    4
97     4
92     5
192    5
25     5
Name: _rating, dtype: int64

In [None]:
### PLAYGROUND