In [3]:
import json
import csv
import string
import pandas as pd
import numpy as np
from scipy.stats import zscore
import sklearn.feature_extraction.text as sk_text
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Open business.json file, create tsv file with business_id, business name, categories, and review count to be used as features 
#and stars as label

outfile = open("business.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'stars', 'review_count'])
with open('yelp_academic_dataset_business.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['categories'], row['stars'],row['review_count'] ])

outfile.close()

business_df= pd.read_csv('business.tsv', delimiter ="\t", nrows=10000)

In [3]:
#Open review.json file, create tsv file with business_id, text to be used as features 
#and stars as label

outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('yelp_academic_dataset_review.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

outfile.close()

review_df= pd.read_csv('review_stars.tsv', delimiter ="\t", nrows=10000)

In [4]:
# Group all reviews by business_id
review_agg_df = review_df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})


In [58]:
#Merge the resulting review aggregate dataframe with business dataframe
merge_df = pd.merge(business_df, df_ready_for_sklearn, on='business_id')

merge_df

Unnamed: 0,business_id,categories,stars,review_count,all_reviews
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,17,"b""If this place was on on fire i wouldn't reac..."
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,105,b'Love the jerk chicken sandwich and jerk chic...
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,24,b'My experience was excellent. They expedited ...
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,3,"b""I'm visiting Calgary from Toronto for a few ..."
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,1565,"b'This place is amazing. I mean, you really ca..."
5,OyJDaAAMr220qkZsovCARQ,"Food, Coffee & Tea",3.0,49,"b""My favorite Starbucks. Extremely friendly st..."
6,YkAIlxYZ1guSqbbowU9X4g,"Restaurants, Chinese, Dim Sum, Breakfast & Brunch",3.5,171,b'Came here for a lovely dinner with husband ...
7,ZQ-7uFQk21NHoOzJfhEjBw,"Coffee & Tea, Food",3.0,59,"b""Wish I had positive things to say. I ordered..."
8,2ktKjN5z8EcqmUv6EDiDgA,"Fashion, Department Stores, Automotive, Shoppi...",3.5,121,b'Got $1000 worth of tires today. They told me...
9,ohYgabP6PqkNsF0vnZUxeg,"Arts & Entertainment, Coffee & Tea, Bars, Food...",4.5,149,"b""Just. Yes.\n\nCoffee: 9.2/10\nBeer: 10/10 \n..."


In [59]:
#Normalization of review count field so it becomes comparable and remove bias

#z score can be used to normalize but MNB model does not take -ve numbers as input
#  merge_df['review_count'] = zscore(merge_df['review_count'])
#  merge_df

#alternative option

merge_df.insert(3,'normalized_count',((merge_df['review_count'] - merge_df['review_count'].min()) / (merge_df['review_count'].max() - merge_df['review_count'].min())).astype(float))
merge_df.drop('review_count', axis=1, inplace=True)
merge_df

Unnamed: 0,business_id,categories,stars,normalized_count,all_reviews
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,0.004973,"b""If this place was on on fire i wouldn't reac..."
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,0.036234,b'Love the jerk chicken sandwich and jerk chic...
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,0.007460,b'My experience was excellent. They expedited ...
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,0.000000,"b""I'm visiting Calgary from Toronto for a few ..."
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,0.554885,"b'This place is amazing. I mean, you really ca..."
5,OyJDaAAMr220qkZsovCARQ,"Food, Coffee & Tea",3.0,0.016341,"b""My favorite Starbucks. Extremely friendly st..."
6,YkAIlxYZ1guSqbbowU9X4g,"Restaurants, Chinese, Dim Sum, Breakfast & Brunch",3.5,0.059680,b'Came here for a lovely dinner with husband ...
7,ZQ-7uFQk21NHoOzJfhEjBw,"Coffee & Tea, Food",3.0,0.019893,"b""Wish I had positive things to say. I ordered..."
8,2ktKjN5z8EcqmUv6EDiDgA,"Fashion, Department Stores, Automotive, Shoppi...",3.5,0.041918,b'Got $1000 worth of tires today. They told me...
9,ohYgabP6PqkNsF0vnZUxeg,"Arts & Entertainment, Coffee & Tea, Bars, Food...",4.5,0.051865,"b""Just. Yes.\n\nCoffee: 9.2/10\nBeer: 10/10 \n..."


In [10]:
# Display all fields
print(list(merge_df.columns))

['business_id', 'categories', 'stars', 'normalized_count', 'all_reviews']


In [11]:
#Clean category field and perform one hot coding

#convert into lower case all words in category
merge_df['categories'] = merge_df['categories'].str.lower()

#Remove punctuations from categories
merge_df['categories'] = merge_df['categories'].str.replace('[^\w\s]','')

#Remove stop words
stop = stopwords.words('english')
merge_df['categories'] = merge_df['categories'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


In [200]:
#Clean review text 
#convert into lower case all words in category
merge_df['all_reviews'] = merge_df['all_reviews'].str.lower()

#Remove punctuations from reviews
merge_df['all_reviews'] = merge_df['all_reviews'].str.replace('[^\w\s]','')

#Remove stop words
stop = stopwords.words('english')
merge_df['all_reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))




0      bif place fire wouldnt reach phone call help a...
1      blove jerk chicken sandwich jerk chicken dinne...
2      bmy experience excellent expedited repair got ...
3      bim visiting calgary toronto days discovered c...
4      bthis place amazing mean really cant go wrong ...
5      bmy favorite starbucks extremely friendly staf...
6      bcame lovely dinner husband weeks ago great di...
7      bwish positive things say ordered mocha latxc3...
8      bgot 1000 worth tires today told would long wa...
9      bjust yesnncoffee 9210nbeer 1010 nwine 1210 te...
10     bthis great place taste authentic japanese iza...
11     bi given store many chances live close conveni...
12     bwe locals decided try jayde since love hotel ...
13     ba squirrel hill destination easy access forbe...
14     bthis place rocks quaint little diner makes pe...
15     breally good place go bunch friends sort bar g...
16     ba less pretentious place find valley good ass...
17     bi choice place truly fi

In [201]:
#Common words display to decide for removal
freq = pd.Series(' '.join(merge_df['all_reviews']).split()).value_counts()[:10]
freq

the    2460
and    1777
a      1276
to     1235
i      1203
was     863
of      677
for     612
is      602
it      597
dtype: int64

In [202]:
#remove common words
freq = list(freq.index)
merge_df['all_reviews'] = merge_df['all_reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
merge_df['all_reviews'].head()

0    bif this place on on fire wouldnt reach phone ...
1    blove jerk chicken sandwich jerk chicken dinne...
2    bmy experience excellent they expedited repair...
3    bim visiting calgary from toronto few days dis...
4    bthis place amazing mean you really cant go wr...
Name: all_reviews, dtype: object

In [203]:
#Rare words display to decide for removal
freq = pd.Series(' '.join(merge_df['all_reviews']).split()).value_counts()[-10:]
freq

frustrationnnanother                                1
mentioning                                          1
doneness                                            1
brestaurant                                         1
httpoldtownscottsdalecomjalapenoinfernoatdcranch    1
appreciated                                         1
somethingnnote                                      1
scratched                                           1
vitamins                                            1
pages                                               1
dtype: int64

In [204]:
# Rare words removal

freq = list(freq.index)
merge_df['all_reviews'] = merge_df['all_reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
merge_df['all_reviews'].head()

0    bif this place on on fire wouldnt reach phone ...
1    blove jerk chicken sandwich jerk chicken dinne...
2    bmy experience excellent they expedited repair...
3    bim visiting calgary from toronto few days dis...
4    bthis place amazing mean you really cant go wr...
Name: all_reviews, dtype: object

In [173]:
# Preparing category field for one hot coding

#merge_df['categories'].str.split(',').head()




0    [Thrift Stores,  Shopping,  Used,  Vintage & C...
1          [Caribbean,  Food,  Bakeries,  Restaurants]
2              [Body Shops,  Auto Repair,  Automotive]
3    [Restaurants,  Event Planning & Services,  Ita...
4                 [Poutineries,  Restaurants,  Diners]
Name: categories, dtype: object

In [174]:
#def get_element(my_list, position):
 #   return my_list[position]

In [177]:
#merge_df['categories'] = merge_df['categories'].str.split(',').apply(get_element, position=0)

In [178]:
# one hot coding of categories
#hotcoded_df = pd.get_dummies(merge_df['categories'])

#concat all features after one hot coding
#df = pd.concat([merge_df, hotcoded_df], axis =1)
#df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,all_reviews,encoded_stars,Acai Bowls,Active Life,Acupuncture,American (New),...,Tobacco Shops,Trailer Dealers,Translation Services,Transportation,Turkish,Venues & Event Spaces,Veterinarians,Vietnamese,Water Delivery,Wine Bars
0,DR30lzIHVTF6xhyMI-3IlQ,Thrift Stores,3.5,0.004973,"b""If this place was on on fire i wouldn't reac...",5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,YIez_A3WOt9J2SXN7OMa2Q,Caribbean,4.0,0.036234,b'Love the jerk chicken sandwich and jerk chic...,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Gc8R7b3I3CTwAiWv7MjtSg,Body Shops,4.5,0.00746,b'My experience was excellent. They expedited ...,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,pIzuXtFdkj8fHuzJfYiwqw,Restaurants,4.5,0.0,"b""I'm visiting Calgary from Toronto for a few ...",7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5T6kFKFycym_GkhgOiysIw,Poutineries,4.0,0.554885,"b'This place is amazing. I mean, you really ca...",6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
#print(list(df.columns))

['business_id', 'categories', 'stars', 'normalized_count', 'all_reviews', 'encoded_stars', 'Acai Bowls', 'Active Life', 'Acupuncture', 'American (New)', 'American (Traditional)', 'Arts & Crafts', 'Arts & Entertainment', 'Asian Fusion', 'Auto Repair', 'Automotive', 'Bagels', 'Bakeries', 'Banks & Credit Unions', 'Barbeque', 'Bars', 'Beauty & Spas', 'Beer', 'Beer Bar', 'Beer Gardens', 'Body Shops', 'Boot Camps', 'Breakfast & Brunch', 'Breweries', 'Brewing Supplies', 'Bubble Tea', 'Buffets', 'Building Supplies', 'Burgers', 'Cafes', 'Cajun/Creole', 'Canadian (New)', 'Car Dealers', 'Car Rental', 'Caribbean', 'Carpeting', 'Chicken Shop', 'Chicken Wings', 'Chinese', 'Chiropractors', 'Cocktail Bars', 'Coffee & Tea', 'Comfort Food', 'Cosmetic Dentists', 'Creperies', 'Dentists', 'Desserts', 'Diners', 'Doctors', 'Donuts', 'Education', 'Electronics', 'Elementary Schools', 'Emergency Pet Hospital', 'Estate Planning Law', 'Event Planning & Services', 'Eyelash Service', 'Farmers Market', 'Fashion', 'F

In [205]:
#TF-IDF calculation
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

#Learn vocabulary and idf, return term-document matrix.
train_vect = tfidf.fit_transform(merge_df['all_reviews'])
train_vect = train_vect.toarray()
train_vect.shape

(424, 1000)

In [2]:
from sklearn import preprocessing


#Stars need to be encoded to be a label
le = preprocessing.LabelEncoder()
merge_df['encoded_stars'] = le.fit_transform(merge_df['stars'])


NameError: name 'merge_df' is not defined

In [None]:
#np.concatenate((train_vect, merge_df['normalized_count']), axis=0)
processed_df = np.column_stack((train_vect, merge_df['normalized_count']))
#processed_df = np.column_stack((train_vect, df['normalized_count']))
#new = np.column_stack((processed_df, df['Restaurants']))
#new

In [1]:
#Split into train and test data
#x_train, x_test, y_train, y_test = train_test_split(processed_df, merge_df['encoded_stars'], test_size=0.25, random_state=42)

#Split into train and test data
x_train, x_test, y_train, y_test = train_test_split(processed_df,merge_df['stars'], test_size=0.25, random_state=42)
#x_train, x_test, y_train, y_test = train_test_split(new, merge_df['encoded_stars'], test_size=0.25, random_state=42)



NameError: name 'train_test_split' is not defined

In [184]:
x_train.shape

(318, 1002)

In [185]:
x_test.shape

(106, 1002)

In [186]:
y_train.shape

(318,)

In [187]:
y_test.shape

(106,)

In [189]:
train_vect.shape

(424, 1000)

In [208]:
from sklearn.svm import SVC

clf = SVC()
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [209]:
#Predicted values
y_pred = clf.predict(x_test)
y_pred

array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int64)

In [210]:
# original test values
y_test.values


array([8, 5, 5, 5, 4, 3, 5, 7, 5, 6, 5, 8, 6, 7, 3, 6, 7, 6, 6, 5, 8, 7,
       4, 5, 5, 6, 5, 5, 3, 6, 5, 4, 7, 7, 6, 7, 5, 5, 2, 2, 6, 6, 4, 6,
       6, 3, 1, 6, 6, 6, 7, 4, 5, 5, 5, 8, 0, 6, 5, 5, 2, 6, 7, 6, 4, 5,
       6, 6, 3, 4, 8, 5, 2, 3, 6, 2, 6, 3, 1, 7, 5, 4, 6, 4, 3, 6, 5, 8,
       6, 8, 5, 5, 5, 4, 7, 4, 5, 5, 6, 2, 4, 4, 6, 6, 6, 7], dtype=int64)

In [211]:
#implementing Nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


In [212]:
knn.fit(x_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [213]:
#Predicted values
y_pred = knn.predict(x_test)
y_pred

array([6, 6, 3, 6, 5, 5, 6, 6, 5, 6, 8, 6, 6, 5, 8, 2, 6, 6, 7, 6, 5, 6,
       6, 6, 7, 5, 5, 6, 6, 4, 5, 7, 5, 4, 5, 7, 5, 6, 5, 8, 5, 3, 7, 6,
       5, 3, 2, 5, 7, 6, 5, 3, 6, 6, 5, 8, 5, 6, 6, 7, 4, 5, 7, 5, 6, 5,
       7, 6, 0, 6, 5, 7, 5, 7, 3, 3, 4, 7, 6, 6, 6, 3, 4, 6, 5, 1, 4, 5,
       6, 4, 5, 6, 4, 4, 5, 8, 6, 6, 2, 6, 3, 4, 5, 6, 7, 6], dtype=int64)

In [196]:
# original test values
y_test.values

array([8, 5, 5, 5, 4, 3, 5, 7, 5, 6, 5, 8, 6, 7, 3, 6, 7, 6, 6, 5, 8, 7,
       4, 5, 5, 6, 5, 5, 3, 6, 5, 4, 7, 7, 6, 7, 5, 5, 2, 2, 6, 6, 4, 6,
       6, 3, 1, 6, 6, 6, 7, 4, 5, 5, 5, 8, 0, 6, 5, 5, 2, 6, 7, 6, 4, 5,
       6, 6, 3, 4, 8, 5, 2, 3, 6, 2, 6, 3, 1, 7, 5, 4, 6, 4, 3, 6, 5, 8,
       6, 8, 5, 5, 5, 4, 7, 4, 5, 5, 6, 2, 4, 4, 6, 6, 6, 7], dtype=int64)

In [214]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(x_train, y_train)

# predict the response for new observations
logreg.predict(x_test)

array([6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6,
       5, 5, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6,
       6, 6, 7, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6,
       6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6], dtype=int64)

In [215]:
# original test values
y_test.values

array([8, 5, 5, 5, 4, 3, 5, 7, 5, 6, 5, 8, 6, 7, 3, 6, 7, 6, 6, 5, 8, 7,
       4, 5, 5, 6, 5, 5, 3, 6, 5, 4, 7, 7, 6, 7, 5, 5, 2, 2, 6, 6, 4, 6,
       6, 3, 1, 6, 6, 6, 7, 4, 5, 5, 5, 8, 0, 6, 5, 5, 2, 6, 7, 6, 4, 5,
       6, 6, 3, 4, 8, 5, 2, 3, 6, 2, 6, 3, 1, 7, 5, 4, 6, 4, 3, 6, 5, 8,
       6, 8, 5, 5, 5, 4, 7, 4, 5, 5, 6, 2, 4, 4, 6, 6, 6, 7], dtype=int64)

In [216]:
#Linear Regrassion
from sklearn.linear_model import LinearRegression

linreg = LinearRegression().fit(x_train, y_train)

linreg.predict(x_test)

array([5.85317322, 4.44038849, 5.49597281, 5.32321953, 5.20572755,
       4.47842173, 3.08221573, 5.33454115, 5.54704761, 5.6612044 ,
       5.53692128, 7.16305061, 5.57673839, 4.42047011, 6.0619926 ,
       6.38416528, 4.51892129, 5.38876723, 6.45357047, 4.51251105,
       5.50411771, 5.19982534, 5.43972193, 6.83187122, 5.14011393,
       4.32043994, 3.38170834, 6.36841474, 6.00882772, 5.0647528 ,
       5.42437372, 6.64660671, 4.62821949, 5.36366105, 6.59528512,
       6.70849774, 4.95204633, 5.35968506, 4.21093556, 5.24393528,
       4.80035905, 6.5588849 , 5.77187075, 5.21652869, 4.84822462,
       5.05425861, 4.11968985, 4.10290633, 6.73697955, 5.44147059,
       5.24261343, 4.76095418, 6.22885589, 3.10579503, 4.59605425,
       7.24173539, 5.75807006, 5.34423195, 5.9221598 , 6.10861748,
       4.52798442, 5.98819169, 5.94411531, 4.09092718, 3.52929908,
       6.91801173, 6.9966762 , 5.33805864, 5.99490425, 4.9611633 ,
       4.07234547, 6.39490994, 4.80024896, 5.01936377, 6.38420

In [217]:
# original test values
y_test.values

array([8, 5, 5, 5, 4, 3, 5, 7, 5, 6, 5, 8, 6, 7, 3, 6, 7, 6, 6, 5, 8, 7,
       4, 5, 5, 6, 5, 5, 3, 6, 5, 4, 7, 7, 6, 7, 5, 5, 2, 2, 6, 6, 4, 6,
       6, 3, 1, 6, 6, 6, 7, 4, 5, 5, 5, 8, 0, 6, 5, 5, 2, 6, 7, 6, 4, 5,
       6, 6, 3, 4, 8, 5, 2, 3, 6, 2, 6, 3, 1, 7, 5, 4, 6, 4, 3, 6, 5, 8,
       6, 8, 5, 5, 5, 4, 7, 4, 5, 5, 6, 2, 4, 4, 6, 6, 6, 7], dtype=int64)

In [218]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(x_train, y_train)

clf.predict(x_test)

array([6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6,
       5, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6,
       6, 6, 7, 6, 6, 6, 5, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int64)

In [219]:
# original test values
y_test.values

array([8, 5, 5, 5, 4, 3, 5, 7, 5, 6, 5, 8, 6, 7, 3, 6, 7, 6, 6, 5, 8, 7,
       4, 5, 5, 6, 5, 5, 3, 6, 5, 4, 7, 7, 6, 7, 5, 5, 2, 2, 6, 6, 4, 6,
       6, 3, 1, 6, 6, 6, 7, 4, 5, 5, 5, 8, 0, 6, 5, 5, 2, 6, 7, 6, 4, 5,
       6, 6, 3, 4, 8, 5, 2, 3, 6, 2, 6, 3, 1, 7, 5, 4, 6, 4, 3, 6, 5, 8,
       6, 8, 5, 5, 5, 4, 7, 4, 5, 5, 6, 2, 4, 4, 6, 6, 6, 7], dtype=int64)