In [31]:
# import all libraries

import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from keras.models import Sequential
from keras.layers.core import Dense, Activation
import io
import requests
import tensorflow as tf
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

Using TensorFlow backend.


In [32]:
# functions given in the lab

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)



# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [2]:
# Create dataframes and creating merge_df file

business_df= pd.read_csv('business_postal.tsv', delimiter ="\t")
review_df= pd.read_csv('review_stars.tsv', delimiter ="\t")
review_agg_df = review_df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})
merge_df = pd.merge(business_df, df_ready_for_sklearn, on='business_id')
print(merge_df.dtypes)
print(merge_df.shape)
merge_df = merge_df[0:10000]
print(merge_df.dtypes)
print(merge_df.shape)

business_id      object
categories       object
stars           float64
review_count      int64
postal code      object
all_reviews      object
dtype: object
(188593, 6)
business_id      object
categories       object
stars           float64
review_count      int64
postal code      object
all_reviews      object
dtype: object
(10000, 6)


In [3]:
merge_df.to_csv('merge_df.tsv', sep='\t', encoding='utf-8')

In [9]:
merge_df.head()

Unnamed: 0,business_id,categories,stars,review_count,postal code,all_reviews
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,24,T2E 6L6,b'Great place. Major flaw is how early it clos...
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,3,89002,"b""CK's BBQ is off the charts best BBQ I have E..."
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,5,H2G 1K7,"b""La nourriture est excellente, le service Imp..."
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,8,85003,b'GEICO for auto is great! But they are really...
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,4,T2H 0N5,"b""This listing was originally under gardening ..."


In [10]:
#Normalization of review count field so it becomes comparable and remove bias
merge_df.insert(3,'normalized_count',((merge_df['review_count'] - merge_df['review_count'].min()) / (merge_df['review_count'].max() - merge_df['review_count'].min())).astype(float))
merge_df.drop('review_count', axis=1, inplace=True)

In [11]:
merge_df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,postal code,all_reviews
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,0.00746,T2E 6L6,b'Great place. Major flaw is how early it clos...
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,0.0,89002,"b""CK's BBQ is off the charts best BBQ I have E..."
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,0.00071,H2G 1K7,"b""La nourriture est excellente, le service Imp..."
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,0.001776,85003,b'GEICO for auto is great! But they are really...
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,0.000355,T2H 0N5,"b""This listing was originally under gardening ..."


In [12]:
#TF-IDF calculation

tfidf = sk_text.TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

# Fit the reviews column with TFIDFvectorizer
tfidf_matrix = tfidf.fit_transform(merge_df['all_reviews'])
matrix = tfidf_matrix.toarray()

# We are adding the normalized count to the original matrix with TFIDFvectorizer
x_matrix=np.column_stack((matrix, merge_df['normalized_count']))

print(x_matrix.dtype)

float64


In [29]:
x_matrix.shape

(10000, 1001)

In [14]:
y_stars = merge_df['stars'].values.astype(np.float32)

In [15]:
y_stars.shape

(10000,)

In [16]:
#stars need to be encoded for all the models we are going to train
le = preprocessing.LabelEncoder()
merge_df['encoded_stars'] = le.fit_transform(merge_df['stars'])

In [17]:
y_stars_encoded = merge_df['encoded_stars'].values.astype(np.float32)

In [19]:
y_stars_encoded.shape

(10000,)

In [20]:
merge_df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,postal code,all_reviews,encoded_stars
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,0.00746,T2E 6L6,b'Great place. Major flaw is how early it clos...,6
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,0.0,89002,"b""CK's BBQ is off the charts best BBQ I have E...",7
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,0.00071,H2G 1K7,"b""La nourriture est excellente, le service Imp...",6
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,0.001776,85003,b'GEICO for auto is great! But they are really...,1
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,0.000355,T2H 0N5,"b""This listing was originally under gardening ...",2


In [22]:
#train test data for linear regression

x_cont_train, x_cont_test, y_cont_train, y_cont_test = train_test_split(x_matrix, y_stars , test_size=0.2)

# train test dat afor other models

x_enc_train, x_enc_test, y_enc_train, y_enc_test = train_test_split(x_matrix, y_stars_encoded , test_size=0.2)

In [34]:
print("X Train size ")
print(x_cont_train.shape)
print("X Test size ")
print(x_cont_test.shape)
print("y Train size ")
print(y_cont_train.shape)
print("y Test size ")
print(y_cont_test.shape)

X Train size 
(8000, 1001)
X Test size 
(2000, 1001)
y Train size 
(8000,)
y Test size 
(2000,)


In [48]:
# Tensorflow model for Regression

model = Sequential()

model.add(Dense(25, input_dim=x_cont_train.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output

model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(x_cont_train,y_cont_train,verbose=2,epochs=100)    # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.

Epoch 1/100
 - 0s - loss: 3.3680
Epoch 2/100
 - 0s - loss: 0.4735
Epoch 3/100
 - 0s - loss: 0.3190
Epoch 4/100
 - 0s - loss: 0.2757
Epoch 5/100
 - 0s - loss: 0.2574
Epoch 6/100
 - 0s - loss: 0.2439
Epoch 7/100
 - 0s - loss: 0.2345
Epoch 8/100
 - 0s - loss: 0.2238
Epoch 9/100
 - 0s - loss: 0.2021
Epoch 10/100
 - 0s - loss: 0.1828
Epoch 11/100
 - 0s - loss: 0.1642
Epoch 12/100
 - 0s - loss: 0.1468
Epoch 13/100
 - 0s - loss: 0.1309
Epoch 14/100
 - 0s - loss: 0.1161
Epoch 15/100
 - 0s - loss: 0.1023
Epoch 16/100
 - 0s - loss: 0.0906
Epoch 17/100
 - 0s - loss: 0.0783
Epoch 18/100
 - 0s - loss: 0.0701
Epoch 19/100
 - 0s - loss: 0.0633
Epoch 20/100
 - 0s - loss: 0.0574
Epoch 21/100
 - 0s - loss: 0.0505
Epoch 22/100
 - 0s - loss: 0.0457
Epoch 23/100
 - 0s - loss: 0.0414
Epoch 24/100
 - 0s - loss: 0.0392
Epoch 25/100
 - 0s - loss: 0.0358
Epoch 26/100
 - 0s - loss: 0.0336
Epoch 27/100
 - 0s - loss: 0.0309
Epoch 28/100
 - 0s - loss: 0.0295
Epoch 29/100
 - 0s - loss: 0.0284
Epoch 30/100
 - 0s - lo

<keras.callbacks.History at 0x1a74914390>

In [49]:
pred_reg = model.predict(x_cont_test)
print("Shape: {}".format(pred_reg.shape))
print(pred_reg)

Shape: (2000, 1)
[[4.1964087]
 [4.0715914]
 [5.230362 ]
 ...
 [4.7272015]
 [2.5711498]
 [2.6085525]]


In [50]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_stars[i],pred_reg[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [4.1964087]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 4.5, predicted Stars: [4.0715914]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.0, predicted Stars: [5.230362]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 1.5, predicted Stars: [4.37503]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 2.0, predicted Stars: [2.6806989]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 4.0, predicted Stars: [4.100512]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.0, predicted Stars: [3.7186904]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 2.0, predicted Stars: [2.2923532]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 2.5, predicted Stars: [2.2200475]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 3.5, predicted Stars: [4.04173]


In [51]:
# Measure RMSE error.  RMSE is common for regression.
score = mean_squared_error(y_cont_test,pred_reg)
print("Final score (RMSE): {}".format(score))
print('R2 score: %.2f' % r2_score(y_cont_test, pred_reg))

Final score (RMSE): 0.4247252941131592
R2 score: 0.60
