In [4]:
import graphlab as gl

In [5]:
import os

In [6]:
filename = 'yelp-data.csv'

In [7]:
if os.path.exists(filename):
    data = gl.SFrame.read_csv(filename)
else:
    data =  gl.SFrame('https://static.turi.com/datasets/regression/{}'.format(filename))
    data.save(filename, format='csv')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\User\AppData\Local\Temp\graphlab_server_1516366826.log.0


This non-commercial license of GraphLab Create for academic use is assigned to padhigayatri.official@gmail.com and will expire on January 02, 2019.


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,long,str,str,str,dict,long,long,long,list,str,str,float,float,str,long,long,float,str,str,float,str,long,str,long,long,long,dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [8]:
train_data, test_data = data.random_split(0.9)


In [9]:
numeric_features = ['user_avg_stars', 
                    'business_avg_stars', 
                    'user_review_count', 
                    'business_review_count']

In [10]:
for ftr in numeric_features:
    mean = train_data[ftr].mean()
    stdev = train_data[ftr].std()
    train_data[ftr] = (train_data[ftr] - mean) / stdev
    test_data[ftr] = (test_data[ftr] - mean) / stdev

In [11]:
m = gl.nearest_neighbor_classifier.create(train_data, target='stars',
                                          features=numeric_features)

In [12]:
predictions = m.classify(test_data, max_neighbors=20, radius=None)

In [13]:
print predictions

+-------+-------------+
| class | probability |
+-------+-------------+
|   4   |     0.45    |
|   5   |     0.7     |
|   4   |     0.35    |
|   4   |     0.5     |
|   5   |     0.45    |
|   4   |     0.45    |
|   3   |     0.35    |
|   5   |     0.55    |
|   5   |     0.8     |
|   4   |     0.65    |
+-------+-------------+
[21815 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [14]:
topk = m.predict_topk(test_data[:5], max_neighbors=20, k=3)

In [15]:
print topk

+--------+-------+-------------+
| row_id | class | probability |
+--------+-------+-------------+
|   2    |   4   |     0.35    |
|   2    |   3   |     0.25    |
|   2    |   2   |     0.2     |
|   0    |   4   |     0.45    |
|   0    |   5   |     0.35    |
|   0    |   3   |     0.15    |
|   4    |   5   |     0.45    |
|   4    |   4   |     0.4     |
|   4    |   3   |     0.15    |
|   3    |   4   |     0.5     |
+--------+-------+-------------+
[14 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [16]:
evals = m.evaluate(test_data[:3000])
print evals['accuracy']



0.441333333333


In [17]:
conf_matrix = evals['confusion_matrix']
conf_matrix['within_one'] = conf_matrix.apply(
    lambda x: abs(x['target_label'] - x['predicted_label']) <= 1)
num_within_one = conf_matrix[conf_matrix['within_one']]['count'].sum()
print float(num_within_one) / len(test_data)

0.114370845748


In [18]:
train_data['word_counts'] = gl.text_analytics.count_words(train_data['text'],
                                                          to_lower=True)

In [19]:
test_data['word_counts'] = gl.text_analytics.count_words(test_data['text'],
                                                         to_lower=True)

In [20]:
my_dist = [
    [numeric_features, 'euclidean', 1.0],
    [['word_counts'], 'weighted_jaccard', 1.0]
    ]

In [21]:
m2 = gl.nearest_neighbor_classifier.create(train_data, target='stars',
                                          distance=my_dist)

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [22]:
accuracy = m2.evaluate(test_data[:3000], metric='accuracy')



In [23]:
print accuracy

{'accuracy': 0.4656666666666667}
