In [1]:
import sys
import os
os.environ['SPARK_HOME'] = "spark"
sys.path.append("spark/python")
sys.path.append("spark/python/lib")

In [2]:
import httplib2
import json


def cityPos(name):
    url = "https://maps.googleapis.com/maps/api/geocode/json?" + \
          "key=AIzaSyBsZErhxaT1oVgMrT-xGLcAN5nK3UHeGBU&address=" + name
    req = httplib2.Http(".cache")
    resp, content = req.request(url, "GET")
    res = json.loads(content)
    return res["results"][0]["geometry"]

In [21]:
import csv
import json
import codecs
import time
from couch import Couch

REFORMED_FILE = "data/output0.csv"

food_dict = {}

def trans(path):
    
    con = Couch("classified2")
    jsonData = con.query_all()

    csvfile = open(REFORMED_FILE, 'w', newline='')
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    keys=['time', 'lat', 'lng', 'polarity', 'followers', 'following', 'food']
    writer.writerow(keys)
    i = 0
    for dic in jsonData:

        if dic['location']['coordinates'] != None:
            lng = dic['location']['coordinates'][0]
            lat = dic['location']['coordinates'][1]
        else:
            city = dic['location']['place_name']
            city = city.replace(" ","%20")
            coor = cityPos(city)
            lng = coor['location']['lng']
            lat = coor['location']['lat']
        dt = dic['created_at']['year']+'-'+trans_month(dic['created_at']['month'])+'-'+dic['created_at']['day']+\
                ' '+dic['created_at']['time']
        timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
        timestamp = time.mktime(timeArray)
        foods = dic['food_list']
        try:
            for food in foods:
                food_class = get_food_class(food)
                writer.writerow([timestamp, lat, lng, dic['polarity'], dic['user']['followers'], dic['user']\
                                 ['following'], food_class])
        except:
            continue

    csvfile.close()
    
def trans_month(month):
    month_dic = {'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6', \
                 'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    return month_dic[month]

def get_food_class(food):
    if not food in food_dict.keys():
        food_dict[food] = str(len(food_dict))
    return food_dict[food]

In [22]:
APP_NAME = "random forest test"
SPARK_URL = "local[*]"
RANDOM_SEED = 12345
TRAINING_DATA_RATIO = 0.7
RF_NUM_TREES = 3
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [23]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName(APP_NAME) \
    .master(SPARK_URL) \
    .getOrCreate()

In [24]:
trans(REFORMED_FILE)

df = spark.read \
    .options(header = "true", inferschema = "true") \
    .csv(CSV_NAME+'.csv')

print("Total number of rows: %d" % df.count())

Total number of rows: 46




In [25]:
df.show()

+-------------+------------+-----------+--------+---------+---------+----+
|         time|         lat|        lng|polarity|followers|following|food|
+-------------+------------+-----------+--------+---------+---------+----+
|1.524673187E9| -37.7760072|144.9708071|  0.8883|      380|      564|   0|
|1.524710653E9| -33.8688197|151.2092955|  0.4019|      766|      552|   0|
|1.524712536E9| -37.8136276|144.9630576|     0.0|      853|      558|   0|
|1.524713009E9|-33.86642251|151.2012542|     0.0|       37|      169|   0|
|1.524699928E9|    -28.0183|   153.3921|   0.926|       84|      143|   0|
|1.524705058E9| -33.8688197|151.2092955| -0.2484|      154|      219|   0|
|1.524705058E9| -33.8688197|151.2092955| -0.2484|      154|      219|   1|
|1.524675517E9|   -33.88888|  151.27759|     0.0|     4111|     4550|   0|
| 1.52489073E9| -37.8136276|144.9630576| -0.3078|       93|      477|   0|
|1.524941428E9| -33.8688197|151.2092955|  0.7543|      168|      206|   0|
|1.524942922E9|    -37.88

In [26]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

transformed_df = df.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)

print("Number of training set rows: %d" % training_data.count())
print("Number of test set rows: %d" % test_data.count())

Number of training set rows: 35
Number of test set rows: 11


In [27]:
from pyspark.mllib.tree import RandomForest
from time import *

start_time = time()
num_classes = len(food_dict)

model = RandomForest.trainClassifier(training_data, numClasses=num_classes, categoricalFeaturesInfo={}, \
    numTrees=RF_NUM_TREES, featureSubsetStrategy="auto", impurity="gini", \
    maxDepth=RF_MAX_DEPTH, maxBins=32, seed=RANDOM_SEED)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

Time to train model: 0.319 seconds


In [28]:
predictions = model.predict(test_data.map(lambda x: x.features))
labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
print("Model accuracy: %.3f%%" % (acc * 100))

Model accuracy: 81.818%


In [29]:
predictions.collect()

[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [30]:
labels_and_predictions.collect()

[(0.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 2.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0)]

In [31]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

start_time = time()

metrics = BinaryClassificationMetrics(labels_and_predictions)
print("Area under Precision/Recall (PR) curve: %.f" % (metrics.areaUnderPR * 100))
print("Area under Receiver Operating Characteristic (ROC) curve: %.3f" % (metrics.areaUnderROC * 100))

end_time = time()
elapsed_time = end_time - start_time
print("Time to evaluate model: %.3f seconds" % elapsed_time)

Area under Precision/Recall (PR) curve: 5
Area under Receiver Operating Characteristic (ROC) curve: 45.000
Time to evaluate model: 0.118 seconds
