In [1]:
import sys
import os
os.environ['SPARK_HOME'] = "spark"
sys.path.append("spark/python")
sys.path.append("spark/python/lib")

In [2]:
# get coordinates of a given city
import httplib2
import json


def cityPos(name):
    url = "https://maps.googleapis.com/maps/api/geocode/json?" + \
          "key=AIzaSyBsZErhxaT1oVgMrT-xGLcAN5nK3UHeGBU&address=" + name
    req = httplib2.Http(".cache")
    resp, content = req.request(url, "GET")
    res = json.loads(content)
    return res["results"][0]["geometry"]

In [3]:
# reform the data preparing for fitting the model
import csv
import json
import codecs
import time as t
from couch import Couch

COUCHDB_NAME = "classified2"
REFORMED_FILE = "data/output0.csv"

food_dict = {}
rev_dict = {}

def trans(path):
    con = Couch(COUCHDB_NAME)
    jsonData = con.query_all()

    csvfile = open(REFORMED_FILE, 'w', newline='')
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    keys=['id', 'time', 'timestamp', 'lat', 'lng', 'polarity', 'followers', 'following', 'food_class']
    writer.writerow(keys)
    
    i = 0
    for dic in jsonData:
        if dic['location']['coordinates'] is None:
            city = dic['location']['place_name']
            city = city.replace(" ","%20")
            coor = cityPos(city)
            lng = coor['location']['lng']
            lat = coor['location']['lat']
        else:
            lng = dic['location']['coordinates'][0]
            lat = dic['location']['coordinates'][1]
            
        time = dic['created_at']['day']+'-'+trans_month(dic['created_at']['month'])+'-'+dic['created_at']['year']+\
                ' '+dic['created_at']['time']
        timeArray = t.strptime(time, "%d-%m-%Y %H:%M:%S")
        timestamp = t.mktime(timeArray)
        
        foods = dic['food_list']
        if foods is None or len(foods) == 0:
            writer.writerow([i, time, timestamp, lat, lng, dic['polarity'], dic['user']['followers'], dic['user']\
                                 ['following'], "-1"])
        else:
            for food in foods:
                food_class = get_food_class(food)
                writer.writerow([i, time, timestamp, lat, lng, dic['polarity'], dic['user']['followers'], dic['user']\
                                 ['following'], food_class])
        i += 1
    csvfile.close()
    
def trans_month(month):
    month_dic = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', \
                 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    return month_dic[month]

def get_food_class(food):
    if not food in food_dict.keys():
        food_dict[food] = str(len(food_dict))
    return food_dict[food]

def generate_rev_dict():
    for key,value in food_dict.items():
        rev_dict[value] = key


In [4]:
APP_NAME = "random forest test"
SPARK_URL = "local[*]"
RANDOM_SEED = 12345
TRAINING_DATA_RATIO = 0.7
RF_NUM_TREES = 3
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName(APP_NAME) \
    .master(SPARK_URL) \
    .getOrCreate()

In [7]:
# read data from couchdb and reform them into a dataframe
trans(REFORMED_FILE)

df = spark.read \
    .options(header = "true", inferschema = "true") \
    .csv(REFORMED_FILE)

print("Total number of rows: %d" % df.count())



Total number of rows: 49


In [8]:

df.show()

+---+-------------------+-------------+------------+-----------+--------+---------+---------+----------+
| id|               time|    timestamp|         lat|        lng|polarity|followers|following|food_class|
+---+-------------------+-------------+------------+-----------+--------+---------+---------+----------+
|  0|26-04-2018 02:19:47|1.524673187E9| -37.7760072|144.9708071|  0.8883|      380|      564|         0|
|  1|26-04-2018 12:44:13|1.524710653E9| -33.8688197|151.2092955|  0.4019|      766|      552|         0|
|  2|26-04-2018 13:15:36|1.524712536E9| -37.8136276|144.9630576|     0.0|      853|      558|         0|
|  3|26-04-2018 13:23:29|1.524713009E9|-33.86642251|151.2012542|     0.0|       37|      169|         0|
|  4|26-04-2018 09:45:28|1.524699928E9|    -28.0183|   153.3921|   0.926|       84|      143|         0|
|  5|26-04-2018 11:10:58|1.524705058E9| -33.8688197|151.2092955| -0.2484|      154|      219|         0|
|  5|26-04-2018 11:10:58|1.524705058E9| -33.8688197|151

In [9]:
# filter dataframe based on whether the data in them have food or not
df_no_food = df.filter(df['food_class'] == -1)
df_with_food = df.filter(df['food_class'] >= 0)

print("Number of rows without food: %d" % df_no_food.count())
print("number of rows with food: %d" % df_with_food.count())

Number of rows without food: 3
number of rows with food: 46


In [10]:
# transform dataframe into RDD and split reformed data into tranning data and test data
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

transformed_df = df_with_food.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[2:-1])))

splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)

print("Number of training set rows: %d" % training_data.count())
print("Number of test set rows: %d" % test_data.count())

Number of training set rows: 35
Number of test set rows: 11


In [11]:
# train the model using training data
from pyspark.mllib.tree import RandomForest
from time import *

start_time = time()
num_classes = len(food_dict)

model = RandomForest.trainClassifier(training_data, numClasses=num_classes, categoricalFeaturesInfo={}, \
    numTrees=RF_NUM_TREES, featureSubsetStrategy="auto", impurity="gini", \
    maxDepth=RF_MAX_DEPTH, maxBins=32, seed=RANDOM_SEED)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

Time to train model: 0.811 seconds


In [12]:
# make predictions using test data and calculate the accuracy
predictions = model.predict(test_data.map(lambda x: x.features))
labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
print("Model accuracy: %.3f%%" % (acc * 100))

Model accuracy: 81.818%


In [13]:
# deal with data without food
transformed_df_no_food = df_no_food.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[2:-1])))
predict_foods = model.predict(transformed_df_no_food.map(lambda x: x.features))

In [14]:
# combine id with predicted food class
rdd_predict_foods = df_no_food.rdd.map(lambda row: row[0]).zip(predict_foods.map(int))
list_predict_food = rdd_predict_foods.collect()

In [15]:
# transform predicted rdd to dataframe and join it to original data that without food
df_predict_foods = spark.createDataFrame(list_predict_food, schema=["id","food_class"])
df_no_food = df_no_food.drop('food_class')

concat_df = df_no_food.join(df_predict_foods, on='id')
concat_df.show()

+---+-------------------+-------------+-----------+-----------+--------+---------+---------+----------+
| id|               time|    timestamp|        lat|        lng|polarity|followers|following|food_class|
+---+-------------------+-------------+-----------+-----------+--------+---------+---------+----------+
| 13|29-04-2018 03:01:56|1.524934916E9|-37.8136276|144.9630576|     0.0|     3965|     4321|         0|
| 14|29-04-2018 03:04:03|1.524935043E9|-37.8136276|144.9630576|   0.658|      849|     3987|         0|
| 30|29-04-2018 01:16:16|1.524928576E9|-37.8136276|144.9630576|  0.9366|     1444|      664|         0|
+---+-------------------+-------------+-----------+-----------+--------+---------+---------+----------+



In [16]:
# two dataframes: df_with_food and concat_df
#df_with_food.show()
#concat_df.show()

In [17]:
# reform the dataframe to prepare for tranforming to json
unioin_df = concat_df.union(df_with_food)
unioin_df = unioin_df.drop('id')
unioin_df = unioin_df.drop('timestamp')

# get food type according to food class
from pyspark.sql.functions import udf
from pyspark.sql.types import *

generate_rev_dict()
    
def get_food_type(food_class):
    the_class = str(food_class)
    if the_class in rev_dict.keys():
        return rev_dict[the_class]
    return None

get_food_type_udf = udf(get_food_type, StringType())
unioin_df = unioin_df.withColumn('food', get_food_type_udf(unioin_df['food_class']))
unioin_df = unioin_df.drop('food_class')

print(unioin_df.count())
unioin_df.show()

49
+-------------------+------------+-----------+--------+---------+---------+---------+
|               time|         lat|        lng|polarity|followers|following|     food|
+-------------------+------------+-----------+--------+---------+---------+---------+
|29-04-2018 03:01:56| -37.8136276|144.9630576|     0.0|     3965|     4321|    pizza|
|29-04-2018 03:04:03| -37.8136276|144.9630576|   0.658|      849|     3987|    pizza|
|29-04-2018 01:16:16| -37.8136276|144.9630576|  0.9366|     1444|      664|    pizza|
|26-04-2018 02:19:47| -37.7760072|144.9708071|  0.8883|      380|      564|    pizza|
|26-04-2018 12:44:13| -33.8688197|151.2092955|  0.4019|      766|      552|    pizza|
|26-04-2018 13:15:36| -37.8136276|144.9630576|     0.0|      853|      558|    pizza|
|26-04-2018 13:23:29|-33.86642251|151.2012542|     0.0|       37|      169|    pizza|
|26-04-2018 09:45:28|    -28.0183|   153.3921|   0.926|       84|      143|    pizza|
|26-04-2018 11:10:58| -33.8688197|151.2092955| -0.2

In [18]:
json_data = unioin_df.toJSON()

In [19]:
json_data.first()

'{"time":"29-04-2018 03:01:56","lat":-37.8136276,"lng":144.9630576,"polarity":0.0,"followers":3965,"following":4321,"food":"pizza"}'

In [20]:
# insert data into couchdb
my_db = Couch('test000')

final_json = {}
final_json["type"] = "FeatureCollection"
final_json["features"] = []

for row in json_data.collect():
    
    entry = {}
    entry["type"] = "Feature"
    entry["properties"] = {}
    entry["geometry"] = {}
    entry["geometry"]["type"] = "Point"
    entry["geometry"]["coordinates"] = []
    
    json_obj = json.loads(row)
    entry["properties"]["time"] = json_obj["time"]
    entry["properties"]["polarity"] = json_obj["polarity"]
    entry["properties"]["followers"] = json_obj["followers"]
    entry["properties"]["following"] = json_obj["following"]
    entry["properties"]["food"] = json_obj["food"]
    entry["geometry"]["coordinates"].append(json_obj["lat"])
    entry["geometry"]["coordinates"].append(json_obj["lng"])
    
    final_json["features"].append(entry)
    
my_db.insert(final_json)

Insert success


In [281]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

start_time = time()

metrics = BinaryClassificationMetrics(labels_and_predictions)
print("Area under Precision/Recall (PR) curve: %.f" % (metrics.areaUnderPR * 100))
print("Area under Receiver Operating Characteristic (ROC) curve: %.3f" % (metrics.areaUnderROC * 100))

end_time = time()
elapsed_time = end_time - start_time
print("Time to evaluate model: %.3f seconds" % elapsed_time)

Area under Precision/Recall (PR) curve: 5
Area under Receiver Operating Characteristic (ROC) curve: 45.000
Time to evaluate model: 0.185 seconds
