In [79]:
import sys
import os
# Point PySpark at the Spark distribution in ./spark and expose its Python bindings.
os.environ["SPARK_HOME"] = "spark"
sys.path.extend(["spark/python", "spark/python/lib"])

In [80]:
# get coordinates of a given city
import httplib2
import json


def cityPos(name):
    """Look up the geometry of a place through the Google Geocoding API.

    The API key is read from the ``GOOGLE_MAPS_API_KEY`` environment variable
    so that no credential has to live in the notebook.

    Args:
        name: Place name to geocode. It may be plain text or already
            percent-encoded (e.g. spaces given as ``%20``); it is normalised
            before being placed in the query string.

    Returns:
        The ``geometry`` object of the first geocoding result.

    Raises:
        RuntimeError: if ``GOOGLE_MAPS_API_KEY`` is not set.
        IndexError: if the service returns no result for ``name``.
    """
    from urllib.parse import quote, unquote

    api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_MAPS_API_KEY is not set; export it before geocoding")

    # unquote() first so a caller-side "%20" is not double-encoded to "%2520".
    address = quote(unquote(name), safe="")
    url = ("https://maps.googleapis.com/maps/api/geocode/json?"
           "key=" + quote(api_key, safe="") + "&address=" + address)

    req = httplib2.Http(".cache")
    resp, content = req.request(url, "GET")
    res = json.loads(content)

    results = res.get("results") or []
    if not results:
        # Same exception type the bare [0] lookup used to raise, but with context.
        raise IndexError("no geocoding result for %r (status: %s)"
                         % (name, res.get("status")))
    return results[0]["geometry"]

In [81]:
# reshape the data in preparation for fitting the model
import csv
import json
import codecs
import time as t
from couch import Couch

# Alternative source database, kept for reference:
#COUCHDB_NAME = "cl_richard"
COUCHDB_NAME = "classified1"  # CouchDB database that trans() reads its records from
REFORMED_FILE = "data/output0.csv"  # CSV file produced by trans() and later loaded into Spark

food_dict = {}  # food name -> class id (string); filled by get_food_class()
rev_dict = {}  # class id (string) -> food name; filled by generate_rev_dict()

def trans(path):
    """Export the CouchDB records to a flat CSV file ready for model fitting.

    Each record yields one CSV row per entry of its ``food_list`` (or a single
    row with food class ``"-1"`` when the list is missing or empty). Records
    with neither homeless nor food information, and records whose homeless
    entry lacks the expected fields, are skipped.

    Args:
        path: Destination of the CSV file.
    """
    con = Couch(COUCHDB_NAME)
    jsonData = con.query_all()

    keys = ['id', 'time', 'timestamp', 'lat', 'lng', 'polarity', 'followers', 'following',
            'homeless', 'homeless_trend', 'food_class']

    # `with` guarantees the file is flushed and closed even if a record raises.
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        writer.writerow(keys)

        i = 0
        for dic in jsonData:
            # Ensure at least one of homeless info and food info is present.
            # Checked first so skipped records never cost a geocoding request.
            home = dic['homeless']
            foods = dic['food_list']
            if home is None and foods is None:
                continue

            # get homeless information (-1 / 0 mark "unknown")
            if home is None:
                homeless = -1
                homeless_trend = 0
            else:
                try:
                    homeless = home['cnt16']
                    homeless_trend = home['incre/decre']
                except (KeyError, TypeError):
                    # malformed homeless entry: drop the record
                    continue

            # get coordinates, geocoding the place name when the record has none
            coordinates = dic['location']['coordinates']
            if coordinates is None:
                city = dic['location']['place_name'].replace(" ", "%20")
                coor = cityPos(city)
                lng = coor['location']['lng']
                lat = coor['location']['lat']
            else:
                lng = coordinates[0]
                lat = coordinates[1]

            # get time and timestamp
            created = dic['created_at']
            time_str = created['day'] + '-' + trans_month(created['month']) + '-' + \
                created['year'] + ' ' + created['time']
            timestamp = t.mktime(t.strptime(time_str, "%d-%m-%Y %H:%M:%S"))

            common = [time_str, timestamp, lat, lng, dic['polarity'],
                      dic['user']['followers'], dic['user']['following'],
                      homeless, homeless_trend]

            # get food: one row per food, or a single "-1" row when there is none
            food_classes = [get_food_class(food) for food in foods] if foods else ["-1"]
            for food_class in food_classes:
                writer.writerow([i] + common + [food_class])
                i += 1
    
def trans_month(month):
    """Convert an English three-letter month abbreviation to a two-digit number.

    Args:
        month: One of ``'Jan'`` .. ``'Dec'``.

    Returns:
        The month number as a zero-padded string, ``'01'`` .. ``'12'``.

    Raises:
        KeyError: if ``month`` is not a known abbreviation.
    """
    abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    numbers = {abbr: '%02d' % position
               for position, abbr in enumerate(abbreviations, start=1)}
    return numbers[month]

def get_food_class(food):
    """Return the class id of ``food``, assigning a fresh one on first sight.

    Ids are the string form of the number of foods registered so far
    (``"0"``, ``"1"``, ...) and are remembered in the module-level ``food_dict``.
    """
    # setdefault evaluates the candidate id before inserting, so a new food
    # gets the dictionary's current size as its id.
    return food_dict.setdefault(food, str(len(food_dict)))

def generate_rev_dict():
    """Fill the module-level ``rev_dict`` with the inverse of ``food_dict``.

    After the call ``rev_dict`` maps every class id back to its food name;
    entries already present under other keys are left in place.
    """
    rev_dict.update({class_id: food for food, class_id in food_dict.items()})


In [82]:
APP_NAME = "random forest model"  # application name handed to the Spark session builder
SPARK_URL = "local[*]"  # Spark master URL handed to the session builder
RANDOM_SEED = 12345  # seed passed to randomSplit and to the random forest trainers
TRAINING_DATA_RATIO = 0.7  # weight of the training split; the test split gets the remainder
RF_NUM_TREES = 10  # numTrees of every random forest
RF_MAX_DEPTH = 5  # maxDepth of every random forest
RF_NUM_BINS = 32  # presumably intended for the forests' maxBins setting -- TODO confirm

In [83]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create the Spark session used by the rest of the notebook, or reuse a running one.
_session_builder = SparkSession.builder.appName(APP_NAME).master(SPARK_URL)
spark = _session_builder.getOrCreate()

In [84]:
# Dump the CouchDB records to CSV, then load that CSV back as a Spark DataFrame
trans(REFORMED_FILE)

csv_reader = spark.read.options(header = "true", inferschema = "true")
df = csv_reader.csv(REFORMED_FILE)

row_total = df.count()
print("Total number of rows: %d" % row_total)

Total number of rows: 46




In [85]:
# Preview the DataFrame loaded from the reformed CSV.
df.show()

+---+-------------------+-------------+------------+-----------+--------+---------+---------+--------+--------------+----------+
| id|               time|    timestamp|         lat|        lng|polarity|followers|following|homeless|homeless_trend|food_class|
+---+-------------------+-------------+------------+-----------+--------+---------+---------+--------+--------------+----------+
|  0|26-04-2018 02:19:47|1.524673187E9| -37.7760072|144.9708071|  0.8883|      380|      564|       4|            -9|         0|
|  1|26-04-2018 12:44:13|1.524710653E9| -33.8688197|151.2092955|  0.4019|      766|      552|      -1|             0|         0|
|  2|26-04-2018 13:15:36|1.524712536E9| -37.8136276|144.9630576|     0.0|      853|      558|      -1|             0|         0|
|  3|26-04-2018 13:23:29|1.524713009E9|-33.86642251|151.2012542|     0.0|       37|      169|    5061|          2024|         0|
|  4|26-04-2018 09:45:28|1.524699928E9|    -28.0183|   153.3921|   0.926|       84|      143|    

In [86]:
# Split the rows by which information is present (-1 marks a missing value)
has_food = df['food_class'] >= 0
has_homeless = df['homeless'] >= 0

df_no_food = df.filter(df['food_class'] == -1)
df_no_homeless = df.filter(df['homeless'] == -1)
df_all_info = df.filter(has_food).filter(has_homeless)

for template, frame in (
    ("Number of rows having all information: %d", df_all_info),
    ("number of rows without food information: %d", df_no_food),
    ("number of rows without homeless information: %d", df_no_homeless),
):
    print(template % frame.count())

Number of rows having all information: 20
number of rows without food information: 0
number of rows without homeless information: 26


In [87]:
# Turn the complete rows into MLlib LabeledPoints and split each set into training and test data
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def _food_features(row):
    """Features for the food classifier: every column from index 2 up to (not including) the last."""
    return Vectors.dense(row[2:-1])


def _homeless_features(row):
    """Features for the homeless regressors: columns 2-7 plus column 10."""
    return Vectors.dense(row[2],row[3],row[4],row[5],row[6],row[7],row[10])


def _point_maker(label_index, feature_fn):
    """Build a row -> LabeledPoint mapper using ``row[label_index]`` as the label."""
    return lambda row: LabeledPoint(row[label_index], feature_fn(row))


complete_rows = df_all_info.rdd
transformed_df_food = complete_rows.map(_point_maker(-1, _food_features))
transformed_df_homeless = complete_rows.map(_point_maker(-3, _homeless_features))
transformed_df_homeless_trend = complete_rows.map(_point_maker(-2, _homeless_features))

splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
training_data_food, test_data_food = \
    transformed_df_food.randomSplit(splits, RANDOM_SEED)
training_data_homeless, test_data_homeless = \
    transformed_df_homeless.randomSplit(splits, RANDOM_SEED)
training_data_homeless_trend, test_data_homeless_trend = \
    transformed_df_homeless_trend.randomSplit(splits, RANDOM_SEED)

for template, dataset in (
    ("Number of training set rows: %d", training_data_food),
    ("Number of test set rows: %d", test_data_food),
):
    print(template % dataset.count())

Number of training set rows: 15
Number of test set rows: 5


In [88]:
# train the classification model using training data
from pyspark.mllib.tree import RandomForest
# Import only the name that is used; a wildcard import would flood the
# notebook namespace with every public name of the `time` module.
from time import time

start_time = time()
# One class per distinct food registered in food_dict.
num_classes = len(food_dict)

# All hyper-parameters come from the configuration cell, including maxBins.
model_food_classifier = RandomForest.trainClassifier(
    training_data_food,
    numClasses=num_classes,
    categoricalFeaturesInfo={},
    numTrees=RF_NUM_TREES,
    featureSubsetStrategy="auto",
    impurity="gini",
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_NUM_BINS,
    seed=RANDOM_SEED)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train food classifier: %.3f seconds" % elapsed_time)

Time to train food classifier: 0.268 seconds


In [89]:
# train the two regression models (homeless count and homeless trend) using training data
start_time = time()

# Hyper-parameters shared by both regressors, all taken from the configuration
# cell (maxBins now follows RF_NUM_BINS instead of a literal 32).
regressor_params = dict(
    categoricalFeaturesInfo={},
    numTrees=RF_NUM_TREES,
    featureSubsetStrategy="auto",
    impurity="variance",
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_NUM_BINS,
    seed=RANDOM_SEED)

model_homeless_regressor = RandomForest.trainRegressor(
    training_data_homeless, **regressor_params)

model_homeless_trend_regressor = RandomForest.trainRegressor(
    training_data_homeless_trend, **regressor_params)

end_time = time()
elapsed_time = end_time - start_time
# The reported time covers the training of both regressors.
print("Time to train homeless regressor: %.3f seconds" % elapsed_time)

Time to train homeless regressor: 0.524 seconds


In [90]:
# make predictions using test data and calculate the accuracy

# A regression prediction counts as a hit when it is within this distance of the label.
REGRESSION_TOLERANCE = 100


def _accuracy(labels_and_predictions, is_hit):
    """Return the fraction of (label, prediction) pairs accepted by ``is_hit``.

    Returns NaN when there are no pairs: with very little data a random split
    can leave the test set empty, which used to raise ZeroDivisionError.
    """
    total = labels_and_predictions.count()
    if total == 0:
        return float('nan')
    return labels_and_predictions.filter(is_hit).count() / float(total)


food_predictions = model_food_classifier.predict(test_data_food.map(lambda x: x.features))
homeless_predictions = model_homeless_regressor.predict(test_data_homeless.map(lambda x: x.features))
homeless_trend_predictions = model_homeless_trend_regressor.predict(test_data_homeless_trend.map(lambda x: x.features))

labels_and_predictions_food = test_data_food.map(lambda x: x.label).zip(food_predictions)
labels_and_predictions_homeless = test_data_homeless.map(lambda x: x.label).zip(homeless_predictions)
labels_and_predictions_homeless_trend = test_data_homeless_trend.map(lambda x: x.label).zip(homeless_trend_predictions)

# Classifier: exact match. Regressors: "accuracy" means within the tolerance.
food_acc = _accuracy(labels_and_predictions_food, lambda x: x[0] == x[1])
homeless_acc = _accuracy(labels_and_predictions_homeless,
                         lambda x: abs(x[0]-x[1]) < REGRESSION_TOLERANCE)
homeless_trend_acc = _accuracy(labels_and_predictions_homeless_trend,
                               lambda x: abs(x[0]-x[1]) < REGRESSION_TOLERANCE)

print("Food classifier accuracy: %.3f%%" % (food_acc * 100))
print("Homeless regressor accuracy: %.3f%%" % (homeless_acc * 100))
print("Homeless trend regressor accuracy: %.3f%%" % (homeless_trend_acc * 100))



Food classifier accuracy: 80.000%
Homeless regressor accuracy: 0.000%
Homeless trend regressor accuracy: 40.000%


In [91]:
# Is there anything to predict for each kind of missing information?
food_pre = df_no_food.count() > 0
homeless_pre = df_no_homeless.count() > 0

# make food predictions
if food_pre:
    transformed_df_no_food = df_no_food.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[2:-1])))
    no_food_features = transformed_df_no_food.map(lambda point: point.features)
    predict_foods = model_food_classifier.predict(no_food_features)

# make homeless predictions (count and trend share the same feature columns)
if homeless_pre:
    no_homeless_rows = df_no_homeless.rdd
    transformed_df_no_homeless = no_homeless_rows.map(
        lambda row: LabeledPoint(row[8], Vectors.dense(row[2],row[3],row[4],row[5],row[6],row[7],row[10])))
    transformed_df_no_homeless_trend = no_homeless_rows.map(
        lambda row: LabeledPoint(row[9], Vectors.dense(row[2],row[3],row[4],row[5],row[6],row[7],row[10])))

    no_homeless_features = transformed_df_no_homeless.map(lambda point: point.features)
    no_homeless_trend_features = transformed_df_no_homeless_trend.map(lambda point: point.features)
    predict_homeless = model_homeless_regressor.predict(no_homeless_features)
    predict_homeless_trend = model_homeless_trend_regressor.predict(no_homeless_trend_features)
    

In [92]:
# Pair every row id with its prediction (cast to int) and collect the pairs on the driver
if food_pre:
    no_food_ids = df_no_food.rdd.map(lambda row: row[0])
    rdd_predict_foods = no_food_ids.zip(predict_foods.map(int))
    list_predict_foods = rdd_predict_foods.collect()
if homeless_pre:
    no_homeless_ids = df_no_homeless.rdd.map(lambda row: row[0])
    rdd_predict_homeless = no_homeless_ids.zip(predict_homeless.map(int))
    rdd_predict_homeless_trend = no_homeless_ids.zip(predict_homeless_trend.map(int))
    list_predict_homeless = rdd_predict_homeless.collect()
    list_predict_homeless_trend = rdd_predict_homeless_trend.collect()

In [93]:
# Turn the collected predictions into DataFrames and join them, by id, onto the
# rows that lacked the corresponding information.
#
# df_no_food / df_no_homeless are deliberately NOT overwritten: earlier cells
# address their columns by position, so dropping columns in place would make
# those cells silently use the wrong columns when re-run.
if food_pre:
    df_predict_foods = spark.createDataFrame(list_predict_foods, schema=["id","food_class"])
    # replace the placeholder food_class column with the predicted one
    concat_df_food = df_no_food.drop('food_class').join(df_predict_foods, on='id')

if homeless_pre:
    df_predict_homeless = spark.createDataFrame(list_predict_homeless, schema=["id","homeless"])
    df_predict_homeless_trend = spark.createDataFrame(list_predict_homeless_trend, schema=["id","homeless_trend"])

    # replace the placeholder homeless columns with the predicted ones
    concat_df_homeless = (
        df_no_homeless.drop('homeless').drop('homeless_trend')
        .join(df_predict_homeless, on='id')
        .join(df_predict_homeless_trend, on='id')
    )

In [94]:
# concat_df_homeless is only created when some rows lacked homeless
# information; guard the preview so the cell cannot fail with a NameError.
if homeless_pre:
    concat_df_homeless.show()

+---+-------------------+-------------+-----------+-----------+--------+---------+---------+----------+--------+--------------+
| id|               time|    timestamp|        lat|        lng|polarity|followers|following|food_class|homeless|homeless_trend|
+---+-------------------+-------------+-----------+-----------+--------+---------+---------+----------+--------+--------------+
| 26|29-04-2018 06:25:24|1.524947124E9|-38.3686779|142.4982086|     0.0|      775|      745|         0|    1568|            52|
| 19|28-04-2018 10:58:05|1.524877085E9|-36.8875485|149.9058748|  0.7845|      110|      142|         0|    1215|           378|
| 43|26-04-2018 04:24:41|1.524680681E9|-37.8136276|144.9630576|     0.0|      142|       68|         0|    2468|           769|
| 39|26-04-2018 07:32:15|1.524691935E9|-27.4697707|153.0251235|     0.0|      716|      625|         0|    1717|           299|
|  6|26-04-2018 11:10:58|1.524705058E9|-33.8688197|151.2092955| -0.2484|      154|      219|         1| 

In [95]:
# Preview the rows that carry both food and homeless information.
df_all_info.show()

+---+-------------------+-------------+------------+------------+--------+---------+---------+--------+--------------+----------+
| id|               time|    timestamp|         lat|         lng|polarity|followers|following|homeless|homeless_trend|food_class|
+---+-------------------+-------------+------------+------------+--------+---------+---------+--------+--------------+----------+
|  0|26-04-2018 02:19:47|1.524673187E9| -37.7760072| 144.9708071|  0.8883|      380|      564|       4|            -9|         0|
|  3|26-04-2018 13:23:29|1.524713009E9|-33.86642251| 151.2012542|     0.0|       37|      169|    5061|          2024|         0|
|  4|26-04-2018 09:45:28|1.524699928E9|    -28.0183|    153.3921|   0.926|       84|      143|    1708|           368|         0|
|  7|26-04-2018 02:58:37|1.524675517E9|   -33.88888|   151.27759|     0.0|     4111|     4550|     165|            44|         0|
| 10|29-04-2018 05:15:22|1.524942922E9|    -37.8823|       144.7|   0.296|      363|      

In [96]:
# get food type according to food class
from pyspark.sql.functions import udf
# Only StringType is needed for the UDF return types; importing it by name
# avoids a wildcard import of the whole types module.
from pyspark.sql.types import StringType
from keywords import Keywords

# Build the class id -> food name table. food_dict is filled while trans()
# runs, so trans() must have been executed in this kernel session first.
generate_rev_dict()
    
# get food name by food class
def get_food_type(food_class):
    """Map a food class id to its food name.

    Args:
        food_class: Class id; it is converted with ``str()`` before the lookup,
            so ``0`` and ``"0"`` are equivalent.

    Returns:
        The food name stored in ``rev_dict``, or ``None`` for an unknown id.
    """
    return rev_dict.get(str(food_class))
# Wrap get_food_type as a Spark UDF with a string return type so it can be applied to a column.
get_food_type_udf = udf(get_food_type, StringType())

# get food group by food name
def get_food_group(food):
    """Return the name of the first Keywords group that contains ``food``.

    Groups are checked in a fixed order: fastfood, fruits, grains, meat,
    seafood, vegetables. ``None`` is returned when no group contains the food.
    """
    group_names = ("fastfood", "fruits", "grains", "meat", "seafood", "vegetables")
    for group_name in group_names:
        if food in getattr(Keywords, group_name):
            return group_name
    return None
# Wrap get_food_group as a Spark UDF with a string return type.
get_food_group_udf = udf(get_food_group, StringType())


def _with_food_name(frame):
    """Return a copy of ``frame`` whose ``food_class`` column is replaced by a ``food`` name column."""
    named = frame.withColumn('food', get_food_type_udf(frame['food_class']))
    return named.drop('food_class')


# Reform the dataframes to prepare for transforming to json.
# The input frames (df_all_info, concat_df_food, concat_df_homeless) are left
# untouched, so this cell gives the same result when it is run again; the
# previous version dropped 'food_class' in place and failed on a second run.
unioin_df = _with_food_name(df_all_info)

if food_pre:
    unioin_df = unioin_df.union(_with_food_name(concat_df_food))

if homeless_pre:
    unioin_df = unioin_df.union(_with_food_name(concat_df_homeless))

# id and timestamp were only needed for modelling and joining
unioin_df = unioin_df.drop('id').drop('timestamp')

unioin_df = unioin_df.withColumn('food_group', get_food_group_udf(unioin_df['food']))

print(unioin_df.count())
unioin_df.show()

45
+-------------------+------------+------------+--------+---------+---------+--------+--------------+-------+----------+
|               time|         lat|         lng|polarity|followers|following|homeless|homeless_trend|   food|food_group|
+-------------------+------------+------------+--------+---------+---------+--------+--------------+-------+----------+
|26-04-2018 02:19:47| -37.7760072| 144.9708071|  0.8883|      380|      564|       4|            -9|  pizza|  fastfood|
|26-04-2018 13:23:29|-33.86642251| 151.2012542|     0.0|       37|      169|    5061|          2024|  pizza|  fastfood|
|26-04-2018 09:45:28|    -28.0183|    153.3921|   0.926|       84|      143|    1708|           368|  pizza|  fastfood|
|26-04-2018 02:58:37|   -33.88888|   151.27759|     0.0|     4111|     4550|     165|            44|  pizza|  fastfood|
|29-04-2018 05:15:22|    -37.8823|       144.7|   0.296|      363|      472|     750|           120|  pizza|  fastfood|
|29-04-2018 05:15:22|    -37.8823|   

In [97]:
# Serialise every row of the combined DataFrame to a JSON string.
json_data = unioin_df.toJSON()

In [98]:
# Inspect the first serialised row as a sanity check.
json_data.first()

'{"time":"26-04-2018 02:19:47","lat":-37.7760072,"lng":144.9708071,"polarity":0.8883,"followers":380,"following":564,"homeless":4,"homeless_trend":-9,"food":"pizza","food_group":"fastfood"}'

In [100]:
# insert data into couchdb as a single GeoJSON FeatureCollection document
my_db = Couch('test002')

# Row fields copied into each feature's "properties", in output order.
PROPERTY_KEYS = ("time", "polarity", "followers", "following",
                 "food", "food_group", "homeless", "homeless_trend")

final_json = {}
final_json["type"] = "FeatureCollection"
final_json["features"] = []

for row in json_data.collect():
    json_obj = json.loads(row)

    entry = {
        "type": "Feature",
        # .get(): Spark's JSON output can leave out fields whose value is null
        # (e.g. a food with no known group), which made json_obj["food_group"]
        # raise KeyError; such properties are now stored as None.
        "properties": {key: json_obj.get(key) for key in PROPERTY_KEYS},
        "geometry": {
            "type": "Point",
            # GeoJSON positions are [longitude, latitude]; the previous
            # [lat, lng] order put every point in the wrong place for
            # spec-compliant consumers.
            "coordinates": [json_obj["lng"], json_obj["lat"]],
        },
    }

    final_json["features"].append(entry)

my_db.insert(final_json)

Insert success
