In [None]:
from zipfile import ZipFile

# Name of the project archive; it is opened relative to the current working directory.
file_name = "travel-time-rec-master.zip"

# Unpack every member of the archive into the working directory, then report completion.
with ZipFile(file_name, mode='r') as archive:
    archive.extractall()
    print('Done')

Done


In [None]:
# Environment bootstrap: install Java 8, download Spark 2.3.1 and start a local SparkSession.
# Refresh the apt package index before installing anything.
!apt-get update
# Install a headless Java 8 JDK; -qq plus the redirect keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download and unpack the Spark 2.3.1 build for Hadoop 2.7.
# NOTE(review): the archive is fetched over plain HTTP and is not checksum-verified.
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
# findspark is used below to make the unpacked Spark importable (version is not pinned).
!pip install -q findspark

import os
# Tell Spark where the JDK and the unpacked distribution live.
# NOTE(review): the /content prefix presumably assumes a Google Colab runtime - confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

# List the working directory so the download/extraction can be checked by eye.
!ls

import findspark
# Must run before `import pyspark`; presumably it uses SPARK_HOME to locate the package.
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Reuse an existing session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate() 
# Last expression of the cell: display the session object.
spark

Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:13 https://developer.download.nvidia.com/compute/c

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# general filtering
def filter_user(user_df, min_points=200):
    """Keep sufficiently active users that have a complete demographic profile.

    A user is kept when ``totalPoints`` is strictly greater than ``min_points``
    and none of ``ageRange``, ``gender`` and ``travelStyle`` is null.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain the columns ``username``, ``ageRange``, ``gender``,
        ``travelStyle`` and ``totalPoints``.
    min_points : int or float, default 200
        Exclusive lower bound on ``totalPoints``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept), restricted to the
        columns ``username``, ``ageRange``, ``gender`` and ``travelStyle``.
    """
    # Build a single mask instead of chaining df[m1][m2]...: chained boolean
    # indexing re-aligns every mask against an already filtered frame, which
    # makes pandas emit a "Boolean Series key will be reindexed" warning and
    # copies the frame once per condition.
    keep = (
        (user_df.totalPoints > min_points)
        & user_df.ageRange.notnull()
        & user_df.gender.notnull()
        & user_df.travelStyle.notnull()
    )

    return user_df.loc[keep, ['username', 'ageRange', 'gender', 'travelStyle']]


def filter_review(review_df):
    """Return only the reviews of type 'Attractions', trimmed to the columns used downstream.

    The result keeps the original index labels and contains, in this order,
    the columns id, username, type, title, text, rating and taObjectCity.
    """
    is_attraction = review_df.type == 'Attractions'
    wanted_columns = ['id', 'username', 'type', 'title', 'text', 'rating', 'taObjectCity']

    # Row filter and column selection in a single .loc call.
    return review_df.loc[is_attraction, wanted_columns]

def merge_review_and_user(user_df, review_df):
    """Attach user attributes to each review with a default (inner) merge on 'username'.

    The review frame is the left side of the merge, so its columns come first
    and reviews whose username is missing from ``user_df`` are dropped.
    """
    return review_df.merge(user_df, on=['username'])

def foreign_review_filter(merged_df,
                          excluded_users=('AnaS1', 'DaniLK', 'Aprile_24'),
                          excluded_locations=('California', 'Yellowstone National Park')):
    """Drop reviews written by excluded users or attached to excluded locations.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain the columns ``username`` and ``taObjectCity``.
    excluded_users : iterable of str
        Usernames whose reviews are removed. The defaults are the three
        accounts the original code filtered out (its mask names suggest they
        are Spanish-language reviewers).
    excluded_locations : iterable of str
        ``taObjectCity`` values that are removed; by default a state and a
        national park rather than cities.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    # One combined mask replaces the chained df[~m1][~m2]... indexing, which
    # re-aligns every mask against an already filtered frame (pandas warns
    # "Boolean Series key will be reindexed") and copies once per condition.
    drop_row = (
        merged_df.username.isin(list(excluded_users))
        | merged_df.taObjectCity.isin(list(excluded_locations))
    )

    return merged_df[~drop_row]



def popular_city_list(merged_df, min_reviews=12):
    """List the cities that appear in at least ``min_reviews`` rows.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain a ``taObjectCity`` column (one row per review).
    min_reviews : int, default 12
        Inclusive lower bound on the number of rows a city needs.

    Returns
    -------
    list
        City values in order of first appearance in ``merged_df``.
    """
    review_counts = Counter(merged_df.taObjectCity)
    return [city for city, count in review_counts.items() if count >= min_reviews]



def filter_final(merged_df, popular_city_list):
    """Keep only the rows whose taObjectCity is one of the given popular cities."""
    in_popular_city = merged_df.taObjectCity.isin(popular_city_list)
    return merged_df.loc[in_popular_city]



# for user feature matrix


def user_feature_filter(final_df):
    """Reduce the review-level frame to its distinct user profile rows.

    Selects username, ageRange, gender and travelStyle and removes duplicate
    rows; the index label of each first occurrence is kept.
    """
    profile_columns = ['username', 'ageRange', 'gender', 'travelStyle']
    return final_df[profile_columns].drop_duplicates()

def travel_style(feature_temp):
    """Add a 'new_travel' column holding each user's travel styles as a list.

    Each ``travelStyle`` string is split on ', '. The column is written onto
    ``feature_temp`` itself (the argument is modified in place) and that same
    object is returned.
    """
    split_styles = pd.Series([styles.split(', ') for styles in feature_temp.travelStyle])

    # .values makes the assignment positional, ignoring feature_temp's index labels.
    feature_temp['new_travel'] = split_styles.values

    return feature_temp

def travel_matrix(feature_temp):
    """Build a per-row indicator matrix of travel styles.

    Parameters
    ----------
    feature_temp : pandas.DataFrame
        Must contain a ``new_travel`` column whose cells are lists of style
        names.

    Returns
    -------
    pandas.DataFrame
        Indexed by the row labels of ``feature_temp``, with one column per
        distinct style; each cell counts how often that style occurs in the
        row's list.
    """
    # Spread every list over positional columns; shorter lists are padded with NaN.
    style_columns = feature_temp['new_travel'].apply(pd.Series)

    # Long format: one entry per (row label, list position).
    style_long = style_columns.stack()

    # Series/DataFrame.sum(level=0) was removed in pandas 2.0. Grouping on the
    # first index level is the documented replacement; sort=False keeps the
    # row order that sum(level=0) produced.
    style_df = pd.get_dummies(style_long).groupby(level=0, sort=False).sum()

    return style_df

def age_gender_dummie(feature_temp):
    """One-hot encode the 'ageRange' and 'gender' columns.

    All other columns are passed through unchanged; a new frame is returned.
    """
    encoded_columns = ['ageRange', 'gender']
    return pd.get_dummies(feature_temp, columns=encoded_columns)



def combine_all_dummies(user_df, style_df, personality_df):
    """Assemble the final user feature matrix.

    Joins the style indicators onto the user frame by index, drops the raw
    columns plus one reference level per categorical group, renumbers the rows
    from 0 and finally joins the personality flags by that new position.

    NOTE(review): the last join is purely positional, so it presumably relies
    on ``personality_df`` listing users in the same order as ``user_df`` -
    confirm against the caller.
    """
    dropped_columns = ['travelStyle',
                       'new_travel',
                       'gender_male',
                       '60+ Traveler',
                       'username']

    with_styles = user_df.join(style_df)
    features_only = with_styles.drop(columns=dropped_columns).reset_index(drop=True)

    return features_only.join(personality_df)




# user big 5 personality scores

def user_personality_score_merge(personality_df, user_temp):
    """Return the personality scores of the users present in ``user_temp``.

    The two frames are merged (default inner join) on 'username'; the
    identifying columns 'username' and 'user_id' are then removed so that only
    the score columns remain.
    """
    matched = personality_df.merge(user_temp, on='username')
    return matched.drop(columns=['username', 'user_id'])


def mapping_personality(df):
    """Append one boolean column per input column: is the value at or above the median?

    The new columns are named after the position of their source column
    ('0', '1', ...). ``df`` itself is not modified; a copy is returned.
    """
    flagged = df.copy()

    for position in range(df.shape[1]):
        scores = df.iloc[:, position]
        # The 50th percentile is the median of the column.
        median = np.percentile(scores, 50)
        flagged[str(position)] = np.array([bool(score >= median) for score in scores])

    return flagged


def cleaning_personality_df(df):
    """Replace the raw trait scores with their boolean counterparts.

    Drops the columns open, cons, extra, agree and neuro, then gives those
    five names, in that order, to the columns that are left. The frame must
    therefore have exactly five other columns.
    """
    trait_names = ['open', 'cons', 'extra', 'agree', 'neuro']

    flags_only = df.drop(columns=trait_names)
    flags_only.columns = list(trait_names)

    return flags_only


# prep for clustering


def cluster_prep_filter(final_df):
    """Return a copy of the review frame limited to title, text and taObjectCity."""
    text_columns = ['title', 'text', 'taObjectCity']
    return final_df.copy()[text_columns]


def grouping_city_title(cluster_input_df):
    """Concatenate all review titles of each city into one space-separated string.

    Returns a frame with one row per ``taObjectCity`` value; the joined titles
    sit in the second column, which is left unnamed by the group-wise apply.
    """
    def _join_titles(city_rows):
        return ' '.join(city_rows.title)

    per_city = cluster_input_df.groupby(['taObjectCity'])
    return per_city.apply(_join_titles).reset_index()



def grouping_city_text(cluster_input_df):
    """Concatenate all review bodies of each city into one space-separated string.

    Returns a frame with one row per ``taObjectCity`` value; the joined text
    sits in the second column, which is left unnamed by the group-wise apply.
    """
    def _join_text(city_rows):
        return ' '.join(city_rows.text)

    per_city = cluster_input_df.groupby(['taObjectCity'])
    return per_city.apply(_join_text).reset_index()


def merging_content(left, right):
    """Combine the per-city title and text frames into one frame indexed by city.

    ``left`` and ``right`` are merged on 'taObjectCity'; the three resulting
    columns are renamed to taObjectCity, title and text, and the city column
    becomes the index.
    """
    combined = left.merge(right, on='taObjectCity')
    combined.columns = ['taObjectCity', 'title', 'text']

    return combined.set_index(['taObjectCity'], drop=True)


# selected cities from cluster



def selected_cities_in_cluster(cluster_df, city_cluster_idx):
    """Return the rows of ``cluster_df`` that belong to the requested cluster.

    The filtered frame must have exactly two columns; they are relabelled
    'cluster_k' and 'taObjectCity'. ``cluster_df`` itself is left untouched.
    """
    members = cluster_df.loc[cluster_df['cluster_k'] == city_cluster_idx]
    members.columns = ['cluster_k', 'taObjectCity']

    return members

def selected_city_df(up_cluster_df, city_df):
    """Left-join the city lookup onto the clustered cities via 'taObjectCity'.

    Every row of ``up_cluster_df`` is kept; cities missing from ``city_df``
    get missing values in the joined columns.
    """
    return up_cluster_df.merge(city_df, how='left', on='taObjectCity')


In [None]:
import pandas as pd
import os


# traveler profile
def load_user_profile(file_path):
    """Read the traveler profile workbook at ``file_path`` into a DataFrame."""
    return pd.read_excel(file_path)

# traveler articles
def load_articles(file_path):
    """Read the traveler articles workbook at ``file_path`` into a DataFrame."""
    return pd.read_excel(file_path)

# traveler and reviews
def load_reviews(file_path):
    """Read the traveler reviews workbook at ``file_path`` into a DataFrame."""
    return pd.read_excel(file_path)

# traveler personality scores
def load_personality_scores(file_path):
    """Read the personality scores workbook at ``file_path`` into a DataFrame."""
    return pd.read_excel(file_path)


In [None]:
# prep for spark ALS model

import pandas as pd


def prep_als_df(final_df):
    """Select the three columns needed for the ratings data: username, taObjectCity, rating."""
    rating_columns = ['username', 'taObjectCity', 'rating']
    return final_df[rating_columns]


def unique_user_id(input_df):
    """Assign a consecutive integer id to every distinct username.

    Ids start at 0 and follow the order in which usernames first appear.
    Returns a frame with the columns 'user_id' and 'username'.
    """
    distinct_users = pd.DataFrame(input_df.username.unique(), columns=['username'])

    # The fresh 0..n-1 index becomes the id column.
    return distinct_users.reset_index().rename(columns={'index': 'user_id'})


def unique_city_id(input_df):
    """Assign a consecutive integer id to every distinct city.

    Ids start at 0 and follow the order in which cities first appear.
    Returns a frame with the columns 'city_id' and 'taObjectCity'.
    """
    distinct_cities = pd.DataFrame(input_df.taObjectCity.unique(), columns=['taObjectCity'])

    # The fresh 0..n-1 index becomes the id column.
    return distinct_cities.reset_index().rename(columns={'index': 'city_id'})


def merging_unique_user_city(left_df, right_df_1, right_df_2):
    """Attach the user and city lookups to the ratings frame.

    ``right_df_1`` is merged on 'username' first, then ``right_df_2`` on
    'taObjectCity'; both merges use the default inner join.
    """
    with_user = left_df.merge(right_df_1, on='username')
    return with_user.merge(right_df_2, on='taObjectCity')


def utility_matrix(result_df):
    """Collapse the ratings to one row per (user_id, city_id) pair.

    When a pair occurs several times its median rating is used. Returns a
    flat frame with the columns user_id, city_id and rating.
    """
    per_pair = result_df.groupby(['user_id', 'city_id'])
    median_ratings = per_pair.agg({'rating': 'median'})

    return median_ratings.reset_index()


In [None]:
#general
import pandas as pd
import numpy as np


# spark
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SQLContext, Row

import sys
import os



def spark_rdd(util_matrix):
    """Convert the pandas utility matrix into a Spark DataFrame.

    The three input columns are mapped, by position, onto the schema
    (user: int, city: int, rating: float); all fields are nullable.
    """
    # Reuse the running Spark session (or start one) and wrap its context.
    session = pyspark.sql.SparkSession.builder.getOrCreate()
    sql_context = SQLContext(session.sparkContext)

    rating_schema = StructType([
        StructField('user', IntegerType(), True),
        StructField('city', IntegerType(), True),
        StructField('rating', FloatType(), True),
    ])

    return sql_context.createDataFrame(util_matrix, rating_schema)



def ALS_model(spark_df):
    """Fit an ALS recommender on the full ratings frame and pickle its factors.

    The model is trained on every row of ``spark_df`` (no train/test split).
    The fitted user and item factor tables are converted to pandas and
    written to 'user_factor_df.pkl' and 'item_factor_df.pkl' in the working
    directory. Nothing is returned.
    """
    estimator = ALS(
        userCol='user',
        itemCol='city',
        ratingCol='rating',
        nonnegative=True,
        regParam=0.1,
        rank=15,
    )
    fitted = estimator.fit(spark_df)

    # Pull both factor tables to the driver before anything is written to disk.
    user_factors = fitted.userFactors.toPandas()
    item_factors = fitted.itemFactors.toPandas()

    user_factors.to_pickle('user_factor_df.pkl')
    item_factors.to_pickle('item_factor_df.pkl')



# Driver: load the raw workbooks, filter them, build the (user, city, rating)
# utility matrix and train the ALS model, whose factors are pickled by ALS_model.
if __name__ == '__main__':
    # loading dataframe (paths are relative to the unpacked project archive)
    reviews_file = 'travel-time-rec-master/data/reviews_32618_for_1098_users_with_location.xlsx'
    user_path = 'travel-time-rec-master/data/users_full_7034.xlsx'

    u_df = load_user_profile(user_path)
    r_df = load_reviews(reviews_file)


    # filtering dataframe: complete user profiles, attraction reviews only,
    # excluded users/locations removed, then restricted to popular cities
    u_filtered = filter_user(u_df)
    r_filtered = filter_review(r_df)
    merge_filtered = merge_review_and_user(u_filtered, r_filtered)
    merge_filtered = foreign_review_filter(merge_filtered)
    pop_city_lst = popular_city_list(merge_filtered)
    final_df = filter_final(merge_filtered, pop_city_lst)


    # prep df for spark rdd: integer ids for users and cities
    als_temp_df = prep_als_df(final_df)
    user_temp = unique_user_id(als_temp_df)
    city_temp = unique_city_id(als_temp_df)
    # with username, cityname, rating, user_id, city_id
    result_df = merging_unique_user_city(als_temp_df, user_temp, city_temp)
    # with user_id, city_id, rating - aggregated by cityid and userid
    util_matrix = utility_matrix(result_df)


    # Train on the whole utility matrix; ALS_model writes the factor pickles.
    spark_df = spark_rdd(util_matrix)
    ALS_model(spark_df)


    print("DONE MODELING! "*5)


  if sys.path[0] == '':


DONE MODELING! DONE MODELING! DONE MODELING! DONE MODELING! DONE MODELING! 


In [None]:
#general
import pandas as pd
import numpy as np
import pickle
# cluster

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

# similarity


class TravelModelMain():
    """Hybrid city recommender combining ALS latent factors, user-profile
    similarity and K-Means clusters of review text.

    ``fit`` clusters the cities by the TF-IDF vectors of their reviews.
    ``predict`` scores every city of a chosen cluster for one user by blending
    the dot product of the ALS factors with a similarity-weighted average of
    other users' ratings, and returns the best-scoring city names.
    """

    def __init__(self, user_df, item_df):
        """Store the factor tables.

        Both frames are expected to hold an ``id`` column and a ``features``
        column containing one latent vector per row.
        """
        self.user_df = user_df
        self.item_df = item_df

    def cluster_texts(self, corpus):
        """
        Transform texts to Tf-Idf coordinates.

        Returns a tuple ``(vectors, cols)``: the dense document-term array and
        the list of vocabulary terms (at most 500) naming its columns.
        """
        # Generic filler words plus place names that would otherwise dominate
        # the vocabulary.
        my_additional_stop_words = ['acute', 'good', 'great', 'really',
                                    'just', 'nice', 'like', 'day', 'ok',
                                    'visit', 'did', 'don', 'place', 'london',
                                    'paris','san', 'sydney', 'dubai','diego',
                                    'didn', 'fun', 'venice','boston', 'chicago',
                                    'tour', 'went', 'time', 'vegas', 'museum',
                                    'disney', 'barcelona', 'st', 'pm', 'sf',
                                    'worth', 'beautiful', 'la', 'interesting',
                                    'inside', 'outside', 'experience', 'singapore',
                                    'lot', 'free', 'istanbul', 'food', 'people',
                                    'way']
        # Newer scikit-learn releases validate ``stop_words`` and reject a
        # frozenset, so pass a plain (sorted, hence deterministic) list.
        stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words))

        vectorizer = TfidfVectorizer(stop_words=stop_words,
                                     max_features=500,
                                     lowercase=True)

        tfidf_model = vectorizer.fit_transform(corpus)
        vectors = tfidf_model.toarray()

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            cols = list(vectorizer.get_feature_names_out())
        else:
            cols = vectorizer.get_feature_names()

        return (vectors, cols)

    def cluster(self, vectors, cols, reviews, n_clusters=4):
        """
        Cluster the vectorized reviews with K-Means.

        Parameters
        ----------
        vectors : array-like
            One row per city, as produced by ``cluster_texts``.
        cols : list
            Vocabulary terms of ``vectors``; accepted for interface
            compatibility but not used.
        reviews : pandas.Series
            Its index supplies the label stored in ``city_index``.
        n_clusters : int, default 4
            Number of K-Means clusters.

        Returns
        -------
        pandas.DataFrame
            Columns ``cluster_k`` (assigned cluster) and ``city_index``.
        """
        kmeans = KMeans(n_clusters=n_clusters, random_state=10000000).fit(vectors)
        k_cities = pd.DataFrame(list(zip(list(kmeans.labels_),
                                         list(reviews.index))),
                                columns=['cluster_k', 'city_index'])

        return k_cities

    def fit(self, utility_matrix, invert_feature, city_temp, content, reviews):
        """Store the lookup tables and cluster the cities by review text.

        Parameters
        ----------
        utility_matrix : pandas.DataFrame
            Columns ``user_id``, ``city_id`` and ``rating``.
        invert_feature : pandas.DataFrame
            User feature matrix with one column per user id.
        city_temp : pandas.DataFrame
            City lookup with ``city_id`` and ``taObjectCity``.
        content : list of str
            One document per city, in the same order as ``reviews``.
        reviews : pandas.Series
            Indexed by the city label recorded for each cluster row.
        """
        self.utility_matrix = utility_matrix
        self.invert_feature = invert_feature
        self.city_temp = city_temp

        vector, cols = self.cluster_texts(content)
        self.cluster_df = self.cluster(vector, cols, reviews)

    def predict(self, cluster_id, user_id):
        """Recommend cities from one cluster for one user.

        Returns the names of the highest scoring cities of ``cluster_id``
        (at most three), best first.
        """
        up_cluster_df = selected_cities_in_cluster(self.cluster_df, cluster_id)
        selected_df = selected_city_df(up_cluster_df, self.city_temp)

        # The user's latent vector does not depend on the city: fetch it once.
        user_arr = self._latent_factors(self.user_df, user_id)

        final_rating_lst = []
        for city in selected_df.city_id:
            city_arr = self._latent_factors(self.item_df, city)

            # A user or city without factors gets an ALS score of 0, which
            # makes overall_rating fall back on the similarity score alone.
            if user_arr is None or city_arr is None:
                als_score = 0
            else:
                als_score = np.dot(user_arr, city_arr)

            final_sim_score = self.jaccard_sim_score(user_id, city,
                                                     self.invert_feature,
                                                     self.utility_matrix)
            final_rating = self.overall_rating(als_score, final_sim_score)
            final_rating_lst.append((final_rating, city))

        return self.top_list(final_rating_lst, selected_df)

    @staticmethod
    def _latent_factors(factor_df, entity_id):
        """Return the ``features`` vector of the row whose ``id`` matches, else None.

        Matching on the ``id`` column (rather than on the row position) is
        required because the factor tables are not guaranteed to be ordered
        by id.
        """
        match = factor_df.loc[factor_df['id'] == entity_id, 'features']
        if match.empty:
            return None
        return match.to_numpy()[0]

    def jaccard_sim_score(self, udi, cid, user_matrix, util_matrix):
        '''
        takes in user(index) and item
        returns the similarity-weighted average rating of the item

        Every user who rated city ``cid`` contributes their rating, weighted
        by the Jaccard similarity between their feature column and the feature
        column of user ``udi``. Returns 0.0 when nobody rated the city or when
        every similarity is zero (the original divided by zero there).
        '''
        raters = util_matrix[util_matrix.city_id == cid]
        if raters.empty:
            return 0.0

        # The module only binds names imported *from* sklearn, never ``sklearn``
        # itself, so ``sklearn.metrics.jaccard_score`` raised NameError. The
        # function is imported here, after the early exit, so it is only
        # required when there is something to compare.
        from sklearn.metrics import jaccard_score

        target_profile = list(user_matrix[udi].values)
        weighted_rating = 0.0
        similarity_total = 0.0

        # NOTE(review): assumes one row per (user_id, city_id), which is what
        # utility_matrix() produces - each row's own rating is used.
        for user, rating in zip(raters.user_id.values, raters.rating.values):
            sim_score = jaccard_score(target_profile, list(user_matrix[user].values))
            weighted_rating += sim_score * rating
            similarity_total += sim_score

        if similarity_total == 0:
            return 0.0

        return weighted_rating / similarity_total

    def overall_rating(self, als_score, jacc_sim_score):
        """Blend the two scores: 0.3 * similarity score + 0.7 * ALS score.

        When the ALS score is exactly 0 the similarity score is returned
        unchanged.
        """
        alpha = 0.3
        beta = 0.7
        if als_score == 0:
            return jacc_sim_score

        return alpha * jacc_sim_score + beta * als_score

    def top_list(self, final_rating_lst, selected_df, k=3):
        """Return the city names of the ``k`` best ``(rating, city_id)`` pairs.

        Pairs are sorted in descending order (ties broken by city id) and each
        city id is translated through ``selected_df``.
        """
        top_lst = sorted(final_rating_lst, reverse=True)[:k]

        rec_items = []
        for rating, city in top_lst:
            row = selected_df.loc[selected_df['city_id'] == city]
            rec_items.append(row.taObjectCity.values[0])

        return rec_items


In [None]:
import sys

import pandas as pd
import numpy as np
import pickle



# loading dataframe
# ------ load the raw workbooks ------
reviews_file = 'travel-time-rec-master/data/reviews_32618_for_1098_users_with_location.xlsx'
user_file = 'travel-time-rec-master/data/users_full_7034.xlsx'
personality_file = 'travel-time-rec-master/data/pers_scores_1098.xlsx'
# The articles workbook is named here but not loaded by this pipeline.
article_file = 'travel-time-rec-master/data/articles_159.xlsx'

u_df = load_user_profile(user_file)
r_df = load_reviews(reviews_file)
p_df = load_personality_scores(personality_file)


# ------ filter: complete profiles, attraction reviews, popular cities ------
u_filtered = filter_user(u_df)
r_filtered = filter_review(r_df)
merge_filtered = merge_review_and_user(u_filtered, r_filtered)
merge_filtered = foreign_review_filter(merge_filtered)
pop_city_lst = popular_city_list(merge_filtered)
final_df = filter_final(merge_filtered, pop_city_lst)


# ------ utility matrix (same preparation as for the ALS training) ------
als_temp_df = prep_als_df(final_df)
user_temp = unique_user_id(als_temp_df)
city_temp = unique_city_id(als_temp_df)
# with username, cityname, rating, user_id, city_id
result_df = merging_unique_user_city(als_temp_df, user_temp, city_temp)
# with user_id, city_id, rating - aggregated by cityid and userid
util_matrix = utility_matrix(result_df)


# ------ personality flags (at/above the median per trait) ------
only_per_df = user_personality_score_merge(p_df, user_temp)
personality_matrix_df = mapping_personality(only_per_df)
c_personality_matrix_df = cleaning_personality_df(personality_matrix_df)


# ------ user-feature matrix ------
feature_temp_0 = user_feature_filter(final_df)
# travel_style() adds 'new_travel' to feature_temp_0 in place; the steps below
# rely on that, because combine_all_dummies() drops 'new_travel' from a frame
# derived from feature_temp_0.
feature_temp = travel_style(feature_temp_0)
style_df = travel_matrix(feature_temp)
feature_temp_1 = age_gender_dummie(feature_temp_0)
# Transposed so that each column holds the features of one user.
invert_feature = combine_all_dummies(feature_temp_1, style_df, c_personality_matrix_df).T


# ------ clustering input: one document (titles + texts) per city ------
cluster_input_df = cluster_prep_filter(final_df)
df_title_comb = grouping_city_title(cluster_input_df)
df_text_comb = grouping_city_text(cluster_input_df)
cluster_final = merging_content(df_title_comb, df_text_comb)

reviews = cluster_final.title + ' ' + cluster_final.text
content = list(reviews)


# ------ model ------
# The factor pickles are the ones written by ALS_model().
user_df = pd.read_pickle('user_factor_df.pkl')
item_df = pd.read_pickle('item_factor_df.pkl')

travel_m = TravelModelMain(user_df, item_df)

# ------ fit ------
travel_m.fit(util_matrix, invert_feature, city_temp, content, reviews)

# Persist the fitted model. The context manager closes the file handle, which
# the bare open() call it replaces never did.
with open('samp.p', 'wb') as model_file:
    pickle.dump(travel_m, model_file)


  if sys.path[0] == '':





