
# TF-recomm


In [1]:
import time
from collections import deque
import socket
import sys
import numpy as np
import tensorflow as tf
from six import next
from tensorflow.core.framework import summary_pb2
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file

import dataio
import ops
# tensorboard --logdir=.\_zfp\data\my_graph
# tensorboard => http://localhost:6006 
# jupyter => http://localhost:8889

In [2]:
np.random.seed(13575)

BATCH_SIZE = 1000
USER_NUM = 6040
ITEM_NUM = 3952
DIM = 15
EPOCH_MAX = 100
DEVICE = "/cpu:0"

In [3]:
def clip(x):
    return np.clip(x, 1.0, 5.0)


def make_scalar_summary(name, val):
    return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])

def get_data():
    df = dataio.read_process("./tmp/movielens/ml-1m/ratings.dat", sep="::")
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test, rows

def get_movies():
    df = dataio.read_movies("./tmp/movielens/ml-1m/movies.dat", sep="::")
    rows = len(df)
    return df, rows

# Data

In [4]:
df_train, df_test, length = get_data()
df_movies,rows = get_movies()

Movies file length:
3883
Toy Story (1995)
1
Animation|Children's|Comedy


In [5]:
df_train.iloc[0:10]
df_train.describe()

Unnamed: 0,user,item,rate,st
count,900188.0,900188.0,900188.0,900188.0
mean,3022.600601,1864.635758,3.581352,972246100.0
std,1728.384446,1096.003424,1.117216,12153770.0
min,0.0,0.0,1.0,956703900.0
25%,1504.0,1029.0,3.0,965302700.0
50%,3068.0,1834.0,4.0,973021500.0
75%,4475.0,2769.0,4.0,975221200.0
max,6039.0,3951.0,5.0,1046455000.0


In [6]:
print ("Movies file length: {}" .format(len(df_movies)))

Movies file length: 3883


In [7]:
df_movies.iloc[0:10]

Unnamed: 0,movie,title,tags
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [8]:
df_movies.iloc[0].title

'Toy Story (1995)'

In [9]:
# type(df_movies)

# Network + train + test

In [10]:

samples_per_batch = len(df_train) // BATCH_SIZE

iter_train = dataio.ShuffleIterator([df_train["user"],
                                     df_train["item"],
                                    df_train["rate"]],
                                    batch_size=BATCH_SIZE)

iter_test = dataio.OneEpochIterator([df_test["user"],
                                     df_test["item"],
                                    df_test["rate"]],
                                    batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = ops.inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM, device=DEVICE)
global_step = tf.contrib.framework.get_or_create_global_step()
_, train_op = ops.optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)
#zeros= tf.Variable(tf.zeros([1]),name="zeros")


In [11]:

def svd(train, test,length,moviefile, trainFl=False):
    init_op = tf.global_variables_initializer()
    saver=tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)
        if trainFl == True: 
            summary_writer = tf.summary.FileWriter(logdir="./tmp/svd/log", graph=sess.graph)
            print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
            errors = deque(maxlen=samples_per_batch)
            start = time.time()
            for i in range(EPOCH_MAX * samples_per_batch):
                users, items, rates = next(iter_train)
                _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                    item_batch: items,
                                                                                                                        rate_batch: rates})
                pred_batch = clip(pred_batch)
                errors.append(np.power(pred_batch - rates, 2))
                if i % samples_per_batch == 0:
                    train_err = np.sqrt(np.mean(errors))
                    test_err2 = np.array([])
                    for users, items, rates in iter_test:
                        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                                item_batch: items})
                        pred_batch = clip(pred_batch)
                        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                    end = time.time()
                    test_err = np.sqrt(np.mean(test_err2))
                    print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, test_err,
                                                        end - start))
                    train_err_summary = make_scalar_summary("training_error", train_err)
                    test_err_summary = make_scalar_summary("test_error", test_err)
                    summary_writer.add_summary(train_err_summary, i)
                    summary_writer.add_summary(test_err_summary, i)
                    start = end

            #meta_graph_def = tf.train.export_meta_graph(filename='/tmp/tfrecomm.meta')
            save_path=saver.save(sess,"./tmp/")
        else: 
            print("model restored")
            saver.restore(sess, "./tmp/")

In [12]:
svd(df_train, df_test, length,df_movies, trainFl=False) 
print("Done!")

model restored
INFO:tensorflow:Restoring parameters from ./tmp/
Done!


# EXECUTION

In [13]:
def printMM(topmovies):
    print (topmovies)
    print("TOP Movies")
    for i,r in topmovies:
        print("{0:5} - {1:1.2f} - {2}" .format(i,  r, df_movies.iloc[i].title  ))

In [14]:
def test(train, test,length,moviefile, data, trainFl=False):
    init_op = tf.global_variables_initializer()
    #saver=tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)
        #saver.restore(sess, "./tmp/")

        movies=list(range(len(moviefile)))
        #print (movies)
        users=[1]
        pred_batch = sess.run(infer, feed_dict={user_batch: users,item_batch: movies})
        moviesrecomm=list(zip(movies,pred_batch))
        smovies=sorted (moviesrecomm,key=lambda x:x[1],reverse=True)

        print ("\nTop Movies ------------------------------------------------------------")
        topmovies= smovies[0:10]
        #print (topmovies)
        printMM(topmovies)
        
        #-----------------------------------------------------------------------------
        
        print ("\n User - data {} ------------------------------------------------------------\n" .format(data))
        # give number between 1 - 5000
        del users[:]
        users.append(int(data))
        pred_batch = sess.run(infer, feed_dict={user_batch: users,item_batch: movies})
        moviesrecomm=list(zip(movies,pred_batch))
        smovies=sorted (moviesrecomm,key=lambda x:x[1],reverse=True)
        topmovies= smovies[0:10]
        printMM(topmovies)
        for item in topmovies:
            itopmovie=item[0]
            recommendedmovie=moviefile["title"][itopmovie]
            recommendedtags=moviefile["tags"][itopmovie]
    return

In [15]:
test(df_train, df_test, length,df_movies, data=2, trainFl=False) 


Top Movies ------------------------------------------------------------
[(1771, -1.2675118), (3540, -1.2677344), (1786, -1.2677588), (26, -1.2678206), (1154, -1.2679795), (2325, -1.2680091), (3282, -1.2681811), (835, -1.2682223), (583, -1.2682619), (2407, -1.2682711)]
TOP Movies
 1771 - -1.27 - He Got Game (1998)
 3540 - -1.27 - Regret to Inform (1998)
 1786 - -1.27 - Krippendorf's Tribe (1998)
   26 - -1.27 - Now and Then (1995)
 1154 - -1.27 - Best of the Best 3: No Turning Back (1995)
 2325 - -1.27 - Prince of Egypt, The (1998)
 3282 - -1.27 - Two Thousand Maniacs! (1964)
  835 - -1.27 - Flirt (1995)
  583 - -1.27 - Ghost (1990)
 2407 - -1.27 - Heartbreak Ridge (1986)

 User - data 2 ------------------------------------------------------------

[(927, -1.3004175), (1771, -1.3004334), (2267, -1.3004442), (1548, -1.3004878), (3521, -1.3006651), (1922, -1.3007363), (1542, -1.3007843), (1720, -1.3008478), (965, -1.3008878), (192, -1.3009979)]
TOP Movies
  927 - -1.30 - Reluctant Debuta

# TO DO: 
* Read Users 
* Compare results for similar users 
* Show statistics


## Users Information
- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

- Occupation is chosen from the following choices:
	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [16]:
import pandas as pd

In [17]:
def get_users():
    col_names = ["userID", "gender", "age", "occupation", "zipcode"]
    df = pd.read_csv("./tmp/movielens/ml-1m/users.dat", sep="::", header=None, names=col_names, engine='python')
    rows = len(df)
    return df, rows


In [18]:
df_users,rows = get_users()


In [19]:
print(len(df_users))
df_users.describe()

6040


Unnamed: 0,userID,age,occupation
count,6040.0,6040.0,6040.0
mean,3020.5,30.639238,8.146854
std,1743.742145,12.895962,6.329511
min,1.0,1.0,0.0
25%,1510.75,25.0,3.0
50%,3020.5,25.0,7.0
75%,4530.25,35.0,14.0
max,6040.0,56.0,20.0


In [20]:
df_users.iloc[0:10]

Unnamed: 0,userID,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


## FIND similar users 


In [21]:
def get_users_where(age="*", gender="*", occupation="*", zipcode="*"): 
    us = df_users
    if age != "*":
        us = us[us["age"]==age]
    if gender != "*":
        us = us[us["gender"]==gender]
    if occupation != "*":
        us = us[us["occupation"]==occupation]
    if zipcode != "*":
        us = us[us["zipcode"]==zipcode]    
    return us # return the data 
#     return len(us) return the length of each selection

# print(get_users_where(age=1) )
# print(get_users_where(age=1, gender ="M") )
print(get_users_where(age=1, gender ="M", occupation = 1 ) )
# print(get_users_where(age=1, gender ="M", occupation = 1, zipcode = 48067 ) )


      userID gender  age  occupation zipcode
1814    1815      M    1           1   30707
2059    2060      M    1           1   48304
2852    2853      M    1           1  444555


# Compare results - Show statistics

In [33]:
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    #saver.restore(sess, "./tmp/")
    movies=list(range(len(df_movies)))

    user_IDEN = get_users_where(age=1, gender ="M", occupation = 1 )
    print ("\nTop Movies ------------------------------------------------------------")

    for i in range(len(user_IDEN)):
        userID = user_IDEN.iloc[i]["userID"]
        print(userID)
        users=[userID]
        pred_batch = sess.run(infer, feed_dict={user_batch: users,item_batch: movies})
        moviesrecomm=list(zip(movies,pred_batch))
        smovies=sorted (moviesrecomm,key=lambda x:x[1],reverse=True)
        topmovies= smovies[0:10]
        printMM(topmovies)


Top Movies ------------------------------------------------------------
1815
[(3726, -0.699139), (2706, -0.69941115), (855, -0.69973123), (3066, -0.69974875), (1121, -0.69985551), (2901, -0.69997811), (1463, -0.70040929), (3378, -0.70047867), (1016, -0.70048547), (1637, -0.70050257)]
TOP Movies
 3726 - -0.70 - Five Senses, The (1999)
 2706 - -0.70 - Head On (1998)
  855 - -0.70 - Bound (1996)
 3066 - -0.70 - Great Santini, The (1979)
 1121 - -0.70 - Hustler White (1996)
 2901 - -0.70 - Fitzcarraldo (1982)
 1463 - -0.70 - Turbo: A Power Rangers Movie (1997)
 3378 - -0.70 - Good Earth, The (1937)
 1016 - -0.70 - Dumbo (1941)
 1637 - -0.70 - Wings of the Dove, The (1997)
2060
[(3817, -0.71547538), (459, -0.71554816), (526, -0.71557635), (2710, -0.7156769), (2848, -0.71575791), (672, -0.71587574), (1774, -0.71616536), (3063, -0.71655875), (3445, -0.71656179), (1408, -0.71668404)]
TOP Movies
 3817 - -0.72 - Went to Coney Island on a Mission From God... Be Back by Five (1998)
  459 - -0.72 