
# TF-recomm

![Pic1](./graph_svd.png)

users.dat_ => UserID::Gender::Age::Occupation::Zip-code<br>
movies.dat  => MovieID::Title::Genres<br>
ratings.dat => UserID::MovieID::Rating::Timestamp<br>

# imports

In [1]:
import time
from collections import deque
import socket
import sys
import numpy as np
import tensorflow as tf
from six import next
from tensorflow.core.framework import summary_pb2
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
# tensorboard --logdir=.\_zfp\data\my_graph
# tensorboard => http://localhost:6006 
# jupyter => http://localhost:8889

# import dataio
# import ops

In [2]:
import random
import matplotlib.pyplot as plt
# from wordcloud import WordCloud, STOPWORDS #used to generate world cloud

from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd

# OPS

In [None]:
# SVD = Singular Value Decomposition 
def inference_svd(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        bias_global = tf.get_variable("bias_global", shape=[])
        w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
        w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
        w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name="svd_regularizer")
    return infer, regularizer


def optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    global_step = tf.train.get_global_step()
    assert global_step is not None
    with tf.device(device):
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step=global_step)
    return cost, train_op


# DATAIO

In [None]:
def read_movies(filname, sep="::"):
    col_names = ["movie", "title", "tags"]
    df = pd.read_csv(filname, sep=sep, header=None, names=col_names, engine='python')
    #print (df)
    print ("Movies file length:")
    print (len(df))	

    print (df["title"][0])
    print (df["movie"][0])
    print (df["tags"][0])
    df["movie"] = df["movie"].astype(np.int32)
    return df

def read_process(filname, sep="\t"):
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filname, sep=sep, header=None, names=col_names, engine='python')
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df

In [None]:
class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]

In [None]:
class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]

# start! 

In [3]:
np.random.seed(13575)

BATCH_SIZE = 1000
USER_NUM = 6040
ITEM_NUM = 3952
DIM = 15
EPOCH_MAX = 100
DEVICE = "/cpu:0"

In [4]:
def clip(x):
    return np.clip(x, 1.0, 5.0)


def make_scalar_summary(name, val):
    return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])

def get_data():
    df = read_process("../../_data_tmp/movielens/ml-1m/ratings.dat", sep="::")
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test, rows

def get_movies():
    df = read_movies("../../_data_tmp/movielens/ml-1m/movies.dat", sep="::")
    rows = len(df)
    return df, rows

# Data

In [5]:
df_train, df_test, length = get_data()
df_movies,rows = get_movies()

Movies file length:
3883
Toy Story (1995)
1
Animation|Children's|Comedy


In [6]:
print("ratings: {} - tr{} - ts{}" .format(length, len(df_train),len(df_test) ))
df_ratings = pd.concat([df_train, df_test ] )
len(df_ratings)

ratings: 1000209 - tr900188 - ts100021


1000209

In [7]:
# print(df_train.iloc[0:10])
# df_train.describe()

In [8]:
print ("Movies file length: {}" .format(len(df_movies)))

Movies file length: 3883


In [9]:
df_movies.iloc[0:10]

Unnamed: 0,movie,title,tags
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


# Basic VIS

In [25]:
print ("Movies file length: {}" .format(len(df_movies)))

Movies file length: 3883


In [26]:
df_movies.iloc[0].title

'Toy Story (1995)'

In [28]:
# type(df_movies)
# get titles
# df_movies.head() # display first 5entries
df_movies.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
movie    3883 non-null int32
title    3883 non-null object
tags     3883 non-null object
dtypes: int32(1), object(2)
memory usage: 75.9+ KB


In [33]:
#number of unique movies
movies = df_movies['movie'].unique().tolist()
print("unique number of movies: " + str(len(movies)) )

unique number of movies: 3883


In [17]:
# df_ratings.describe() #min 1 - max 5

In [65]:
df_ratings.head()

Unnamed: 0,user,item,rate,st
0,2662,3209,4.0,973459383
1,1752,1911,5.0,974702638
2,2105,2452,1.0,974755210
3,1779,3051,2.0,1027037687
4,1697,1526,3.0,974773234


In [57]:
# data clean up: 
df_movies.shape # 3883, 3
df_movies.isnull().any() #is any row null: False 
df_ratings.shape #1000209 4
df_ratings.isnull().any() # False! 
# ? tags... ml2-m ... 
# tags_data=pd.read_csv('../input/tags.csv',sep=',')
# tags_data.shape
# tags_data.isnull().any()
# tags_data.dopna()
# unique_tags=tags_data['tag'].unique().tolist()
# len(unique_tags)
print("data clean up")

data clean up


In [18]:
comedy_movies = df_movies['tags'].str.contains('Comedy')
# df_movies[comedy_movies].head()

In [19]:
movie_data_ratings_data=df_movies.merge(df_ratings, left_on = "movie" , right_on = 'item', how = 'inner') #on= "movieId"
# movie_data_ratings_data.head(3)

In [20]:
high_rated= movie_data_ratings_data['rate']>4.0
# movie_data_ratings_data[high_rated].head(10)


In [21]:
most_rated = movie_data_ratings_data.groupby('title').size().sort_values(ascending=False)[:25]
# most_rated.head(25)

In [22]:
# df_movies[['title','genres']].head()
# get the year of the movies: 
df_movies['year'] =df_movies['title'].str.extract('.*\((.*)\).*',expand = False)
# df_movies.head(5)



In [26]:
# count how many times each genre appear: 
def count_word(df, ref_col, liste):
    keyword_count = dict()
    for s in liste: keyword_count[s] = 0
    for liste_keywords in df[ref_col].str.split('|'):
        if type(liste_keywords) == float and pd.isnull(liste_keywords): continue
        for s in liste_keywords: 
            if pd.notnull(s): keyword_count[s] += 1
    # convert the dictionary in a list to sort the keywords  by frequency
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

In [27]:
genre_labels = set()
for s in df_movies['tags'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))

In [28]:
keyword_occurences, dum = count_word(df_movies, 'tags', genre_labels)
keyword_occurences

[['Drama', 1603],
 ['Comedy', 1200],
 ['Action', 503],
 ['Thriller', 492],
 ['Romance', 471],
 ['Horror', 343],
 ['Adventure', 283],
 ['Sci-Fi', 276],
 ["Children's", 251],
 ['Crime', 211],
 ['War', 143],
 ['Documentary', 127],
 ['Musical', 114],
 ['Mystery', 106],
 ['Animation', 105],
 ['Western', 68],
 ['Fantasy', 68],
 ['Film-Noir', 44]]

In [29]:
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    h = int(360.0 * tone / 255.0)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)


#Finally, the result is shown as a wordcloud:
words = dict()
trunc_occurences = keyword_occurences[0:50]
for s in trunc_occurences:
    words[s[0]] = s[1]
tone = 100 # define the color of the words
f, ax = plt.subplots(figsize=(14, 6))
# wordcloud = WordCloud(width=550,height=300, background_color='black', 
#                       max_words=1628,relative_scaling=0.7,
#                       color_func = random_color_func,
#                       normalize_plurals=False)
# wordcloud.generate_from_frequencies(words)
# plt.imshow(wordcloud, interpolation="bilinear")
# plt.axis('off')
# plt.show()

In [30]:
fig = plt.figure(1, figsize=(18,13))
ax2 = fig.add_subplot(2,1,2)
y_axis = [i[1] for i in trunc_occurences]
x_axis = [k for k,i in enumerate(trunc_occurences)]
x_label = [i[0] for i in trunc_occurences]
plt.xticks(rotation=85, fontsize = 15)
plt.yticks(fontsize = 15)
plt.xticks(x_axis, x_label)
plt.ylabel("No. of occurences", fontsize = 24, labelpad = 0)
ax2.bar(x_axis, y_axis, align = 'center', color='r')
plt.title("Popularity of Genres",bbox={'facecolor':'k', 'pad':5},color='w',fontsize = 30)
# plt.show()

<matplotlib.text.Text at 0x76dcf60>

# Network + train + test

In [14]:
tf.reset_default_graph()
samples_per_batch = len(df_train) // BATCH_SIZE
print("Samples per batch = {}".format(samples_per_batch))
iter_train = ShuffleIterator([df_train["user"],
                                     df_train["item"],
                                    df_train["rate"]],
                                    batch_size=BATCH_SIZE)

iter_test = OneEpochIterator([df_test["user"],
                                     df_test["item"],
                                    df_test["rate"]],
                                    batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM, device=DEVICE)
global_step = tf.contrib.framework.get_or_create_global_step()
_, train_op = optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)
#zeros= tf.Variable(tf.zeros([1]),name="zeros")


Samples per batch = 900


In [15]:

def svd(train, test,length,moviefile, trainFl=False):
    init_op = tf.global_variables_initializer()
    saver=tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)
        if trainFl == True: 
            summary_writer = tf.summary.FileWriter(logdir="./tmp/svd/log", graph=sess.graph)
            print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
            errors = deque(maxlen=samples_per_batch)
            start = time.time()
            for i in range(EPOCH_MAX * samples_per_batch):
                users, items, rates = next(iter_train)
                _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                    item_batch: items,
                                                                                                                        rate_batch: rates})
                pred_batch = clip(pred_batch)
                errors.append(np.power(pred_batch - rates, 2))
                if i % samples_per_batch == 0:
                    train_err = np.sqrt(np.mean(errors))
                    test_err2 = np.array([])
                    for users, items, rates in iter_test:
                        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                                item_batch: items})
                        pred_batch = clip(pred_batch)
                        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                    end = time.time()
                    test_err = np.sqrt(np.mean(test_err2))
                    print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, test_err,
                                                        end - start))
                    train_err_summary = make_scalar_summary("training_error", train_err)
                    test_err_summary = make_scalar_summary("test_error", test_err)
                    summary_writer.add_summary(train_err_summary, i)
                    summary_writer.add_summary(test_err_summary, i)
                    start = end

            #meta_graph_def = tf.train.export_meta_graph(filename='/tmp/tfrecomm.meta')
            save_path=saver.save(sess,"./tmp/")
        else: 
            print("model restored")
            saver.restore(sess, "./tmp/")

# Network + train + test v2

In [None]:
tf.reset_default_graph()
samples_per_batch = len(df_train) // BATCH_SIZE
print("Samples per batch = {}".format(samples_per_batch))
iter_train = ShuffleIterator([df_train["user"],
                                     df_train["item"],
                                    df_train["rate"]],
                                    batch_size=BATCH_SIZE)

iter_test = OneEpochIterator([df_test["user"],
                                     df_test["item"],
                                    df_test["rate"]],
                                    batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM, device=DEVICE)
global_step = tf.contrib.framework.get_or_create_global_step()
_, train_op = optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)
#zeros= tf.Variable(tf.zeros([1]),name="zeros")


In [None]:

def svd2(train, test,length,moviefile, trainFl=False):
    init_op = tf.global_variables_initializer()
    saver=tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)
        if trainFl == True: 
            summary_writer = tf.summary.FileWriter(logdir="./tmp/svd/log", graph=sess.graph)
            print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
            errors = deque(maxlen=samples_per_batch)
            start = time.time()
            for i in range(EPOCH_MAX * samples_per_batch):
                users, items, rates = next(iter_train)
                _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                    item_batch: items,
                                                                                                                        rate_batch: rates})
                pred_batch = clip(pred_batch)
                errors.append(np.power(pred_batch - rates, 2))
                if i % samples_per_batch == 0:
                    train_err = np.sqrt(np.mean(errors))
                    test_err2 = np.array([])
                    for users, items, rates in iter_test:
                        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                                item_batch: items})
                        pred_batch = clip(pred_batch)
                        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                    end = time.time()
                    test_err = np.sqrt(np.mean(test_err2))
                    print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, test_err,
                                                        end - start))
                    train_err_summary = make_scalar_summary("training_error", train_err)
                    test_err_summary = make_scalar_summary("test_error", test_err)
                    summary_writer.add_summary(train_err_summary, i)
                    summary_writer.add_summary(test_err_summary, i)
                    start = end

            #meta_graph_def = tf.train.export_meta_graph(filename='/tmp/tfrecomm.meta')
            save_path=saver.save(sess,"./tmp/")
        else: 
            print("model restored")
            saver.restore(sess, "./tmp/")

In [16]:
svd(df_train, df_test, length,df_movies, trainFl=False) 
print("Done!")

model restored
INFO:tensorflow:Restoring parameters from ./tmp/
Done!


# EXECUTION

In [14]:
def printMM(topmovies):
    print (topmovies)
    print("TOP Movies")
    for i,r in topmovies:
        print("{0:5} - {1:1.2f} - {2}" .format(i,  r, df_movies.iloc[i].title  ))

In [15]:
def test(train, test,length,moviefile, data, trainFl=False):
    init_op = tf.global_variables_initializer()
    #saver=tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)
        #saver.restore(sess, "./tmp/")

        movies=list(range(len(moviefile)))
        #print (movies)
        users=[1]
        pred_batch = sess.run(infer, feed_dict={user_batch: users,item_batch: movies})
        moviesrecomm=list(zip(movies,pred_batch))
        smovies=sorted (moviesrecomm,key=lambda x:x[1],reverse=True)

        print ("\nTop Movies ------------------------------------------------------------")
        topmovies= smovies[0:10]
        #print (topmovies)
        printMM(topmovies)
        
        #-----------------------------------------------------------------------------
        
        print ("\n User - data {} ------------------------------------------------------------\n" .format(data))
        # give number between 1 - 5000
        del users[:]
        users.append(int(data))
        pred_batch = sess.run(infer, feed_dict={user_batch: users,item_batch: movies})
        moviesrecomm=list(zip(movies,pred_batch))
        smovies=sorted (moviesrecomm,key=lambda x:x[1],reverse=True)
        topmovies= smovies[0:10]
        printMM(topmovies)
        for item in topmovies:
            itopmovie=item[0]
            recommendedmovie=moviefile["title"][itopmovie]
            recommendedtags=moviefile["tags"][itopmovie]
    return

In [16]:
test(df_train, df_test, length,df_movies, data=2, trainFl=False) 


Top Movies ------------------------------------------------------------
[(3005, 0.13799244), (1592, 0.13783297), (2703, 0.13691211), (2854, 0.13683367), (2291, 0.13662498), (176, 0.13657382), (1287, 0.13641189), (1208, 0.13638708), (2941, 0.13635924), (75, 0.13625325)]
TOP Movies
 3005 - 0.14 - Jeremiah Johnson (1972)
 1592 - 0.14 - Stag (1997)
 2703 - 0.14 - Detroit Rock City (1999)
 2854 - 0.14 - Citizen's Band (a.k.a. Handle with Care) (1977)
 2291 - 0.14 - Celebration, The (Festen) (1998)
  176 - 0.14 - Love & Human Remains (1993)
 1287 - 0.14 - When Harry Met Sally... (1989)
 1208 - 0.14 - Quiet Man, The (1952)
 2941 - 0.14 - Rosetta (1999)
   75 - 0.14 - Screamers (1995)

 User - data 2 ------------------------------------------------------------

[(2658, 0.13912864), (1461, 0.13912794), (3758, 0.13898048), (2999, 0.13883805), (3534, 0.13878934), (3215, 0.13874154), (2854, 0.13854349), (3228, 0.13849339), (547, 0.13841943), (1131, 0.13836853)]
TOP Movies
 2658 - 0.14 - Killer's 

# NEXT = SHOW EMBEDDING GRAPH

# TO DO: 
* Read Users 
* Compare results for similar users 
* Show statistics


## Users Information
- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

- Occupation is chosen from the following choices:
	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [15]:
import pandas as pd

In [16]:
def get_users():
    col_names = ["userID", "gender", "age", "occupation", "zipcode"]
    df = pd.read_csv("./tmp/movielens/ml-1m/users.dat", sep="::", header=None, names=col_names, engine='python')
    rows = len(df)
    return df, rows


In [17]:
df_users,rows = get_users()


In [18]:
print(len(df_users))
df_users.describe()

6040


Unnamed: 0,userID,age,occupation
count,6040.0,6040.0,6040.0
mean,3020.5,30.639238,8.146854
std,1743.742145,12.895962,6.329511
min,1.0,1.0,0.0
25%,1510.75,25.0,3.0
50%,3020.5,25.0,7.0
75%,4530.25,35.0,14.0
max,6040.0,56.0,20.0


In [19]:
df_users.iloc[0:10]

Unnamed: 0,userID,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


## FIND similar users 


In [20]:
def get_users_where(age="*", gender="*", occupation="*", zipcode="*"): 
    us = df_users
    if age != "*":
        us = us[us["age"]==age]
    if gender != "*":
        us = us[us["gender"]==gender]
    if occupation != "*":
        us = us[us["occupation"]==occupation]
    if zipcode != "*":
        us = us[us["zipcode"]==zipcode]    
    return us # return the data 
#     return len(us) return the length of each selection

# print(get_users_where(age=1) )
# print(get_users_where(age=1, gender ="M") )
print(get_users_where(age=1, gender ="M", occupation = 1 ) )
# print(get_users_where(age=1, gender ="M", occupation = 1, zipcode = 48067 ) )


      userID gender  age  occupation zipcode
1814    1815      M    1           1   30707
2059    2060      M    1           1   48304
2852    2853      M    1           1  444555


# Compare results - Show statistics

In [22]:
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    #saver.restore(sess, "./tmp/")
    movies=list(range(len(df_movies)))

    user_IDEN = get_users_where(age=1, gender ="M", occupation = 1 )
    print ("\nTop Movies ------------------------------------------------------------")

    for i in range(len(user_IDEN)):
        userID = user_IDEN.iloc[i]["userID"]
        print(userID)
        users=[userID]
        pred_batch = sess.run(infer, feed_dict={user_batch: users,item_batch: movies})
        moviesrecomm=list(zip(movies,pred_batch))
        smovies=sorted (moviesrecomm,key=lambda x:x[1],reverse=True)
        topmovies= smovies[0:10]
        printMM(topmovies)


Top Movies ------------------------------------------------------------
1815
[(3154, -1.2711122), (542, -1.2713907), (1784, -1.2718444), (2258, -1.2719319), (1000, -1.2721136), (3471, -1.2722907), (2529, -1.2724241), (3009, -1.2725339), (1840, -1.2727556), (1460, -1.2728013)]
TOP Movies
 3154 - -1.27 - Zed & Two Noughts, A (1985)
  542 - -1.27 - Super Mario Bros. (1993)
 1784 - -1.27 - Alan Smithee Film: Burn Hollywood Burn, An (1997)
 2258 - -1.27 - Tales from the Darkside: The Movie (1990)
 1000 - -1.27 - Parent Trap, The (1961)
 3471 - -1.27 - Passion of Mind (1999)
 2529 - -1.27 - Pushing Tin (1999)
 3009 - -1.27 - Liberty Heights (1999)
 1840 - -1.27 - X-Files: Fight the Future, The (1998)
 1460 - -1.27 - B*A*P*S (1997)
2060
[(3070, -1.3068042), (3471, -1.3079368), (1512, -1.3079507), (3552, -1.3081417), (2711, -1.3082843), (408, -1.3083031), (734, -1.308417), (542, -1.3084476), (847, -1.3085543), (472, -1.3087789)]
TOP Movies
 3070 - -1.31 - Tarzan the Fearless (1933)
 3471 - -1

# movies watched by users... 

In [34]:
def getAg(age):
    if age == 1:    return "Under 18"
    elif age == 18: return "18 - 24 "
    elif age == 25: return "25 - 34 "
    elif age == 35: return "35 - 44 "
    elif age == 45: return "45 - 49 "
    elif age == 50: return "50 - 55 "
    elif age == 56: return "Above 56"

def getOc(occ):
    if occ == 0:    return "other"
    elif occ == 1:  return "academic/educator"
    elif occ == 2:  return "artist"
    elif occ == 3:  return "clerical/admin"
    elif occ == 4:  return "college/grad student"
    elif occ == 5:  return "customer service"
    elif occ == 6:  return "doctor/health care"
    elif occ == 7:  return "executive/managerial"
    elif occ == 8:  return "farmer"
    elif occ == 9:  return "homemaker"
    elif occ == 10: return "K-12 student"
    elif occ == 11: return "lawyer"
    elif occ == 12: "programmer"
    elif occ == 13: "retired"
    elif occ == 14: "sales/marketing"
    elif occ == 15: "scientist"
    elif occ == 16: "self-employed"
    elif occ == 17: "technician/engineer"
    elif occ == 18: "tradesman/craftsman"
    elif occ == 19: "unemployed"
    elif occ == 20: "writer"
        
userID = 1.
user = df_users[df_users["userID"]==userID]

print("id: {}, G: {}, age {}, oc: {}, zc: {}" 
      .format(user["userID"][0], user["gender"][0], getAg(user["age"][0]), user["occupation"][0], user["zipcode"][0] ))
# for i,r in topmovies:
#     print("{0:5} - {1:1.2f} - {2}" .format(i,  r, df_movies.iloc[i].title  ))


id: 1, G: F, age Under 18, oc: 10, zc: 48067


1