<a href="https://colab.research.google.com/github/codesongs/codestates_TP2/blob/main/DeepFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding ratings.dat / movies.dat / users.dat.
# Set the DATASET_PATH environment variable to run the notebook on another machine;
# the original author's local directory is kept as the fallback.
dataset_path = os.environ.get(
    "DATASET_PATH",
    'C:/Users/ktj45/Downloads/project2/ds-sa-recommendation-main/datasets',
)

In [None]:
# Raises immediately if the dataset directory cannot be listed; the listing itself
# is discarded because this call is not the last expression in the cell.
os.listdir(dataset_path)
def load_ratings():
    """Read ``ratings.dat`` from ``dataset_path`` into a DataFrame.

    The file has no header row and uses ``::`` as the field separator, so the
    column names are supplied here.

    Returns:
        DataFrame with columns ``userId``, ``movieId``, ``rating``, ``timestamp``.
    """
    ratings_file = os.path.join(dataset_path, "ratings.dat")
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    return pd.read_csv(
        ratings_file,
        sep='::',
        header=None,
        engine='python',  # multi-character separators need the python parser
        names=column_names,
    )

def load_movies():
    """Read ``movies.dat`` from ``dataset_path`` into a DataFrame.

    The file has no header row, uses ``::`` as the field separator and is
    decoded as ISO-8859-1.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres``.
    """
    movies_file = os.path.join(dataset_path, "movies.dat")
    column_names = ['movieId', 'title', 'genres']
    return pd.read_csv(
        movies_file,
        sep='::',
        header=None,
        engine='python',  # multi-character separators need the python parser
        names=column_names,
        encoding='ISO-8859-1',
    )

def load_users():
    """Read ``users.dat`` from ``dataset_path`` into a DataFrame.

    The file has no header row and uses ``::`` as the field separator.

    Returns:
        DataFrame with columns ``userId``, ``gender``, ``age``, ``Occupation``,
        ``zip_code``.
    """
    users_file = os.path.join(dataset_path, "users.dat")
    column_names = ['userId', 'gender', 'age', 'Occupation', 'zip_code']
    return pd.read_csv(
        users_file,
        sep='::',
        header=None,
        engine='python',  # multi-character separators need the python parser
        names=column_names,
    )

In [None]:
# Load the three raw tables in one step.
ratings_df, movies_df, users_df = load_ratings(), load_movies(), load_users()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# ratings_df['timestamp_scaled'] = scaler.fit_transform(ratings_df[['timestamp']])
# The timestamp is not used as a feature, so discard the column.
ratings_df = ratings_df.drop(columns=["timestamp"])

In [None]:
# One-hot encode the pipe-separated genre list of every movie.
genres_df = movies_df.genres.str.get_dummies(sep="|")

# The release year is the first four-digit number in parentheses inside the title.
# A raw string keeps "\(" and "\d" from being read as (invalid) string escapes, and
# capturing only the digits removes the need to strip the parentheses afterwards.
release_year = movies_df.title.str.extract(r"\((\d{4})\)", expand=False).astype("int32")

# Bucket the years: everything up to 1980 in one bin, then 5-year bins.
bins = [0] + list(range(1980, release_year.max() + 1, 5))
if bins[-1] < release_year.max():
    # Without this edge, years past the last multiple-of-5 boundary would fall
    # outside every bin and become NaN (an all-zero one-hot row).
    bins.append(int(release_year.max()))
labels = ["year_" + str(i) for i in range(len(bins) - 1)]
year_df = pd.get_dummies(pd.cut(release_year, bins=bins, right=True, labels=labels))

# Final layout: movieId, genre indicators, year-bucket indicators.
movies_df = pd.concat(
    [movies_df.drop(["genres", "title"], axis=1), genres_df, year_df],
    axis=1,
)

In [None]:
# MovieLens-1M age codes and the bracket each one stands for. Renaming through an
# explicit mapping (instead of assigning labels by position) keeps every label
# attached to the right code even if a bracket is absent from the data.
AGE_LABELS = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

genders_df = pd.get_dummies(users_df.gender, prefix="gender")

ages_df = pd.get_dummies(users_df.age.astype(int))
unknown_age_codes = sorted(set(ages_df.columns) - set(AGE_LABELS))
if unknown_age_codes:
    raise ValueError(f"unexpected age codes in users_df: {unknown_age_codes}")
ages_df = ages_df.rename(columns=AGE_LABELS)

# Replace the raw gender/age columns with their one-hot encodings; Occupation is
# not used as a feature.
users_df = pd.concat(
    [users_df.drop(["gender", "age", "Occupation"], axis=1), genders_df, ages_df],
    axis=1,
)

In [None]:
pip install --upgrade uszipcode

Collecting uszipcode
  Using cached uszipcode-1.0.1-py2.py3-none-any.whl (35 kB)
Collecting attrs (from uszipcode)
  Downloading attrs-23.1.0-py3-none-any.whl (61 kB)
     ---------------------------------------- 0.0/61.2 kB ? eta -:--:--
     ------------------- ------------------ 30.7/61.2 kB 445.2 kB/s eta 0:00:01
     -------------------------------------- 61.2/61.2 kB 652.8 kB/s eta 0:00:00
Collecting pathlib-mate (from uszipcode)
  Using cached pathlib_mate-1.2.1-py2.py3-none-any.whl (121 kB)
Collecting atomicwrites (from uszipcode)
  Downloading atomicwrites-1.4.1.tar.gz (14 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting fuzzywuzzy (from uszipcode)
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting haversine>=2.5.0 (from uszipcode)
  Using cached haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Collecting SQLAlchemy>=1.4.0 (from uszipcode)
  Obtaining dependency information for SQLAlchemy>=1

In [None]:
from uszipcode import SearchEngine
# ZIP-code search engine; `search` is the global that get_median_household_income reads.
search = SearchEngine()
# Example lookup: list every field the engine returns for a single ZIP code.
search.by_zipcode(48067).items()



[('zipcode', '48067'),
 ('zipcode_type', 'STANDARD'),
 ('major_city', 'Royal Oak'),
 ('post_office_city', 'Royal Oak, MI'),
 ('common_city_list', ['Royal Oak']),
 ('county', 'Oakland County'),
 ('state', 'MI'),
 ('lat', 42.5),
 ('lng', -83.15),
 ('timezone', 'America/Detroit'),
 ('radius_in_miles', 2.0),
 ('area_code_list', '248,313'),
 ('population', 24458),
 ('population_density', 5328.0),
 ('land_area_in_sqmi', 4.59),
 ('water_area_in_sqmi', 0.0),
 ('housing_units', 13014),
 ('occupied_housing_units', 12146),
 ('median_home_value', 159500),
 ('median_household_income', 66659),
 ('bounds_west', -83.172176),
 ('bounds_east', -83.110736),
 ('bounds_north', 42.504651),
 ('bounds_south', 42.474169)]

In [None]:
def get_median_household_income(zip_code):
    """Look up the median household income recorded for a ZIP code.

    Args:
        zip_code: ZIP code passed unchanged to ``search.by_zipcode``.

    Returns:
        The ``median_household_income`` entry of the matched record, or ``None``
        when the lookup itself returns ``None``.
    """
    match = search.by_zipcode(zip_code)
    if match is None:
        return None
    return match.to_dict()["median_household_income"]

# Attach the income looked up from each user's ZIP code.
users_df["median_household_income"] = users_df["zip_code"].map(get_median_household_income)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fill missing incomes with the column mean, then standardize the column.
income_filled = users_df["median_household_income"].fillna(
    users_df["median_household_income"].mean()
)
scaler = StandardScaler()
users_df["median_household_income"] = scaler.fit_transform(income_filled.to_frame()).ravel()

# Drop the raw ZIP code: too sparse to be useful as a feature.
users_df = users_df.drop(columns=["zip_code"])

In [None]:
# Left-join user features and then movie features onto every rating row.
ratings_df = (
    ratings_df
    .merge(users_df, on="userId", how="left")
    .merge(movies_df, on="movieId", how="left")
)

In [None]:
# Convert boolean indicator columns to integers; other columns keep their dtype.
bool_columns = [col for col in ratings_df.columns if ratings_df[col].dtype == bool]
ratings_df = ratings_df.astype({col: int for col in bool_columns})

In [None]:
# Split the label off the feature matrix: pop() returns the column and removes it.
target = ratings_df.pop("rating")

In [None]:
# Display the feature matrix (userId, movieId and the encoded user/movie columns).
ratings_df

Unnamed: 0,userId,movieId,gender_F,gender_M,Under 18,18-24,25-34,35-44,45-49,50-55,...,Romance,Sci-Fi,Thriller,War,Western,year_0,year_1,year_2,year_3,year_4
0,1,1193,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,661,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,914,1,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
3,1,3408,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,2355,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000205,6040,1094,0,1,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1000206,6040,562,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1000207,6040,1096,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [None]:
# Print the dtype and non-null count of every feature column.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 35 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   userId                   1000209 non-null  int64  
 1   movieId                  1000209 non-null  int64  
 2   gender_F                 1000209 non-null  int32  
 3   gender_M                 1000209 non-null  int32  
 4   Under 18                 1000209 non-null  int32  
 5   18-24                    1000209 non-null  int32  
 6   25-34                    1000209 non-null  int32  
 7   35-44                    1000209 non-null  int32  
 8   45-49                    1000209 non-null  int32  
 9   50-55                    1000209 non-null  int32  
 10  56+                      1000209 non-null  int32  
 11  median_household_income  1000209 non-null  float64
 12  Action                   1000209 non-null  int64  
 13  Adventure                1000209 non-null 

In [None]:
# Boolean label: True when the rating is 4 or higher.
binary_target = target.ge(4.0)

In [None]:
# Each field maps to the feature-matrix columns that belong to it.
continuous_field_name =  {"median_household_income" : ["median_household_income"]}
categorical_field_name = {"userId": ["userId"],
                          "movieId": ["movieId"],
                          "gender": list(genders_df.columns),
                          "age" :list(ages_df.columns),
                          "genres":list(genres_df.columns),
                          "year" : list(year_df.columns)}

# make_embedding_lookup assigns field indices to input columns purely by position,
# so the field order must follow the column order of ratings_df. Listing the
# continuous field first (as before) shifted every column onto the wrong field's
# embedding, because the income column sits between the age and genre columns.
_field_columns = {**continuous_field_name, **categorical_field_name}
_column_position = {column: position for position, column in enumerate(ratings_df.columns)}
all_field_name = sorted(
    _field_columns,
    key=lambda field: _column_position[_field_columns[field][0]],
)

# Guard: expanding the fields in this order must reproduce the feature columns exactly.
_expected_columns = [column for field in all_field_name for column in _field_columns[field]]
assert _expected_columns == list(ratings_df.columns), (
    "field definitions do not line up with the columns of ratings_df"
)


In [None]:

from itertools import repeat
def make_embedding_lookup(all_field_name,continuous_field_name,categorical_field_name):
    """Build the per-column list of field indices.

    Fields are visited in the order given by ``all_field_name``. A continuous
    field contributes its index once; a categorical field contributes its index
    once for every column listed under it.

    Args:
        all_field_name: ordered field names.
        continuous_field_name: dict keyed by continuous field name.
        categorical_field_name: dict mapping categorical field name to its columns.

    Returns:
        List of field indices, one entry per feature column.
    """
    lookup = []
    for field_id, field in enumerate(all_field_name):
        if field in continuous_field_name:
            lookup.append(field_id)
        if field in categorical_field_name:
            lookup += [field_id] * len(categorical_field_name[field])
    return lookup

In [None]:
class wide_part(keras.layers.Layer):
    """Factorization-machine ("wide") component of DeepFM.

    Args:
        V: size of each field embedding vector.
        num_fields: number of fields, i.e. rows of the embedding matrix.
        embedding_lookup_index: for every input column, the index of the field
            whose embedding that column uses.

    Call returns a tuple ``(wide_output, embedded_fields)``:
        wide_output: (batch, 2) tensor holding the first-order term and the
            second-order interaction term.
        embedded_fields: (batch, n_columns, V) tensor of input-scaled embeddings,
            consumed by the deep component.
    """
    def __init__(self, V, num_fields, embedding_lookup_index, **kwargs):
        super(wide_part, self).__init__(**kwargs)
        # self.V holds the embedding size until build() replaces it with the
        # embedding matrix; the size is also kept separately so build() never
        # reads its shape from an attribute it is about to overwrite.
        self.V = V
        self.embedding_size = V
        self.num_fields = num_fields
        self.embedding_lookup_index = embedding_lookup_index

    def build(self,batch_input_size):
        # add_weight registers both tensors as trainable weights of the layer.
        # One linear weight per input column.
        self.W = self.add_weight(name="W",
                                 shape=(batch_input_size[-1],),
                                 initializer="random_normal",
                                 trainable=True)
        # One embedding vector per field.
        self.V = self.add_weight(name="V",
                                 shape=(self.num_fields, self.embedding_size),
                                 initializer="random_normal",
                                 trainable=True)

    def call(self,inputs):
        # (batch, n_columns) -> (batch, n_columns, 1)
        x_batch = tf.expand_dims(inputs, axis=-1)
        # (n_columns, V): the embedding row of the field each column belongs to
        embeddings_lookup_table = tf.nn.embedding_lookup(params = self.V, ids = self.embedding_lookup_index)
        # broadcast multiply -> (batch, n_columns, V)
        embedded_fields = tf.math.multiply(x_batch, embeddings_lookup_table)

        # First-order term: sum_i w_i * x_i -> (batch,)
        linear_terms = tf.reduce_sum(
            tf.math.multiply(self.W, inputs), axis=1, keepdims=False)

        # Second-order term: 0.5 * sum_k [ (sum_i v_ik x_i)^2 - sum_i (v_ik x_i)^2 ].
        # The sum over columns (axis 1) is taken separately for every embedding
        # dimension k; only the final result is summed over k. Collapsing both axes
        # before squaring would add cross terms between different embedding dimensions.
        square_of_sum = tf.square(tf.reduce_sum(embedded_fields, axis=1))   # (batch, V)
        sum_of_square = tf.reduce_sum(tf.square(embedded_fields), axis=1)   # (batch, V)
        interactions = 0.5 * tf.reduce_sum(
            tf.subtract(square_of_sum, sum_of_square), axis=1)              # (batch,)

        linear_terms = tf.reshape(linear_terms, [-1, 1])
        interactions = tf.reshape(interactions, [-1, 1])

        # (batch, 2)
        wide_output = tf.concat([linear_terms, interactions], 1)

        return wide_output,embedded_fields

In [None]:
class deep_part(keras.layers.Layer):
    """MLP ("deep") component of DeepFM.

    Flattens the embedded fields and passes them through a stack of
    Dropout -> Dense blocks followed by a single-unit ReLU output layer.

    Args:
        layer_list: number of units of each hidden Dense layer.
        dropout_rate: dropout rate applied before every hidden Dense layer.
        activation: activation identifier for the hidden layers.
    """
    def __init__(self,layer_list = [256,256,256],dropout_rate = 0.2,activation = "relu",**kwargs):
        super(deep_part, self).__init__(**kwargs)
        # Attribute name (including its spelling) is kept for backward compatibility.
        self.activaiton_fn = keras.activations.get(activation)
        self.dropout_rate = dropout_rate
        # All sub-layers are created once here rather than on every forward pass.
        self.flatten_layer = keras.layers.Flatten(name = 'flat_embed')
        self.dense_layer_list = [keras.layers.Dense(num_neuron, activation = self.activaiton_fn) for num_neuron in layer_list]
        self.dropout_layer_list = [keras.layers.Dropout(dropout_rate) for _ in layer_list]
        self.output_layer  = keras.layers.Dense(1, activation  = "relu")

    def call(self,inputs,training = None):
        """Run the MLP.

        Args:
            inputs: embedded fields, (batch, n_columns, V).
            training: forwarded to the Dropout layers; ``None`` lets Keras decide.

        Returns:
            (batch, 1) tensor.
        """
        # (batch, n_columns, V) -> (batch, n_columns * V)
        result = self.flatten_layer(inputs)
        for dropout, dense in zip(self.dropout_layer_list, self.dense_layer_list):
            result = dropout(result, training = training)
            result = dense(result)

        deep_result = self.output_layer(result)
        return deep_result

In [None]:
# class deep_FM(keras.Model):
#     def __init__(self,V,num_fields,embedding_lookup_index,layer_list =[256,256,256],dropout_rate = 0.2,activation = "relu",**kwargs):
#         super(deep_FM, self).__init__(**kwargs)
#         self.wide_part = wide_part(V,num_fields,embedding_lookup_index)
#         self.deep_part = deep_part(layer_list,dropout_rate,activation)
#         self.output_layer = keras.layers.Dense(1,activation = "sigmoid")


#     def call(self,inputs):
#         #inputs = (None,108)
#         wide_output , embeddings = self.wide_part(inputs)
#         deep_output  = self.deep_part(embeddings)

#         concat = keras.layers.Concatenate(axis = 1)([wide_output,deep_output])
#         wide_deep_output = self.output_layer(concat)
#         return wide_deep_output

In [None]:
class deep_FM(keras.Model):
    """DeepFM model: FM ("wide") and MLP ("deep") parts joined by a linear output unit.

    Args:
        V: embedding size handed to the wide part.
        num_fields: number of fields handed to the wide part.
        embedding_lookup_index: per-column field indices handed to the wide part.
        layer_list, dropout_rate, activation: configuration of the deep part.
    """
    def __init__(self,V,num_fields,embedding_lookup_index,layer_list =[256,256,256],dropout_rate = 0.2,activation = "relu",**kwargs):
        super(deep_FM, self).__init__(**kwargs)
        self.wide_part = wide_part(V,num_fields,embedding_lookup_index)
        self.deep_part = deep_part(layer_list,dropout_rate,activation)
        # Single unit without activation.
        self.output_layer = keras.layers.Dense(1)

    def call(self,inputs):
        """Return one prediction per input row."""
        # The wide part also produces the embeddings that feed the deep part.
        fm_terms, field_embeddings = self.wide_part(inputs)
        mlp_term = self.deep_part(field_embeddings)

        # Join the wide and deep outputs along the feature axis, then project to one value.
        joined = tf.concat([fm_terms, mlp_term], axis=1)
        return self.output_layer(joined)

In [None]:
# Per-column field indices consumed by the wide part.
embedding_lookup_index = make_embedding_lookup(
    all_field_name,
    continuous_field_name,
    categorical_field_name,
)
# Embedding size 16; 7 fields (one continuous + six categorical).
model = deep_FM(V=16, num_fields=7, embedding_lookup_index=embedding_lookup_index)

In [None]:
# model.compile(loss = keras.losses.binary_crossentropy,
#              optimizer =  keras.optimizers.Adam(0.001),
#              metrics = [keras.metrics.AUC(),keras.metrics.BinaryAccuracy()],
#              )
# binary_target = binary_target.astype("float32")

In [None]:
# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer=keras.optimizers.Adam(0.001),
#               metrics=['accuracy'])

In [None]:

# Float copy of the ratings for the regression set-up.
reg_target = target.astype("float32")

# Regression configuration: MSE loss, RMSE metric, Adam at 1e-3.
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss=keras.losses.mean_squared_error,
    metrics=[keras.metrics.RootMeanSquaredError()],
)

In [None]:
# The model is compiled for regression (MSE loss / RMSE metric), so train on the
# float ratings in reg_target. The boolean binary_target belonged to the earlier,
# now commented-out, binary cross-entropy configuration.
X_train, X_test, y_train, y_test = train_test_split(
    ratings_df, reg_target, test_size=0.1, random_state=42
)

In [None]:
from datetime import datetime
# One TensorBoard run directory per training run, named by its start time.
logdir = f"logs/fit/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Train for 30 epochs, validating on the held-out split after each epoch.
history2 = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks=[tensorboard_callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
def ndcg_at_k(rating_true, rating_pred):
    """Normalized DCG of a ranked recommendation list with binary relevance.

    Args:
        rating_true: DataFrame whose ``movieId`` column lists the relevant movies.
        rating_pred: DataFrame whose ``movieId`` column lists the recommended
            movies in rank order (best first). It is not modified.

    Returns:
        DCG of the list divided by the DCG of a list of the same length in which
        every position is relevant; 0.0 when the list is empty or contains no
        relevant movie.
    """
    # 1.0 where the recommended movie is relevant, else 0.0. Computed on a
    # separate array so the caller's DataFrame does not gain a 'rel' column.
    relevance = rating_pred['movieId'].isin(rating_true['movieId']).to_numpy(dtype=float)
    if relevance.size == 0 or not relevance.any():
        return 0.0

    # Position i (0-based) is discounted by log2(i + 2).
    discounts = 1.0 / np.log2(np.arange(relevance.size) + 2)
    dcg = float(np.sum(relevance * discounts))
    # Ideal case used for normalization: every position holds a relevant movie.
    idcg = float(np.sum(discounts))
    return dcg / idcg

In [None]:
# Fresh copy of the raw ratings (without the timestamp) for looking up what each user rated.
ratings_df2 = load_ratings().drop(columns=["timestamp"])

In [None]:
def process_user(user_id, k=20):
    """Score every movie for one user and evaluate the top-k recommendations.

    Args:
        user_id: value of ``userId`` to evaluate.
        k: number of recommendations to keep (default 20).

    Returns:
        Tuple ``(recall, ndcg)``. ``recall`` is the share of the user's relevant
        movies (rated 4 or 5) that appear in the top-k list; ``ndcg`` is the value
        returned by ``ndcg_at_k`` for that list.

    Raises:
        ValueError: if ``user_id`` is not present in ``users_df``.

    NOTE(review): relevance is read from the full ratings file (``ratings_df2``),
    which includes the rows the model was trained on.
    """
    user_row = users_df[users_df['userId'] == user_id]
    if user_row.empty:
        raise ValueError(f"unknown userId: {user_id!r}")

    # Pair the user's feature row with every movie. Repeating the row by position
    # avoids concatenating one single-row frame per movie.
    candidates = movies_df.reset_index(drop=True)
    user_features = user_row.iloc[[0] * len(candidates)].reset_index(drop=True)
    merged_df = pd.concat([candidates, user_features], axis=1)[ratings_df.columns]

    # Same bool -> int conversion that was applied to the training features.
    bool_columns = [col for col in merged_df.columns if merged_df[col].dtype == bool]
    merged_df = merged_df.astype({col: int for col in bool_columns})

    # verbose=0: this function is called once per user, so progress bars would flood the output.
    y_pred = model.predict(merged_df, verbose=0)

    user_rec = pd.DataFrame({
        "userId": merged_df["userId"],
        "movieId": merged_df["movieId"],
        "pred": np.ravel(y_pred),
    }).nlargest(k, "pred")

    # Relevant items: movies this user rated 4 or 5.
    user_ratings = ratings_df2[ratings_df2['userId'] == user_id]
    user_ratings = user_ratings.query('rating == 5 or rating == 4')

    # Recall = hits / number of relevant items. Dividing by k instead would give precision@k.
    num_hits = int(user_ratings["movieId"].isin(user_rec["movieId"]).sum())
    recall = num_hits / len(user_ratings) if len(user_ratings) else 0.0

    # Calculate nDCG
    ndcg = ndcg_at_k(user_ratings, user_rec)

    return recall, ndcg


In [None]:
# Spot-check the evaluation on a single user (userId 5); displays the (recall, ndcg) tuple.
process_user(5)



(0.05, 0.036356287988538835)

In [None]:
# def process_user(user_id):
#     user_id_data = users_df[users_df['userId'] == user_id]
#     num_movies = len(movies_df)
#     user_id_extended = pd.concat([user_id_data] * num_movies, ignore_index=True)
#     merged_df = pd.concat([movies_df, user_id_extended], axis=1)
#     merged_df = merged_df[ratings_df.columns]
#     for col in merged_df.columns:
#         if merged_df[col].dtype == bool:
#             merged_df[col] = merged_df[col].astype(int)
#     y_pred = model.predict(merged_df)

#     user_rec = pd.DataFrame()
#     user_rec["userId"] = merged_df["userId"]
#     user_rec["movieId"] = merged_df["movieId"]
#     user_rec["pred"] = y_pred

#     user_rec = user_rec.nlargest(20, "pred")

#     user_ratings = ratings_df2[ratings_df2['userId'] == user_id]
#     user_ratings = user_ratings.nlargest(20, "rating")

#     matching_indices = user_ratings["movieId"].isin(user_rec["movieId"])
#     filtered_ratings = user_ratings[matching_indices]

#     return filtered_ratings


In [None]:
# Evaluate every user once; process_user returns a (recall, ndcg) pair.
per_user_metrics = {uid: process_user(uid) for uid in users_df['userId']}

# Split the pairs into one dictionary per metric, keyed by userId.
recall_dict = {uid: metrics[0] for uid, metrics in per_user_metrics.items()}
ndcg_dict = {uid: metrics[1] for uid, metrics in per_user_metrics.items()}




In [None]:

# Calculate the average recall and ndcg values across all users
# Average over the users that were actually evaluated instead of a hard-coded
# user count, so the result stays correct if the user set changes.
avg_recall = sum(recall_dict.values()) / len(recall_dict) if recall_dict else 0.0
avg_ndcg = sum(ndcg_dict.values()) / len(ndcg_dict) if ndcg_dict else 0.0

print("Average Recall@20:", avg_recall)
print("Average nDCG@20:", avg_ndcg)

Average Recall@20: 0.047723509933775964
Average nDCG@20: 0.046413731494313076


In [None]:
# NOTE(review): hard-coded figures. Presumably 3612 is a total hit count over
# 6040 users with 20 recommendations each - confirm where 3612 comes from.
# As written this is hits per recommended slot, i.e. a precision-style ratio.
recall20 = 3612/(6040*20)
print(recall20)

0.02990066225165563
