In [1]:
# Decode a few strings of binary digits into their integer values.
c = '00000001'
print(int(c, base=2))

c = '11111111'
print(int(c, base=2))


# Two 12-digit patterns (minimo / maximo), decoded separately.
# 1 0 0 0 1 1 0 0 0 1 1 1
minimo = '000100010001'
maximo = '100011111111'
minimo_value = int(minimo, base=2)
maximo_value = int(maximo, base=2)
print(minimo_value)
print(maximo_value)

c = '001000000010'
print(int(c, base=2))

1
255
8
273
2303
514


In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MaxNLocator
from learntools.time_series.utils import plot_periodogram, seasonal_plot
from learntools.time_series.style import *
import seaborn as sns
from IPython.display import Markdown, display
from pathlib import Path
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Helper: show a string as rendered Markdown in the notebook output
def printmd(string):
    """Wrap *string* in an IPython ``Markdown`` object and display it."""
    rendered = Markdown(string)
    display(rendered)

# Define directory and read data
import numpy as np  # np.array2string is used below, but this cell never imported numpy

comp_dir = Path('../input/amazon-product-reviews')
electronics_data = pd.read_csv(
    comp_dir / "ratings_Electronics (1).csv",
    dtype={'rating': 'int8'},
    names=['userId', 'productId', 'rating', 'timestamp'],
    index_col=None,
    header=0
)

# Display some basic information
printmd(f"Number of Rating: {electronics_data.shape[0]:,}")
printmd(f"Columns: {np.array2string(electronics_data.columns.values)}")
printmd(f"Number of Users: {len(electronics_data.userId.unique()):,}")
printmd(f"Number of Products: {len(electronics_data.productId.unique()):,}")
# A notebook only auto-renders the last expression of a cell, so tables
# produced mid-cell have to be displayed explicitly or they are lost.
display(electronics_data.describe()['rating'].reset_index())

# Check for missing values (one row per column with its null count)
printmd('**Number of missing values**:')
display(
    pd.DataFrame(
        electronics_data.isnull().sum().reset_index()
    ).rename(columns={0: "Total missing", "index": "Columns"})
)

# Process data by date: epoch seconds -> datetimes, newest rating first
data_by_date = electronics_data.copy()
data_by_date["timestamp"] = pd.to_datetime(electronics_data.timestamp, unit="s")
data_by_date = data_by_date.sort_values(by="timestamp", ascending=False).reset_index(drop=True)

printmd("Number of Ratings each day:")
# The grouping key is the exact timestamp value, so these are per-day counts
# only if the timestamps have day resolution -- TODO confirm.
# Displayed explicitly: a mid-cell expression is otherwise computed and dropped.
display(data_by_date.groupby("timestamp")["rating"].count().tail(10).reset_index())

# Add year and month columns
data_by_date["year"] = data_by_date.timestamp.dt.year
data_by_date["month"] = data_by_date.timestamp.dt.month
rating_by_year = data_by_date.groupby(["year", "month"])["rating"].count().reset_index()

# Create date column (first day of each year/month pair) and plot data
rating_by_year["date"] = pd.to_datetime(
    rating_by_year["year"].astype(str) + "-" + rating_by_year["month"].astype(str) + "-1"
)
rating_by_year.plot(x="date", y="rating")
plt.title("Number of Ratings over Years")
plt.show()

# Per-product statistics: number of ratings received and their mean
rating_by_product = (
    electronics_data.groupby("productId")
    .agg({"userId": "count", "rating": "mean"})
    .rename(columns={"userId": "Number of Ratings", "rating": "Average Rating"})
    .reset_index()
)

# Keep products with strictly more than `cutoff` ratings, best average first
cutoff = 50
has_enough_ratings = rating_by_product["Number of Ratings"] > cutoff
top_rated = (
    rating_by_product.loc[has_enough_ratings]
    .sort_values(by="Average Rating", ascending=False)
    .reset_index(drop=True)
)

# Define TensorFlow model
class RankingModel(tf.keras.Model):
    """Predict a rating for a (user id, product id) pair.

    Each id goes through a string lookup and an embedding; the two
    embeddings are concatenated and fed to a small dense network that
    outputs one value per pair.

    The lookup vocabularies are read from the module-level names
    ``unique_userIds`` and ``unique_productIds`` when the model is
    instantiated, so both must be defined by then.

    Args:
        embedding_dimension: Size of each embedding vector. Defaults to 32,
            the value that was previously hard-coded.
    """

    def __init__(self, embedding_dimension=32):
        super().__init__()

        # Embedding layers for users and products.
        # The embedding tables get one row more than the vocabulary size.
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=unique_userIds, mask_token=None),
            tf.keras.layers.Embedding(len(unique_userIds) + 1, embedding_dimension)
        ])

        self.product_embeddings = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=unique_productIds, mask_token=None),
            tf.keras.layers.Embedding(len(unique_productIds) + 1, embedding_dimension)
        ])

        # Rating head: two ReLU layers followed by a single linear output unit
        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1)
        ])

    def call(self, userId, productId):
        """Return the network output for the given user and product ids.

        Args:
            userId: Tensor of user id strings (presumably 1-D, one per
                example -- the concat on axis 1 expects 2-D embeddings).
            productId: Tensor of product id strings, aligned with ``userId``.
        """
        user_embeddings = self.user_embeddings(userId)
        product_embeddings = self.product_embeddings(productId)
        return self.ratings(tf.concat([user_embeddings, product_embeddings], axis=1))

class amazonModel(tfrs.models.Model):
    """TFRS model pairing a ``RankingModel`` with an MSE ranking task (RMSE metric)."""

    def __init__(self):
        super().__init__()
        self.ranking_model = RankingModel()
        mse_loss = tf.keras.losses.MeanSquaredError()
        rmse_metric = tf.keras.metrics.RootMeanSquaredError()
        self.task = tfrs.tasks.Ranking(loss=mse_loss, metrics=[rmse_metric])

    def compute_loss(self, features, training=False):
        """Score the batch and hand labels and predictions to the ranking task.

        ``features`` is indexed with the keys "userId", "productId" and
        "rating"; ``training`` is accepted but not used here.
        """
        user_ids = features["userId"]
        product_ids = features["productId"]
        rating_predictions = self.ranking_model(user_ids, product_ids)
        return self.task(labels=features["rating"], predictions=rating_predictions)

# Filter recent data
cutoff_no_rat = 50   # keep products with at least this many ratings
cutoff_year = 2011   # keep ratings made strictly after this year
recent_data = data_by_date.loc[data_by_date["year"] > cutoff_year]
print(f"Number of Rating: {recent_data.shape[0]:,}")
print(f"Number of Users: {len(recent_data.userId.unique()):,}")
print(f"Number of Products: {len(recent_data.productId.unique()):,}")
del data_by_date  # Free up memory

# Keep only rows whose product has >= cutoff_no_rat ratings, then drop the
# date helper columns that the model does not use.
recent_prod = recent_data.loc[
    recent_data.groupby("productId")["rating"].transform('count').ge(cutoff_no_rat)
].reset_index(drop=True).drop(["timestamp", "year", "month"], axis=1)
del recent_data  # Free up memory

# Prepare data for training
userIds = recent_prod.userId.unique()
productIds = recent_prod.productId.unique()
total_ratings = len(recent_prod.index)

ratings = tf.data.Dataset.from_tensor_slices({
    "userId": tf.cast(recent_prod.userId.values, tf.string),
    "productId": tf.cast(recent_prod.productId.values, tf.string),
    "rating": tf.cast(recent_prod.rating.values, tf.int8)
})

# Shuffle and split data
tf.random.set_seed(42)
# recent_prod keeps the newest-first order of data_by_date. A shuffle buffer
# smaller than the dataset only mixes neighbouring rows, which would leave the
# 80/20 split below largely chronological. Sizing the buffer to the whole
# dataset gives a uniform shuffle (max() guards against an empty dataset).
shuffled = ratings.shuffle(max(total_ratings, 1), seed=42, reshuffle_each_iteration=False)
train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))

# Vocabularies read by RankingModel when it is instantiated
unique_productIds = productIds
unique_userIds = userIds

# Compile and train model
model = amazonModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
model.fit(cached_train, epochs=10)

# Evaluate model
model.evaluate(cached_test, return_dict=True)

# Recommend products for one user (the user at position 123 of userIds)
user_rand = userIds[123]
test_rating = {}
for m in test.take(5):
    # Score with the *trained* ranking model. Instantiating RankingModel()
    # here would build a new network with freshly initialised weights, so the
    # resulting ranking would be unrelated to the training above.
    predicted = model.ranking_model(
        tf.convert_to_tensor([user_rand]), tf.convert_to_tensor([m["productId"]])
    )
    # Store a plain float so the sort below compares numbers, not tensors.
    test_rating[m["productId"].numpy()] = float(predicted.numpy().squeeze())

print(f"Top 5 recommended products for User {user_rand}:")
for m in sorted(test_rating, key=test_rating.get, reverse=True):
    print(m.decode())





Los sistemas de calificación ponderada se utilizan para puntuar cada artículo teniendo en cuenta tanto su valoración media como su número de votos. Esta es la fórmula de la puntuación ponderada:
$$WR = \frac{v}{v+m}\,R + \frac{m}{v+m}\,C$$
R es la puntuación media del artículo.
v es el número de votos del artículo.
m es el mínimo de votos necesarios para figurar en los artículos populares (definido por > percentil 80 del total de votos).
C es la valoración media de todo el conjunto de datos.



Vemos que cada método tiene sus puntos fuertes. Lo ideal sería combinarlos para ofrecer una recomendación mejor. Esta idea nos lleva a otra mejora de la recomendación: el método híbrido. Por ejemplo, podemos combinar las recomendaciones basadas en el contenido con las del filtrado colaborativo basado en elementos para aprovechar las características de ambos dominios (géneros e interacción usuario-elemento).

Traducción realizada con la versión gratuita del traductor DeepL.com

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

# Print the path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Any results you write to the current directory are saved as output.


# Load the ratings file, naming its four columns explicitly
ratings_csv = "/kaggle/input/amazon-product-reviews/ratings_Electronics (1).csv"
electronics_data = pd.read_csv(
    ratings_csv,
    names=['userId', 'productId', 'Rating', 'timestamp'],
)

# Preview the first rows
electronics_data.head()

# (rows, columns)
electronics_data.shape

# Work on a subset: the first 1,048,576 rows, all columns
electronics_data = electronics_data.iloc[:1_048_576, :]

# Column dtypes
electronics_data.dtypes

electronics_data.info()

# Summary statistics of the Rating column
rating_summary = electronics_data.describe()['Rating']
rating_summary.T

# Smallest and largest rating, printed as integers
lowest_rating = electronics_data.Rating.min()
highest_rating = electronics_data.Rating.max()
print('Minimum rating is: %d' % lowest_rating)
print('Maximum rating is: %d' % highest_rating)

# Null count per column
missing_per_column = electronics_data.isnull().sum()
print('Number of missing values across columns: \n', missing_per_column)


# Check the distribution of the rating
with sns.axes_style('white'):
    # seaborn renamed factorplot to catplot in 0.9 and later removed the old
    # name; prefer catplot and fall back to factorplot on older installs.
    # x is passed by keyword because newer releases reject it positionally.
    draw_count_plot = getattr(sns, "catplot", None) or sns.factorplot
    g = draw_count_plot(x="Rating", data=electronics_data, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")


# Headline counts: rows, distinct users, distinct products
print("Total data ")
print("-"*50)
print("\nTotal no of ratings :",electronics_data.shape[0])
print("Total No of Users   :", len(np.unique(electronics_data.userId)))
print("Total No of products  :", len(np.unique(electronics_data.productId)))


# Remove the timestamp column in place; it is not used below
electronics_data.drop(columns=['timestamp'], inplace=True)

# Number of ratings given by each user, most active users first
no_of_rated_products_per_user = (
    electronics_data.groupby(by='userId')['Rating']
    .count()
    .sort_values(ascending=False)
)

no_of_rated_products_per_user.head()

no_of_rated_products_per_user.describe()

# Quantiles at 0.00, 0.01, ..., 1.00 of the per-user rating counts
quantile_levels = np.arange(0, 1.01, 0.01)
quantiles = no_of_rated_products_per_user.quantile(quantile_levels, interpolation='higher')

plt.figure(figsize=(10, 10))
plt.title("Quantiles and their Values")
quantiles.plot()
# mark every 5th quantile
every_5th = slice(None, None, 5)
plt.scatter(x=quantiles.index[every_5th], y=quantiles.values[every_5th], c='orange', label="quantiles with 0.05 intervals")
# mark every 25th quantile
every_25th = slice(None, None, 25)
plt.scatter(x=quantiles.index[every_25th], y=quantiles.values[every_25th], c='m', label = "quantiles with 0.25 intervals")
plt.ylabel('No of ratings by user')
plt.xlabel('Value at the quantile')
plt.legend(loc='best')
plt.show()

# Count of users with 50 or more ratings (the comparison is >=)
heavy_raters = sum(no_of_rated_products_per_user >= 50)
print('\n No of rated product more than 50 per user : {}\n'.format(heavy_raters))

# Keep only the ratings of products that received 50 or more ratings.
# (The grouping key is productId, so this filters products, not users.)
# A transform-based mask selects the same rows as
# groupby(...).filter(lambda ...) without calling Python once per product.
ratings_per_product = electronics_data.groupby("productId")['Rating'].transform('count')
new_df = electronics_data[ratings_per_product >= 50]

# Rating count of each remaining product, largest first
no_of_ratings_per_product = new_df.groupby(by='productId')['Rating'].count().sort_values(ascending=False)

fig = plt.figure(figsize=plt.figaspect(.5))
ax = plt.gca()
plt.plot(no_of_ratings_per_product.values)
plt.title('# RATINGS per Product')
plt.xlabel('Product')
plt.ylabel('No of ratings per product')
# Product ids would be unreadable as tick labels, so hide them
ax.set_xticklabels([])

plt.show()


# Mean rating per product
rating_groups = new_df.groupby('productId')['Rating']
mean_rating = rating_groups.mean()
mean_rating.head()


# Highest mean ratings first
mean_rating.sort_values(ascending=False).head()

# Number of ratings per product, largest first
rating_count = rating_groups.count()
rating_count.sort_values(ascending=False).head()

# One table holding both the mean ('Rating') and the count ('rating_counts')
ratings_mean_count = rating_groups.mean().to_frame()

ratings_mean_count['rating_counts'] = rating_groups.count()

ratings_mean_count.head()
ratings_mean_count['rating_counts'].max()

# Histogram of rating counts
plt.figure(figsize=(8, 6))
plt.rcParams['patch.force_edgecolor'] = True
ratings_mean_count['rating_counts'].hist(bins=50)

# Histogram of mean ratings
plt.figure(figsize=(8, 6))
plt.rcParams['patch.force_edgecolor'] = True
ratings_mean_count['Rating'].hist(bins=50)


# Mean rating against rating count
plt.figure(figsize=(8, 6))
plt.rcParams['patch.force_edgecolor'] = True
sns.jointplot(x='Rating', y='rating_counts', data=ratings_mean_count, alpha=0.4)

# Bar chart of the 30 most-rated products
popular_products = rating_groups.count().to_frame()
most_popular = popular_products.sort_values('Rating', ascending=False)
most_popular.head(30).plot(kind="bar")



from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split


# Wrap the ratings frame in a Surprise dataset (ratings on a 1-5 scale)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(new_df, reader)


# Hold out 30% of the ratings for testing
trainset, testset = train_test_split(data, test_size=0.3, random_state=10)


# Item-based collaborative filtering: 'user_based': False compares items,
# True would compare users instead
item_similarity = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNWithMeans(k=5, sim_options=item_similarity)
algo.fit(trainset)


# Predict every rating in the test set with the fitted model
test_pred = algo.test(testset)


# Report RMSE on the test set
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)






# Work on the first 10,000 rows only
new_df1 = new_df.head(10000)
# user x product matrix of ratings; unrated cells are filled with 0
ratings_matrix = new_df1.pivot_table(values='Rating', index='userId', columns='productId', fill_value=0)
ratings_matrix.head()
ratings_matrix.shape


# Transpose so that each row describes one product
X = ratings_matrix.T
X.head()

X.shape
X1 = X

#Decomposing the Matrix
# NOTE: the name SVD is bound to a TruncatedSVD *instance* from here on.
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape


#Correlation Matrix
# Row-wise correlation between the 10-component product vectors
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape


X.index[75]

# Product to find neighbours for (must be a row label of X)
i = "B00000K135"

product_names = list(X.index)
product_ID = product_names.index(i)
product_ID


correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

# Products whose correlation with `i` exceeds 0.65, excluding `i` itself.
# Filtering instead of Recommend.remove(i) avoids a ValueError when `i` is not
# among its own matches (its self-correlation is NaN for a constant row).
Recommend = [
    product for product in X.index[correlation_product_ID > 0.65]
    if product != i
]

Recommend[0:24]









In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

# Split
from sklearn.model_selection import train_test_split

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import os
# Print the path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df = pd.read_csv("/kaggle/input/amazon-product-reviews/ratings_Electronics (1).csv",
                             names=['userId', 'productId','rating','timestamp'])

df.head()

print("Total Reviews:",df.shape[0])
print("Total Columns:",df.shape[1])

# Taking subset of the dataset: first 5000 rows, all columns
df = df.iloc[:5000,0:]

print("Total Reviews:",df.shape[0])
print("Total Columns:",df.shape[1])

# nunique() counts distinct values, so the first figure is the number of
# different rating values, not the number of ratings (that is df.shape[0],
# printed above). The label now says so.
print("Total number of distinct rating values :",df.rating.nunique())
print("Total number of users   :", df.userId.nunique())
print("Total number of products  :", df.productId.nunique())

df.info()

# Check missing value
df.isnull().sum()

# Check Duplicate data
df[df.duplicated()].any()

# rating describe summary
df.describe()['rating']

print("Unique value of Rating:",df.rating.unique())

# Find the minimum and maximum ratings
print('Minimum rating is: %d' %(df.rating.min()))
print('Maximum rating is: %d' %(df.rating.max()))

# Per-product aggregates: 'rating' and 'ratings_average' both hold the mean,
# 'ratings_count' holds the number of ratings
per_product = df.groupby('productId')['rating']
ratings = per_product.mean().to_frame()
ratings['ratings_count'] = per_product.count()
ratings['ratings_average'] = per_product.mean()
ratings.head(10)

# Histogram of mean ratings
plt.figure(figsize=(10, 4))
ratings['rating'].hist(bins=70)

# Mean rating against rating count
sns.jointplot(x='rating', y='ratings_count', data=ratings, alpha=0.5)

# Bar chart of the 30 products with the most ratings
popular_products = per_product.count().to_frame()
most_popular = popular_products.sort_values('rating', ascending=False)
most_popular.head(30).plot(kind="bar", figsize=(12, 4))

vote_counts = ratings[ratings['ratings_count'].notnull()]['ratings_count'].astype('int')
# Keep the averages as floats: casting them to int truncated e.g. 4.8 to 4,
# which biased C downwards and flattened the averages used further below.
vote_averages = ratings[ratings['ratings_average'].notnull()]['ratings_average']
# C: mean of the per-product average ratings
C = vote_averages.mean()
print("Average rating of product across the whole dataset is",C)

# m: the 95th percentile of the per-product rating counts
m = vote_counts.quantile(0.95)
print("Minimum votes required to be listed in the chart is",m)

ratings.head()

# Products with at least m ratings and no missing aggregates.
# .copy() makes the column assignment below act on an independent frame
# rather than on a slice of `ratings`.
qualified = ratings[
    (ratings['ratings_count'] >= m)
    & (ratings['ratings_count'].notnull())
    & (ratings['ratings_average'].notnull())
][['ratings_count', 'ratings_average']].copy()

qualified['ratings_count'] = qualified['ratings_count'].astype('int')
qualified.head().sort_values(by='ratings_count', ascending=False)

qualified.shape

def weighted_rating(x):
    """Return the weighted rating ``v/(v+m) * R + m/(v+m) * C`` for one row.

    ``x`` is indexed with 'ratings_count' (v) and 'ratings_average' (R);
    ``m`` and ``C`` are read from module-level names.
    """
    votes = x['ratings_count']
    average = x['ratings_average']
    denominator = votes + m
    own_part = votes / denominator * average
    prior_part = m / denominator * C
    return own_part + prior_part

# Weighted rating per qualified product; keep the 20 best
qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr', ascending=False).head(20)
qualified.head(10)

# Bar colours: 30 samples from the reversed inferno colormap
from matplotlib import cm
colormap_positions = np.linspace(.4, .8, 30)
color = cm.inferno_r(colormap_positions)

# Rating count per product
rating_plot_count = qualified['ratings_count'].plot(kind="bar", figsize=(12, 4), color=color)
rating_plot_count.set_title("Rating Count Bar-Plot")
rating_plot_count.set_xlabel("productId")
rating_plot_count.set_ylabel("Count")


# Average rating per product
rating_plot_avg = qualified['ratings_average'].plot(kind="bar", figsize=(12, 4), color=color)
rating_plot_avg.set_title("Rating Average Bar-Plot")
rating_plot_avg.set_xlabel("productId")
rating_plot_avg.set_ylabel("rating")

# Weighted rating per product
wr_plot = qualified['wr'].plot(kind="bar", figsize=(12, 4), color=color)
wr_plot.set_title("Weight Rating Bar-Plot")
wr_plot.set_xlabel("productId")
wr_plot.set_ylabel("rating")





# Surprise reader with its default settings
reader = Reader()
df.head()

# Surprise expects the columns in the order user, item, rating
rating_columns = ['userId', 'productId', 'rating']
data = Dataset.load_from_df(df[rating_columns], reader)

# Matrix-factorisation model
svd = SVD()

# 5-fold cross-validation, reporting RMSE and MAE
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on every available rating
trainset = data.build_full_trainset()
svd.fit(trainset)

# Ratings given by one particular user
is_selected_user = df['userId'] == 'AKM1MP6P0OYPR'
df[is_selected_user]

# Full prediction object for one user/product pair, then its estimate alone
svd.predict(uid='A17HMM1M7T9PJ1', iid='0970407998', r_ui=None)
svd.predict(uid='A17HMM1M7T9PJ1', iid='0970407998', r_ui=None).est


# Rows belonging to users who gave 50 or more ratings
ratings_per_user = df.groupby('userId')['rating'].transform('count')
df_users = df[ratings_per_user >= 50]
df_users.head()
df_users.shape


# user x product table of ratings (NaN where a user did not rate a product)
matrix = df_users.pivot_table(values='rating', index='userId', columns='productId')
matrix.head()

# Function that takes in productId and userId as input and outputs up to top_n most similar products.
def hybrid_recommendations(userId, productId, top_n=5):
    """Rank the products most correlated with ``productId`` by the user's predicted rating.

    Reads the module-level ``matrix`` (user x product ratings table) and the
    fitted ``svd`` model.

    Args:
        userId: User to predict ratings for.
        productId: Column of ``matrix`` used as the seed product.
        top_n: Number of most-correlated products to consider. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        DataFrame with columns 'productId' and 'estimated_rating', sorted by
        estimated rating (highest first). It has fewer than ``top_n`` rows
        when some of the selected correlations are undefined.

    Raises:
        KeyError: If ``productId`` is not a column of ``matrix``.
    """
    # Correlate every product column with the seed column and keep the top_n
    # highest. sort_values puts NaN last, so dropping NaN after head() removes
    # exactly the entries the original string comparison against 'nan' skipped.
    # NOTE(review): the seed column is correlated with itself too, so it can
    # appear in the result -- confirm whether it should be excluded.
    correlations = matrix.corrwith(matrix[productId])
    top_correlated = correlations.sort_values(ascending=False).head(top_n).dropna()
    candidate_ids = list(top_correlated.index)

    # Predict the rating the user might give to each candidate.
    est_rating = [svd.predict(userId, iid=candidate, r_ui=None).est for candidate in candidate_ids]

    return pd.DataFrame({'productId': candidate_ids, 'estimated_rating': est_rating}).sort_values(by='estimated_rating', ascending=False).reset_index(drop=True)


# Example call for one user/product pair.
# NOTE(review): matrix[productId] raises KeyError unless this product is a
# column of `matrix`, which is built from the first 5000 rows of the file
# only -- confirm both ids are present there.
hybrid_recommendations('A2NYK9KWFMJV4Y', 'B00LI4ZZO8')

# df.head()

# df['userId'].value_counts()

# # Check specific userId review
# df[df['userId'] == 'A3LDPF5FMB782Z']

# # predict based on this data
# svd.predict('A3LDPF5FMB782Z', '140053271X', 5.0)





In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
# Build a model.
class RankingModel(tf.keras.Model):
    """Predict a rating for a (user id, product id) pair.

    Each id goes through a string lookup and an embedding; the two
    embeddings are concatenated and fed to a small dense network that
    outputs one value per pair.

    The lookup vocabularies are read from the module-level names
    ``unique_userIds`` and ``unique_productIds`` when the model is
    instantiated, so both must be defined by then.

    Args:
        embedding_dimension: Size of each embedding vector. Defaults to 32,
            the value that was previously hard-coded.
    """

    def __init__(self, embedding_dimension=32):
        super().__init__()

        self.user_embeddings = tf.keras.Sequential([
                                    tf.keras.layers.experimental.preprocessing.StringLookup(
                                        vocabulary=unique_userIds, mask_token=None),
                                    # add an additional embedding to account for unknown tokens
                                    tf.keras.layers.Embedding(len(unique_userIds)+1, embedding_dimension)
                                    ])

        self.product_embeddings = tf.keras.Sequential([
                                    tf.keras.layers.experimental.preprocessing.StringLookup(
                                        vocabulary=unique_productIds, mask_token=None),
                                    # add an additional embedding to account for unknown tokens
                                    tf.keras.layers.Embedding(len(unique_productIds)+1, embedding_dimension)
                                    ])
        # Rating head: two ReLU layers followed by a single linear output unit.
        # (This is the prediction network, not a retrieval task.)
        self.ratings = tf.keras.Sequential([
                            tf.keras.layers.Dense(256, activation="relu"),
                            tf.keras.layers.Dense(64,  activation="relu"),
                            tf.keras.layers.Dense(1)
                              ])

    def call(self, userId, productId):
        """Return the network output for the given user and product ids.

        Args:
            userId: Tensor of user id strings (presumably 1-D, one per
                example -- the concat on axis 1 expects 2-D embeddings).
            productId: Tensor of product id strings, aligned with ``userId``.
        """
        user_embeddings  = self.user_embeddings (userId)
        product_embeddings = self.product_embeddings(productId)
        return self.ratings(tf.concat([user_embeddings,product_embeddings], axis=1))

# Wrap the ranking network in a TFRS model that supplies the loss.
class amazonModel(tfrs.models.Model):
    """TFRS model pairing a ``RankingModel`` with an MSE ranking task (RMSE metric)."""

    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        mse_loss = tf.keras.losses.MeanSquaredError()
        rmse_metric = tf.keras.metrics.RootMeanSquaredError()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=mse_loss,
            metrics=[rmse_metric],
        )

    def compute_loss(self, features, training=False):
        """Score the batch and hand labels and predictions to the ranking task.

        ``features`` is indexed with the keys "userId", "productId" and
        "rating"; ``training`` is accepted but not used here.
        """
        user_ids = features["userId"]
        product_ids = features["productId"]
        rating_predictions = self.ranking_model(user_ids, product_ids)
        return self.task(labels=features["rating"], predictions=rating_predictions)
    





cutoff_no_rat = 50    ## Only count products which received more than or equal 50
cutoff_year   = 2011  ## Only count Rating after 2011
recent_data   = data_by_date.loc[data_by_date["year"] > cutoff_year]
print("Number of Rating: {:,}".format(recent_data.shape[0]) )
print("Number of Users: {:,}".format(len(recent_data.userId.unique()) ) )
print("Number of Products: {:,}".format(len(recent_data.productId.unique())  ) )
del data_by_date  ### Free up memory ###
# Keep only rows whose product has >= cutoff_no_rat ratings, then drop the
# date helper columns that the model does not use.
recent_prod   = recent_data.loc[recent_data.groupby("productId")["rating"].transform('count').ge(cutoff_no_rat)].reset_index(
                    drop=True).drop(["timestamp","year","month"],axis=1)
del recent_data  ### Free up memory ###


userIds    = recent_prod.userId.unique()
productIds = recent_prod.productId.unique()
total_ratings= len(recent_prod.index)


ratings = tf.data.Dataset.from_tensor_slices( {"userId":tf.cast( recent_prod.userId.values  ,tf.string),
                                "productId":tf.cast( recent_prod.productId.values,tf.string),
                                "rating":tf.cast( recent_prod.rating.values  ,tf.int8,) } )


tf.random.set_seed(42)
# A shuffle buffer smaller than the dataset only mixes neighbouring rows. If
# recent_prod is ordered by time (data_by_date is sorted by timestamp where it
# is built elsewhere in this file -- confirm), a 100k buffer would leave the
# 80/20 split below largely chronological. Sizing the buffer to the whole
# dataset gives a uniform shuffle (max() guards against an empty dataset).
shuffled = ratings.shuffle(max(total_ratings, 1), seed=42, reshuffle_each_iteration=False)

train = shuffled.take( int(total_ratings*0.8) )
test = shuffled.skip(int(total_ratings*0.8)).take(int(total_ratings*0.2))

# Vocabularies read by RankingModel when it is instantiated
unique_productIds = productIds
unique_userIds    = userIds


model = amazonModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad( learning_rate=0.1 ))
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
model.fit(cached_train, epochs=10)


# Evaluate.
model.evaluate(cached_test, return_dict=True)


# Pick one user (position 123 of userIds) and score five test products for them
user_rand = userIds[123]
test_rating = {}
for m in test.take(5):
    # Score with the *trained* ranking model. Instantiating RankingModel()
    # here would build a new network with freshly initialised weights, so the
    # resulting ranking would be unrelated to the training above.
    predicted = model.ranking_model(tf.convert_to_tensor([user_rand]),tf.convert_to_tensor([m["productId"]]))
    # Store a plain float so the sort below compares numbers, not tensors.
    test_rating[m["productId"].numpy()] = float(predicted.numpy().squeeze())



print("Top 5 recommended products for User {}: ".format(user_rand))
for m in sorted(test_rating, key=test_rating.get, reverse=True):
    print(m.decode())






In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

sns.set_style("ticks")

# Load the item catalogue (df5) and the reviews (df6)
df5 = pd.read_csv('../input/amazon-cell-phones-reviews/20191226-items.csv')
df6 = pd.read_csv('../input/amazon-cell-phones-reviews/20191226-reviews.csv')

# Rename the catalogue's 'rating' to 'avgRating' so it does not clash with
# the per-review 'rating' column, then drop columns that are not needed
df5.rename(columns={'rating': 'avgRating'}, inplace=True)
columns = ['url', 'reviewUrl', 'totalReviews', 'originalPrice']
df5.drop(columns=columns, inplace=True)

# Drop unneeded review columns before merging both datasets
columns = ['date', 'verified', 'title', 'body', 'helpfulVotes']
df6.drop(columns=columns, inplace=True)

df6

# Merge the two frames on 'asin' to create the training data for the SVD
ratings = pd.merge(df5, df6, how='inner', on='asin')

columns = ['brand', 'price', 'image', 'avgRating']
ratings.drop(columns=columns, inplace=True)

# Fix the column order, sort by 'name' and renumber the rows
ratings = (
    ratings[['name', 'asin', 'title', 'rating']]
    .sort_values(by=['name'], ascending=True)
    .reset_index(drop=True)
)

ratings


# name x asin table of ratings (NaN where a name has no rating for a product)
rating_columns = ['name', 'asin', 'rating']
matrix = ratings[rating_columns].pivot_table(values='rating', index='name', columns='asin')
matrix.head()

# Build the SVD model and a Surprise dataset, cross-validate, then refit
# on every available rating

svd = SVD()
reader = Reader()
data = Dataset.load_from_df(ratings[rating_columns], reader)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

trainset = data.build_full_trainset()
svd.fit(trainset)

# Function that takes in productId and userId as input and outputs up to top_n most similar products.
def hybrid_recommendations(userId, productId, top_n=15):
    """Rank the products most correlated with ``productId`` by the user's predicted rating.

    Reads the module-level ``matrix`` (ratings table with one column per
    product) and the fitted ``svd`` model.

    Args:
        userId: User to predict ratings for.
        productId: Column of ``matrix`` used as the seed product.
        top_n: Number of most-correlated products to consider. Defaults to
            15, the value the code used (its comments said 5).

    Returns:
        DataFrame with columns 'productId' and 'estimated_rating', sorted by
        estimated rating (highest first). It has fewer than ``top_n`` rows
        when some of the selected correlations are undefined.

    Raises:
        KeyError: If ``productId`` is not a column of ``matrix``.
    """
    # Correlate every product column with the seed column and keep the top_n
    # highest. sort_values puts NaN last, so dropping NaN after head() removes
    # exactly the entries the original string comparison against 'nan' skipped.
    # NOTE(review): the seed column is correlated with itself too, so it can
    # appear in the result -- confirm whether it should be excluded.
    correlations = matrix.corrwith(matrix[productId])
    top_correlated = correlations.sort_values(ascending=False).head(top_n).dropna()
    candidate_ids = list(top_correlated.index)

    # Predict the rating the user might give to each candidate.
    est_rating = [svd.predict(userId, iid=candidate, r_ui=None).est for candidate in candidate_ids]

    return pd.DataFrame({'productId': candidate_ids, 'estimated_rating': est_rating}).sort_values(by='estimated_rating', ascending=False).reset_index(drop=True)


# Example: recommendations for the reviewer name 'John', seeded with product 'B018OMP8ES'.
find_product = hybrid_recommendations('John', 'B018OMP8ES')

# Function that maps the productId to the productName
def product_mapping(recommender_df, ratings_df=None):
    """Replace product ids in a recommendations table with product titles.

    Args:
        recommender_df: DataFrame with 'productId' and 'estimated_rating'
            columns.
        ratings_df: DataFrame with 'asin' and 'title' columns used for the
            lookup. Defaults to the module-level ``ratings`` frame.

    Returns:
        DataFrame with columns 'product' and 'estimated_rating', one row per
        row of ``recommender_df`` and in the same order. A product id with no
        match in ``ratings_df`` gets a missing title instead of raising
        IndexError.
    """
    if ratings_df is None:
        ratings_df = ratings

    # Build the asin -> title table once (first title seen for each asin)
    # instead of scanning the whole frame for every recommended product.
    title_by_asin = ratings_df.drop_duplicates(subset='asin').set_index('asin')['title']
    titles = title_by_asin.reindex(recommender_df['productId']).tolist()

    return pd.DataFrame({
        "product": titles,
        "estimated_rating": list(recommender_df['estimated_rating']),
    })

# Test 1: titles for the recommendations stored in find_product
product_mapping(find_product)

# Test 2: recommend and map in one step (the user string ends with a space)
product_mapping(hybrid_recommendations('Peter ', 'B077T4MVZ6'))

# Test 3: same for another user/product pair (the user string ends with a space)
product_mapping(hybrid_recommendations('Sarah ', 'B081H6STQQ'))



# Save the model into a joblib file
# (`load` is imported but not used in the lines shown here)
from joblib import dump, load

# Writes the fitted svd object to 'svd.joblib' in the working directory
dump(svd, 'svd.joblib')









In [None]:
# To compute the accuracy of models
from surprise import accuracy

# Class is used to parse a file containing ratings, data should be in structure - user ; item ; rating
from surprise.reader import Reader

# Class for loading datasets
from surprise.dataset import Dataset

# For tuning model hyperparameters
from surprise.model_selection import GridSearchCV

# For splitting the rating data in train and test datasets
from surprise.model_selection import train_test_split

# For implementing similarity-based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic

# For implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD

# for implementing K-Fold cross-validation
from surprise.model_selection import KFold

# For implementing clustering-based recommendation system
from surprise import CoClustering


def precision_recall_at_k(model, k = 10, threshold = 3.5):
    """Print RMSE, precision@k, recall@k and F1 for ``model`` on the test set.

    Predictions are made on the module-level ``testset``. Precision and
    recall are computed per user and then averaged over users.

    Args:
        model: Fitted object whose ``test(testset)`` returns 5-field
            predictions (uid, iid, true rating, estimate, details).
        k: Number of top-estimated items considered per user.
        threshold: Minimum rating (true or estimated) counted as
            relevant / recommended.

    Returns:
        None. The metrics are printed.
    """
    # Imported here because the surrounding code never imports defaultdict.
    from collections import defaultdict

    # First map the predictions to each user
    user_est_true = defaultdict(list)

    # Making predictions on the test data
    predictions = model.test(testset)

    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value, highest first
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum(est >= threshold for est, _ in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
        )

        # Precision@K: proportion of recommended items that are relevant.
        # Undefined when nothing was recommended, so it is set to 0.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: proportion of relevant items that are recommended.
        # Undefined when nothing is relevant, so it is set to 0.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    # Mean over users of the per-user precisions and recalls
    precision = round(sum(precisions.values()) / len(precisions), 3)
    recall = round(sum(recalls.values()) / len(recalls), 3)

    accuracy.rmse(predictions)

    print('Precision: ', precision) # Command to print the overall precision

    print('Recall: ', recall) # Command to print the overall recall

    # F1 is the harmonic mean of precision and recall. It is reported as 0
    # when both are 0, which previously raised ZeroDivisionError.
    if precision + recall:
        f1_score = round((2 * precision * recall) / (precision + recall), 3)
    else:
        f1_score = 0.0
    print('F_1 score: ', f1_score)



# Reader configured for ratings between 1 and 5
reader = Reader(rating_scale=(1, 5))

# Wrap the (user, product, rating) columns in a Surprise dataset.
# Note that `df` is rebound from the DataFrame to the Surprise dataset here.
rating_columns = ['user_id', 'prod_id', 'rating']
df = Dataset.load_from_df(df[rating_columns], reader)

# Split into train and test sets.
# NOTE(review): test_size=0.7 holds out 70% of the ratings for testing -- confirm this is intended.
trainset, testset = train_test_split(df, test_size=0.7, random_state=42)



# User-user similarity measured with cosine distance
sim_options = dict(name='cosine', user_based=True)

# Baseline KNN model built from those similarity options
algo_knn_user = KNNBasic(sim_options=sim_options, verbose=False, random_state=1)

# Fit on the training split
algo_knn_user.fit(trainset)

# RMSE, precision@k, recall@k and F1 on the test split
precision_recall_at_k(algo_knn_user)


# Prediction for a user/product pair, passing 5 as the true rating
algo_knn_user.predict('A3LDPF5FMB782Z', '1400501466', r_ui=5, verbose=True)


# Users appearing in rows of df_final whose prod_id is not "1400501466"
other_product_rows = df_final['prod_id'] != "1400501466"
df_final.loc[other_product_rows, 'user_id'].unique()


# Prediction for another user and the same product, with no true rating given
algo_knn_user.predict('A34BZM6S9L7QI4', '1400501466', verbose=True)

# Hyperparameter grid: neighbourhood size, minimum neighbours, similarity measure
similarity_grid = {'name': ['msd', 'cosine', 'pearson'], 'user_based': [True]}
param_grid = {
    'k': [20, 30, 40],
    'min_k': [3, 6, 9],
    'sim_options': similarity_grid,
}

# Grid search scored by RMSE with 3-fold cross-validation, using all cores
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3, n_jobs=-1)

# Run the search on the full dataset
gs.fit(df)

# Best RMSE found
best_rmse = gs.best_score['rmse']
print(best_rmse)

# Parameter combination that produced it
best_rmse_params = gs.best_params['rmse']
print(best_rmse_params)



# Similarity settings used for the tuned user-user model
sim_options = dict(name='cosine', user_based=True)

# KNNBasic with the chosen hyperparameter values
similarity_algo_optimized = KNNBasic(
    sim_options=sim_options,
    k=40,
    min_k=6,
    verbose=False,
    random_state=1,
)

# Fit on the training split
similarity_algo_optimized.fit(trainset)

# RMSE, precision@k, recall@k and F1 with the default k = 10
precision_recall_at_k(similarity_algo_optimized)

# Tuned-model prediction for user "A3LDPF5FMB782Z" and product "1400501466"
similarity_algo_optimized.predict('A3LDPF5FMB782Z', '1400501466', r_ui=5, verbose=True)


# Tuned-model prediction for user "A34BZM6S9L7QI4" and the same product
similarity_algo_optimized.predict('A34BZM6S9L7QI4', '1400501466', verbose=True)


# Five nearest neighbours of inner id 0
# (presumably the inner id of the user above -- TODO confirm)
similarity_algo_optimized.get_neighbors(0, k=5)


def get_recommendations(data, user_id, top_n, algo):
    """Return the top-N unrated products for a user, ranked by predicted rating.

    Args:
        data: DataFrame with 'user_id', 'prod_id' and 'rating' columns.
        user_id: Raw id of the user to recommend for.
        top_n: Maximum number of (product, estimate) pairs to return.
        algo: Fitted model whose ``predict(user_id, item_id)`` result exposes
            the estimated rating as ``.est``.

    Returns:
        List of ``(prod_id, estimated_rating)`` tuples sorted by estimate in
        descending order, truncated to ``top_n``. Products with equal
        estimates keep ascending ``prod_id`` order.

    Raises:
        KeyError: If ``user_id`` does not occur in ``data['user_id']``.
    """
    # Work on this user's rows only instead of pivoting the whole frame:
    # a full user x product matrix is expensive to build, and pivot() fails
    # outright when a (user, product) pair occurs more than once.
    user_rows = data[data['user_id'] == user_id]
    if user_rows.empty:
        # Same exception type as looking the user up in a pivoted matrix.
        raise KeyError(user_id)

    # A product counts as "interacted" only if it carries an actual rating.
    rated_products = set(user_rows.loc[user_rows['rating'].notna(), 'prod_id'])

    # Sorted candidates give a deterministic tie order for equal estimates.
    all_products = sorted(data['prod_id'].dropna().unique().tolist())

    # Predict a rating for every product the user has not rated yet
    recommendations = [
        (item_id, algo.predict(user_id, item_id).est)
        for item_id in all_products
        if item_id not in rated_products
    ]

    # Stable sort: highest predicted rating first
    recommendations.sort(key=lambda pair: pair[1], reverse=True)

    return recommendations[:top_n]  # Top-n highest predicted ratings for this user


# Making top 5 recommendations for user_id "A3LDPF5FMB782Z" with a similarity-based recommendation engine
recommendations = get_recommendations(df_final, 'A3LDPF5FMB782Z', 5, algo_knn_user)


# Building the dataframe for above recommendations with columns "prod_id" and "predicted_ratings"
# (the label is spelled "prod_id" to match the column name used in df_final)
pd.DataFrame(recommendations, columns=['prod_id', 'predicted_ratings'])




In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MaxNLocator
from learntools.time_series.utils import plot_periodogram, seasonal_plot
from learntools.time_series.style import *
import seaborn as sns
from IPython.display import Markdown, display
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Helper: show a Markdown-formatted string as rich notebook output
def printmd(string):
    """Wrap ``string`` in an IPython ``Markdown`` object and display it."""
    rendered = Markdown(string)
    display(rendered)

# Load the ratings CSV from the '../input/amazon-product-reviews' folder.
# header=0 combined with names=... means the file's first row is treated as
# a header and replaced by these four column names.
# NOTE(review): if the file has no header row, its first rating is dropped.
comp_dir = Path('../input/amazon-product-reviews')
electronics_data = pd.read_csv(
    comp_dir / "ratings_Electronics (1).csv",
    header=0,
    names=['userId', 'productId', 'rating', 'timestamp'],
    index_col=None,
    dtype={'rating': 'int8'},
)

# Headline numbers: row count, column names, distinct users and products
printmd(f"Number of Rating: {electronics_data.shape[0]:,}")
printmd(f"Columns: {np.array2string(electronics_data.columns.values)}")
printmd(f"Number of Users: {len(electronics_data.userId.unique()):,}")
printmd(f"Number of Products: {len(electronics_data.productId.unique()):,}")

# Summary statistics of the rating column, as a two-column frame
electronics_data['rating'].describe().reset_index()

# Per-column count of missing values, with readable column labels
printmd('**Number of missing values**:')
pd.DataFrame(
    electronics_data.isna().sum().reset_index()
).rename(columns={"index": "Columns", 0: "Total missing"})

# Copy the ratings and turn the epoch-seconds timestamp into datetimes,
# then order the rows from newest to oldest.
data_by_date = electronics_data.copy()
data_by_date["timestamp"] = pd.to_datetime(electronics_data["timestamp"], unit="s")
data_by_date = data_by_date.sort_values("timestamp", ascending=False)
data_by_date = data_by_date.reset_index(drop=True)

# Rating counts for the ten latest timestamps (groupby sorts them ascending)
printmd("Number of Ratings each day:")
data_by_date.groupby("timestamp")["rating"].count().tail(10).reset_index()

# Calendar year and month of every rating
data_by_date["year"] = data_by_date["timestamp"].dt.year
data_by_date["month"] = data_by_date["timestamp"].dt.month

# Number of ratings per (year, month)
rating_by_year = data_by_date.groupby(["year", "month"])["rating"].count()
rating_by_year = rating_by_year.reset_index()

# First day of each month as a datetime, used as the x axis of the plot
rating_by_year["date"] = pd.to_datetime(
    rating_by_year["year"].astype(str)
    + "-"
    + rating_by_year["month"].astype(str)
    + "-1"
)
rating_by_year.plot(x="date", y="rating")
plt.title("Number of Ratings over Years")
plt.show()

# Per product: how many ratings it received and their mean
rating_by_product = electronics_data.groupby("productId").agg(
    {"userId": "count", "rating": "mean"}
)
rating_by_product = rating_by_product.rename(
    columns={"userId": "Number of Ratings", "rating": "Average Rating"}
)
rating_by_product = rating_by_product.reset_index()

# Keep products with more than `cutoff` ratings, best average first
cutoff = 50
top_rated = rating_by_product[rating_by_product["Number of Ratings"].gt(cutoff)]
top_rated = top_rated.sort_values("Average Rating", ascending=False)
top_rated = top_rated.reset_index(drop=True)

# Rating-prediction network: id embeddings followed by a small dense stack
class RankingModel(tf.keras.Model):
    """Predict a rating from a (userId, productId) pair.

    Each id is mapped to a 32-dimensional embedding through a string lookup
    built from the module-level ``unique_userIds`` / ``unique_productIds``
    vocabularies, which must exist when the model is instantiated. The two
    embeddings are concatenated and fed through Dense(256) -> Dense(64) ->
    Dense(1).
    """

    def __init__(self):
        super().__init__()
        embed_dim = 32
        string_lookup = tf.keras.layers.experimental.preprocessing.StringLookup

        def id_tower(vocabulary):
            # Lookup turns raw string ids into integer indices; the embedding
            # table has one row more than the vocabulary (presumably for the
            # lookup's out-of-vocabulary index -- confirm).
            return tf.keras.Sequential([
                string_lookup(vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1, embed_dim),
            ])

        # User tower first, then product tower
        self.user_embeddings = id_tower(unique_userIds)
        self.product_embeddings = id_tower(unique_productIds)

        # Dense head producing a single rating value per example
        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

    def call(self, userId, productId):
        """Return the dense head's output for the given user and product ids."""
        user_vec = self.user_embeddings(userId)
        product_vec = self.product_embeddings(productId)
        # Join the two embeddings along axis 1 before the dense layers
        joined = tf.concat([user_vec, product_vec], axis=1)
        return self.ratings(joined)

class amazonModel(tfrs.models.Model):
    """TFRS model pairing a RankingModel with an MSE ranking task.

    The task tracks root-mean-squared error as its metric.
    """

    def __init__(self):
        super().__init__()
        self.ranking_model = RankingModel()
        mse_loss = tf.keras.losses.MeanSquaredError()
        rmse_metric = tf.keras.metrics.RootMeanSquaredError()
        self.task = tfrs.tasks.Ranking(loss=mse_loss, metrics=[rmse_metric])

    def compute_loss(self, features, training=False):
        """Return the task loss for one batch.

        ``features`` is indexed by "userId", "productId" and "rating".
        ``training`` is accepted but not used in this method.
        """
        user_ids = features["userId"]
        product_ids = features["productId"]
        rating_predictions = self.ranking_model(user_ids, product_ids)
        return self.task(labels=features["rating"], predictions=rating_predictions)

# Restrict to ratings made after `cutoff_year`
cutoff_no_rat = 50
cutoff_year = 2011
recent_data = data_by_date[data_by_date["year"].gt(cutoff_year)]
print(f"Number of Rating: {recent_data.shape[0]:,}")
print(f"Number of Users: {len(recent_data.userId.unique()):,}")
print(f"Number of Products: {len(recent_data.productId.unique()):,}")
del data_by_date  # the full dated frame is no longer needed

# Keep only products with at least `cutoff_no_rat` recent ratings, then drop
# the date-related columns.
recent_prod = recent_data[
    recent_data.groupby("productId")["rating"].transform('count') >= cutoff_no_rat
]
recent_prod = recent_prod.reset_index(drop=True)
recent_prod = recent_prod.drop(columns=["timestamp", "year", "month"])
del recent_data  # release the intermediate frame as well

# Vocabularies and dataset size
userIds = recent_prod["userId"].unique()
productIds = recent_prod["productId"].unique()
total_ratings = recent_prod.shape[0]

# One dataset element per rating row.
# NOTE(review): ratings are fed as int8 labels; confirm the loss/metric cast
# them to float as expected.
ratings = tf.data.Dataset.from_tensor_slices(dict(
    userId=tf.cast(recent_prod["userId"].values, tf.string),
    productId=tf.cast(recent_prod["productId"].values, tf.string),
    rating=tf.cast(recent_prod["rating"].values, tf.int8),
))

# Seeded shuffle whose order is fixed across iterations
# (reshuffle_each_iteration=False), so take/skip yield disjoint train and
# test sets: first 80% of the shuffled rows for training, then 20% for test.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))

# Names that RankingModel reads as its lookup vocabularies
unique_productIds = productIds
unique_userIds = userIds

# Build the model and attach an Adagrad optimizer (learning rate 0.1)
model = amazonModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Batched, cached pipelines: training data is shuffled before batching
cached_train = train.shuffle(100_000)
cached_train = cached_train.batch(8192).cache()
cached_test = test.batch(4096)
cached_test = cached_test.cache()

# Train for ten epochs
model.fit(cached_train, epochs=10)

# Report the metrics on the held-out split as a dict
model.evaluate(cached_test, return_dict=True)

# Score five products from the test split for one sample user and print them
# from highest to lowest predicted rating.
user_rand = userIds[123]
test_rating = {}
for m in test.take(5):
    # Use the ranking network trained inside `model`; creating a new
    # RankingModel() here would score with untrained, randomly initialised
    # weights.
    test_rating[m["productId"].numpy()] = model.ranking_model(
        tf.convert_to_tensor([user_rand]), tf.convert_to_tensor([m["productId"]])
    )

print(f"Top 5 recommended products for User {user_rand}:")
# Keys are the raw product-id bytes taken from the dataset, hence decode()
for m in sorted(test_rating, key=test_rating.get, reverse=True):
    print(m.decode())


In [None]:
# Build an integer "complexity" score per row: the values of the columns in
# columnas_a_transformar are concatenated, in order, into a string of digits
# and parsed as a base-2 number (int(..., 2) raises ValueError for anything
# other than 0/1 digits). Computed once; repeating the statement would only
# recompute the same column.
catalogo['complexity'] = catalogo[columnas_a_transformar].apply(lambda x: int(''.join(map(str, x)), 2), axis=1)

# Recomendadores de TensorFlow para un sistema de recomendaciones potente
- [Articulo](https://medium.com/@pauloyc/tensorflow-recommenders-for-powerful-recommendation-system-e3dec138a07f)


# TensorFlow Recommenders for powerful recommendation system
- [Articulo](https://medium.com/@pauloyc/tensorflow-recommenders-for-powerful-recommendation-system-e3dec138a07f)


# Construyendo un sistema de recomendación de películas con Surprise y Python.
- [Articulo](https://monirah-abdulaziz.medium.com/building-movie-recommendation-system-with-surprise-and-python-e905de755c61)

In [1]:
# df_series_duration = merged_df_series.groupby('program_name')[['duration_seconds']].max()

# # average_watching
# merged_df_series["average_watching"] = merged_df_series.apply(lambda x: 1 if x['duration_seconds'] > df_series_duration.loc[x.program_name,'duration_seconds'] else x['duration_seconds']/df_series_duration.loc[x.program_name,'duration_seconds'], axis=1)
# # total_duration
# merged_df_series["total_duration"]= merged_df_series.apply(lambda x: df_series_duration.loc[x.program_name,'duration_seconds'],axis=1)


# # Extract minute by using (regex) and convert to appropriate type 
# merged_df_movie['total_duration'] = merged_df_movie['duration'].str.replace(r'min', '')
# merged_df_movie['duration_seconds'] = pd.to_numeric((merged_df_movie['duration_seconds']) , errors='coerce').astype('Int64')
# merged_df_movie['total_duration'] = pd.to_numeric((merged_df_movie['total_duration']) , errors='coerce').astype('Int64')

# # convert from min to sec
# merged_df_movie['total_duration'] = (merged_df_movie['total_duration']*60)

# merged_df_movie["duration_seconds"] = merged_df_movie.apply(lambda x: x['total_duration'] if x['duration_seconds'] > x['total_duration'] else x['duration_seconds'], axis=1)

# merged_df_movie['average_watching']=merged_df_movie['duration_seconds']/merged_df_movie['total_duration']

# reader = Reader(rating_scale=(0.03, 1.0))
# data = Dataset.load_from_df(df_data[['user_id', 'show_id', 'average_watching']], reader)
# benchmark = []

# # Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(),
#  KNNWithMeans(), KNNWithZScore(), BaselineOnly()]:
#  # Perform cross validation
#  results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

#  # Get results & append algorithm name
#  tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#  benchmark.append(tmp)


# print('Using ALS')
# bsl_options = {'method': 'als',
#  "random_state":250,
#  'n_epochs': 5,
#  'reg_u': 12,
#  'reg_i': 5
#  }
# algo = BaselineOnly(bsl_options)
# cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)


# trainset, testset = train_test_split(data, test_size=0.25)
# algo = BaselineOnly(bsl_options)
# predictions = algo.fit(trainset).test(testset)
# accuracy.rmse(predictions)


# # user_id is the 13618
# ratings = newdf.loc[newdf['user_id'] == 13618]
# # obtain the required data of this user
# ratings=ratings[['user_id', 'show_id', 'average_watching']]
# ratings

# # user_id is the 13618
# ratings = newdf.loc[newdf['user_id'] == 13618]
# # obtain the required data of this user
# ratings=ratings[['user_id', 'show_id', 'average_watching']]
# ratings

# # get the list of the movie ids
# unique_ids = newdf['show_id'].unique()
# # get the list of the ids that the userid 13618 has watched
# iids1001 = newdf.loc[newdf['user_id']==13618, 'show_id']
# # remove the rated movies for the recommendations
# movies_to_predict = np.setdiff1d(unique_ids,iids1001)

# algo = BaselineOnly(bsl_options)
# algo.fit(data.build_full_trainset())
# my_recs = []
# for iid in movies_to_predict:
#  my_recs.append((iid, algo.predict(uid='13618',iid=iid).est))
# pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

# The Magic of Recommendation Systems: How Netflix Knows What You Want to Watch
- [Articulo](https://www.linkedin.com/pulse/magic-recommendation-systems-how-netflix-knows-what-you-sachin-b9gzc/)

In [2]:
# # Sample code for user-based collaborative filtering
# from sklearn.metrics.pairwise import cosine_similarity

# # Calculate user similarity matrix
# user_similarity = cosine_similarity(user_ratings)

# # Predict a user's ratings based on similar users
# def predict_ratings(user_ratings, user_similarity):
#     return user_similarity.dot(user_ratings) / np.abs(user_similarity).sum(axis=1) 


# # Sample code for item-based collaborative filtering
# item_similarity = cosine_similarity(item_ratings.T)

# # Predict ratings for an item based on similar items
# def predict_item_ratings(item_ratings, item_similarity):
#     return item_ratings.dot(item_similarity) / np.abs(item_similarity).sum(axis=1) 


# # Sample code for content-based filtering
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import linear_kernel

# # Vectorize item descriptions using TF-IDF
# tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# tfidf_matrix = tfidf_vectorizer.fit_transform(item_descriptions)

# # Calculate similarity between items based on descriptions
# item_similarity_content = linear_kernel(tfidf_matrix, tfidf_matrix)


# # Sample code for a hybrid recommendation model
# hybrid_similarity = alpha * user_similarity + (1 - alpha) * item_similarity_content

# # Predict ratings using the hybrid similarity matrix
# def predict_hybrid_ratings(user_ratings, hybrid_similarity):
#     return hybrid_similarity.dot(user_ratings) / np.abs(hybrid_similarity).sum(axis=1) 



# # Sample code for a neural collaborative filtering model
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

# # Define neural collaborative filtering model
# user_input = Input(shape=(1,))
# item_input = Input(shape=(1,))
# user_embedding = Embedding(num_users, embedding_size)(user_input)
# item_embedding = Embedding(num_items, embedding_size)(item_input)
# merged_embeddings = Concatenate()([user_embedding, item_embedding])
# flatten = Flatten()(merged_embeddings)
# dense_layer = Dense(128, activation='relu')(flatten)
# output_layer = Dense(1)(dense_layer)

# neural_collab_filter = Model(inputs=[user_input, item_input], outputs=output_layer)




# Building and Testing Recommender Systems With Surprise, Step-By-Step
- [Articulo](https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b)
- [Notebook](https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Building%20Recommender%20System%20with%20Surprise.ipynb)

In [None]:
# user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# user.columns = ['userID', 'Location', 'Age']
# rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# rating.columns = ['userID', 'ISBN', 'bookRating']
# df = pd.merge(user, rating, on='userID', how='inner')
# df.drop(['Location', 'Age'], axis=1, inplace=True)
# df.head()



# from plotly.offline import init_notebook_mode, plot, iplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)

# data = df['bookRating'].value_counts().sort_index(ascending=False)
# trace = go.Bar(x = data.index,
#                text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
#                textposition = 'auto',
#                textfont = dict(color = '#000000'),
#                y = data.values,
#                )
# # Create layout
# layout = dict(title = 'Distribution Of {} book-ratings'.format(df.shape[0]),
#               xaxis = dict(title = 'Rating'),
#               yaxis = dict(title = 'Count'))
# # Create plot
# fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)





# # Number of ratings per book
# data = df.groupby('ISBN')['bookRating'].count().clip(upper=50)

# # Create trace
# trace = go.Histogram(x = data.values,
#                      name = 'Ratings',
#                      xbins = dict(start = 0,
#                                   end = 50,
#                                   size = 2))
# # Create layout
# layout = go.Layout(title = 'Distribution Of Number of Ratings Per Book (Clipped at 50)',
#                    xaxis = dict(title = 'Number of Ratings Per Book'),
#                    yaxis = dict(title = 'Count'),
#                    bargap = 0.2)

# # Create plot
# fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)



# df.groupby('ISBN')['bookRating'].count().reset_index().sort_values('bookRating', ascending=False)[:10]





# # Number of ratings per user
# data = df.groupby('userID')['bookRating'].count().clip(upper=50)

# # Create trace
# trace = go.Histogram(x = data.values,
#                      name = 'Ratings',
#                      xbins = dict(start = 0,
#                                   end = 50,
#                                   size = 2))
# # Create layout
# layout = go.Layout(title = 'Distribution Of Number of Ratings Per User (Clipped at 50)',
#                    xaxis = dict(title = 'Ratings Per User'),
#                    yaxis = dict(title = 'Count'),
#                    bargap = 0.2)

# # Create plot
# fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)



# df.groupby('userID')['bookRating'].count().reset_index().sort_values('bookRating', ascending=False)[:10]



# min_book_ratings = 50
# filter_books = df['ISBN'].value_counts() > min_book_ratings
# filter_books = filter_books[filter_books].index.tolist()

# min_user_ratings = 50
# filter_users = df['userID'].value_counts() > min_user_ratings
# filter_users = filter_users[filter_users].index.tolist()

# df_new = df[(df['ISBN'].isin(filter_books)) & (df['userID'].isin(filter_users))]
# print('The original data frame shape:\t{}'.format(df.shape))
# print('The new data frame shape:\t{}'.format(df_new.shape))



# reader = Reader(rating_scale=(0, 9))
# data = Dataset.load_from_df(df_new[['userID', 'ISBN', 'bookRating']], reader)



# benchmark = []
# # Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
#     # Perform cross validation
#     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)
    
# pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    



# print('Using ALS')
# bsl_options = {'method': 'als',
#                'n_epochs': 5,
#                'reg_u': 12,
#                'reg_i': 5
#                }
# algo = BaselineOnly(bsl_options=bsl_options)
# cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)  



# trainset, testset = train_test_split(data, test_size=0.25)
# algo = BaselineOnly(bsl_options=bsl_options)
# predictions = algo.fit(trainset).test(testset)
# accuracy.rmse(predictions)





# def get_Iu(uid):
#     """ return the number of items rated by given user
#     args: 
#       uid: the id of the user
#     returns: 
#       the number of items rated by the user
#     """
#     try:
#         return len(trainset.ur[trainset.to_inner_uid(uid)])
#     except ValueError: # user was not part of the trainset
#         return 0
    
# def get_Ui(iid):
#     """ return number of users that have rated given item
#     args:
#       iid: the raw id of the item
#     returns:
#       the number of users that have rated the item.
#     """
#     try: 
#         return len(trainset.ir[trainset.to_inner_iid(iid)])
#     except ValueError:
#         return 0
    
# df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
# df['Iu'] = df.uid.apply(get_Iu)
# df['Ui'] = df.iid.apply(get_Ui)
# df['err'] = abs(df.est - df.rui)
# best_predictions = df.sort_values(by='err')[:10]
# worst_predictions = df.sort_values(by='err')[-10:]



# import matplotlib.pyplot as plt
# %matplotlib notebook
# df_new.loc[df_new['ISBN'] == '055358264X']['bookRating'].hist()
# plt.xlabel('rating')
# plt.ylabel('Number of ratings')
# plt.title('Number of ratings book ISBN 055358264X has received')
# plt.show();

In [None]:
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from sklearn.preprocessing import MinMaxScaler

# rating = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# user = pd.read_csv('data/BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# book = pd.read_csv('data/BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# book_rating = pd.merge(rating, book, on='ISBN')
# cols = ['Year-Of-Publication', 'Publisher', 'Book-Author', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
# book_rating.drop(cols, axis=1, inplace=True)

# rating_count = (book_rating.
#      groupby(by = ['Book-Title'])['Book-Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Book-Rating': 'RatingCount_book'})
#      [['Book-Title', 'RatingCount_book']]
#     )
    
# threshold = 25
# rating_count = rating_count.query('RatingCount_book >= @threshold')

# user_rating = pd.merge(rating_count, book_rating, left_on='Book-Title', right_on='Book-Title', how='left')

# user_count = (user_rating.
#      groupby(by = ['User-ID'])['Book-Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Book-Rating': 'RatingCount_user'})
#      [['User-ID', 'RatingCount_user']]
#     )
    
# threshold = 20
# user_count = user_count.query('RatingCount_user >= @threshold')

# combined = user_rating.merge(user_count, left_on = 'User-ID', right_on = 'User-ID', how = 'inner')

# print('Number of unique books: ', combined['Book-Title'].nunique())
# print('Number of unique users: ', combined['User-ID'].nunique())


# scaler = MinMaxScaler()
# combined['Book-Rating'] = combined['Book-Rating'].values.astype(float)
# rating_scaled = pd.DataFrame(scaler.fit_transform(combined['Book-Rating'].values.reshape(-1,1)))
# combined['Book-Rating'] = rating_scaled


# combined = combined.drop_duplicates(['User-ID', 'Book-Title'])
# user_book_matrix = combined.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
# user_book_matrix.fillna(0, inplace=True)
# users = user_book_matrix.index.tolist()
# books = user_book_matrix.columns.tolist()
# user_book_matrix = user_book_matrix.as_matrix()


# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()


# num_input = combined['Book-Title'].nunique()
# num_hidden_1 = 10
# num_hidden_2 = 5

# X = tf.placeholder(tf.float64, [None, num_input])

# weights = {
#     'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
#     'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
#     'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
#     'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
# }

# biases = {
#     'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
#     'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
#     'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
#     'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
# }



# def encoder(x):
#     layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
#     layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
#     return layer_2

# def decoder(x):
#     layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
#     layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
#     return layer_2


# encoder_op = encoder(X)
# decoder_op = decoder(encoder_op)
# y_pred = decoder_op
# y_true = X


# loss = tf.losses.mean_squared_error(y_true, y_pred)
# optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
# eval_x = tf.placeholder(tf.int32, )
# eval_y = tf.placeholder(tf.int32, )
# pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

# init = tf.global_variables_initializer()
# local_init = tf.local_variables_initializer()
# pred_data = pd.DataFrame()




# with tf.Session() as session:
#     epochs = 100
#     batch_size = 35

#     session.run(init)
#     session.run(local_init)

#     num_batches = int(user_book_matrix.shape[0] / batch_size)
#     user_book_matrix = np.array_split(user_book_matrix, num_batches)
    
#     for i in range(epochs):

#         avg_cost = 0
#         for batch in user_book_matrix:
#             _, l = session.run([optimizer, loss], feed_dict={X: batch})
#             avg_cost += l

#         avg_cost /= num_batches

#         print("epoch: {} Loss: {}".format(i + 1, avg_cost))

#     user_book_matrix = np.concatenate(user_book_matrix, axis=0)

#     preds = session.run(decoder_op, feed_dict={X: user_book_matrix})

#     pred_data = pred_data.append(pd.DataFrame(preds))

#     pred_data = pred_data.stack().reset_index(name='Book-Rating')
#     pred_data.columns = ['User-ID', 'Book-Title', 'Book-Rating']
#     pred_data['User-ID'] = pred_data['User-ID'].map(lambda value: users[value])
#     pred_data['Book-Title'] = pred_data['Book-Title'].map(lambda value: books[value])
    
#     keys = ['User-ID', 'Book-Title']
#     index_1 = pred_data.set_index(keys).index
#     index_2 = combined.set_index(keys).index

#     top_ten_ranked = pred_data[~index_1.isin(index_2)]
#     top_ten_ranked = top_ten_ranked.sort_values(['User-ID', 'Book-Rating'], ascending=[True, False])
#     top_ten_ranked = top_ten_ranked.groupby('User-ID').head(10)


# top_ten_ranked.loc[top_ten_ranked['User-ID'] == 278582]

# book_rating.loc[book_rating['User-ID'] == 278582].sort_values(by=['Book-Rating'], ascending=False)

# Building A Collaborative Filtering Recommender System with TensorFlow
- [Building A Collaborative Filtering Recommender System with TensorFlow](https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-tensorflow-82e63d27b420)

In [3]:

# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from sklearn.preprocessing import MinMaxScaler

# rating = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# user = pd.read_csv('data/BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# book = pd.read_csv('data/BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# book_rating = pd.merge(rating, book, on='ISBN')
# cols = ['Year-Of-Publication', 'Publisher', 'Book-Author', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
# book_rating.drop(cols, axis=1, inplace=True)

# rating_count = (book_rating.
#      groupby(by = ['Book-Title'])['Book-Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Book-Rating': 'RatingCount_book'})
#      [['Book-Title', 'RatingCount_book']]
#     )
    
# threshold = 25
# rating_count = rating_count.query('RatingCount_book >= @threshold')

# user_rating = pd.merge(rating_count, book_rating, left_on='Book-Title', right_on='Book-Title', how='left')

# user_count = (user_rating.
#      groupby(by = ['User-ID'])['Book-Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Book-Rating': 'RatingCount_user'})
#      [['User-ID', 'RatingCount_user']]
#     )
    
# threshold = 20
# user_count = user_count.query('RatingCount_user >= @threshold')

# combined = user_rating.merge(user_count, left_on = 'User-ID', right_on = 'User-ID', how = 'inner')

# print('Number of unique books: ', combined['Book-Title'].nunique())
# print('Number of unique users: ', combined['User-ID'].nunique())




# scaler = MinMaxScaler()
# combined['Book-Rating'] = combined['Book-Rating'].values.astype(float)
# rating_scaled = pd.DataFrame(scaler.fit_transform(combined['Book-Rating'].values.reshape(-1,1)))
# combined['Book-Rating'] = rating_scaled


# combined = combined.drop_duplicates(['User-ID', 'Book-Title'])
# user_book_matrix = combined.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
# user_book_matrix.fillna(0, inplace=True)
# users = user_book_matrix.index.tolist()
# books = user_book_matrix.columns.tolist()
# user_book_matrix = user_book_matrix.as_matrix()


# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()



# num_input = combined['Book-Title'].nunique()
# num_hidden_1 = 10
# num_hidden_2 = 5

# X = tf.placeholder(tf.float64, [None, num_input])

# weights = {
#     'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
#     'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
#     'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
#     'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
# }

# biases = {
#     'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
#     'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
#     'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
#     'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
# }



# def encoder(x):
#     layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
#     layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
#     return layer_2

# def decoder(x):
#     layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
#     layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
#     return layer_2


# encoder_op = encoder(X)
# decoder_op = decoder(encoder_op)
# y_pred = decoder_op
# y_true = X


# loss = tf.losses.mean_squared_error(y_true, y_pred)
# optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
# eval_x = tf.placeholder(tf.int32, )
# eval_y = tf.placeholder(tf.int32, )
# pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

# init = tf.global_variables_initializer()
# local_init = tf.local_variables_initializer()
# pred_data = pd.DataFrame()






# with tf.Session() as session:
#     epochs = 100
#     batch_size = 35

#     session.run(init)
#     session.run(local_init)

#     num_batches = int(user_book_matrix.shape[0] / batch_size)
#     user_book_matrix = np.array_split(user_book_matrix, num_batches)
    
#     for i in range(epochs):
#         avg_cost = 0
#         for batch in user_book_matrix:
#             _, l = session.run([optimizer, loss], feed_dict={X: batch})
#             avg_cost += l
#         avg_cost /= num_batches

#         print("epoch: {} Loss: {}".format(i + 1, avg_cost))

#     user_book_matrix = np.concatenate(user_book_matrix, axis=0)
#     preds = session.run(decoder_op, feed_dict={X: user_book_matrix})
#     pred_data = pred_data.append(pd.DataFrame(preds))

#     pred_data = pred_data.stack().reset_index(name='Book-Rating')
#     pred_data.columns = ['User-ID', 'Book-Title', 'Book-Rating']
#     pred_data['User-ID'] = pred_data['User-ID'].map(lambda value: users[value])
#     pred_data['Book-Title'] = pred_data['Book-Title'].map(lambda value: books[value])
    
#     keys = ['User-ID', 'Book-Title']
#     index_1 = pred_data.set_index(keys).index
#     index_2 = combined.set_index(keys).index

#     top_ten_ranked = pred_data[~index_1.isin(index_2)]
#     top_ten_ranked = top_ten_ranked.sort_values(['User-ID', 'Book-Rating'], ascending=[True, False])
#     top_ten_ranked = top_ten_ranked.groupby('User-ID').head(10)



# top_ten_ranked.loc[top_ten_ranked['User-ID'] == 278582]

# book_rating.loc[book_rating['User-ID'] == 278582].sort_values(by=['Book-Rating'], ascending=False)

# Building a Content Based Recommender System for Hotels in Seattle
- [Artículo](https://towardsdatascience.com/building-a-content-based-recommender-system-for-hotels-in-seattle-d724f0a32070)
> Por lo tanto, nuestro conjunto de datos final contiene 3192 usuarios para 5850 libros. Y cada usuario ha otorgado al menos 20 calificaciones y cada libro ha recibido al menos 25 calificaciones. Si no tienes una GPU, este sería un buen tamaño.

In [4]:
# import pandas as pd
# import numpy as np
# from nltk.corpus import stopwords
# from sklearn.metrics.pairwise import linear_kernel
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# import re
# import random
# import plotly.graph_objs as go
# import plotly.plotly as py
# import cufflinks
# from IPython.core.interactiveshell import InteractiveShell
# import plotly.figure_factory as ff
# from plotly.offline import iplot


# pd.options.display.max_columns = 30
# InteractiveShell.ast_node_interactivity = 'all'
# cufflinks.go_offline()
# cufflinks.set_config_file(world_readable=True, theme='solar')
# df = pd.read_csv('Seattle_Hotels.csv', encoding="latin-1")
# df.head()
# print('We have ', len(df), 'hotels in the data')



# def print_description(index):
#     example = df[df.index == index][['desc', 'name']].values[0]
#     if len(example) > 0:
#         print(example[0])
#         print('Name:', example[1])


# print_description(10)
# print_description(100)



# def get_top_n_words(corpus, n=None):
#     vec = CountVectorizer().fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]
# common_words = get_top_n_words(df['desc'], 20)
# df1 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
# df1.groupby('desc').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 20 words in hotel description before removing stop words')




# def get_top_n_words(corpus, n=None):
#     vec = CountVectorizer(stop_words='english').fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]
# common_words = get_top_n_words(df['desc'], 20)
# df2 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
# df2.groupby('desc').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 20 words in hotel description after removing stop words')




# def get_top_n_bigram(corpus, n=None):
#     vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]
# common_words = get_top_n_bigram(df['desc'], 20)
# df3 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
# df3.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in hotel description before removing stop words')




# def get_top_n_bigram(corpus, n=None):
#     vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]
# common_words = get_top_n_bigram(df['desc'], 20)
# df4 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
# df4.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in hotel description After removing stop words')






# def get_top_n_trigram(corpus, n=None):
#     vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]
# common_words = get_top_n_trigram(df['desc'], 20)
# df5 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
# df5.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in hotel description before removing stop words')





# def get_top_n_trigram(corpus, n=None):
#     vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]
# common_words = get_top_n_trigram(df['desc'], 20)
# df6 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
# df6.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in hotel description after removing stop words')



# df['word_count'] = df['desc'].apply(lambda x: len(str(x).split()))
# desc_lengths = list(df['word_count'])
# print("Number of descriptions:",len(desc_lengths),
#       "\nAverage word count", np.average(desc_lengths),
#       "\nMinimum word count", min(desc_lengths),
#       "\nMaximum word count", max(desc_lengths))


# df['word_count'].iplot(
#     kind='hist',
#     bins = 50,
#     linecolor='black',
#     xTitle='word count',
#     yTitle='count',
#     title='Word Count Distribution in Hotel Description')


# REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
# BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# STOPWORDS = set(stopwords.words('english'))

# def clean_text(text):
#     """
#         text: a string
        
#         return: modified initial string
#     """
#     text = text.lower() # lowercase text
#     text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
#     text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#     text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
#     return text
    
# df['desc_clean'] = df['desc'].apply(clean_text)





# df.set_index('name', inplace = True)
# tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
# tfidf_matrix = tf.fit_transform(df['desc_clean'])
# cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# indices = pd.Series(df.index)

# def recommendations(name, cosine_similarities = cosine_similarities):
    
#     recommended_hotels = []
    
#     # getting the index of the hotel that matches the name
#     idx = indices[indices == name].index[0]

#     # creating a Series with the similarity scores in descending order
#     score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

#     # getting the indexes of the 10 most similar hotels except itself
#     top_10_indexes = list(score_series.iloc[1:11].index)
    
#     # populating the list with the names of the top 10 matching hotels
#     for i in top_10_indexes:
#         recommended_hotels.append(list(df.index)[i])
        
#     return recommended_hotels



# recommendations('Hilton Seattle Airport & Conference Center')

In [None]:
# import sys
# import pandas as pd
# import numpy as np
# import scipy.sparse as sparse
# from scipy.sparse.linalg import spsolve
# import random
# from sklearn import metrics
# from sklearn.preprocessing import MinMaxScaler
# import implicit

# retail_df = pd.read_excel('data/Online Retail.xlsx')
# retail_df.info()





# retail_df = retail_df[retail_df['CustomerID'].notna()]
# grouped_df = retail_df[['CustomerID', 'StockCode', 'Description', 'Quantity']].groupby(['CustomerID', 'StockCode', 'Description']).sum().reset_index()
# grouped_df.loc[grouped_df['Quantity'] == 0, ['Quantity']] = 1
# grouped_df = grouped_df.loc[grouped_df['Quantity'] > 0]




# import plotly.express as px

# fig = px.histogram(grouped_df, x='Quantity', title='Distribution of the purchase quantity', nbins=500)
# fig.show();



# print(f'Number of unique customers: {grouped_df.CustomerID.nunique()}')
# print(f'Number of unique items: {grouped_df.StockCode.nunique()}')

# print(f'Average purchase quantity per interaction: {int(grouped_df.Quantity.mean())}')
# print(f'Minimum purchase quantity per interaction: {grouped_df.Quantity.min()}')
# print(f'Maximum purchase quantity per interaction: {grouped_df.Quantity.max()}')



# unique_customers = grouped_df.CustomerID.unique()
# customer_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))

# unique_items = grouped_df.StockCode.unique()
# item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

# grouped_df['customer_id'] = grouped_df.CustomerID.apply(lambda i: customer_ids[i])
# grouped_df['item_id'] = grouped_df.StockCode.apply(lambda i: item_ids[i])

# sparse_item_customer = sparse.csr_matrix((grouped_df['Quantity'].astype(float), (grouped_df['item_id'], grouped_df['customer_id'])))
# sparse_customer_item = sparse.csr_matrix((grouped_df['Quantity'].astype(float), (grouped_df['customer_id'], grouped_df['item_id'])))

# model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

# alpha = 15
# data = (sparse_item_customer * alpha).astype('double')

# model.fit(data)





# grouped_df.loc[grouped_df['item_id'] == 1319].head()


# item_id = 1319
# n_similar = 10

# item_vecs = model.item_factors
# customer_vecs = model.user_factors

# item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

# scores = item_vecs.dot(item_vecs[item_id]) / item_norms
# top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
# similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])

# for item in similar:
#     idx, score = item
#     print(grouped_df.Description.loc[grouped_df.item_id == idx].iloc[0])




# def recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs, num_items=10):
    
#     customer_interactions = sparse_customer_item[customer_id,:].toarray()
#     customer_interactions = customer_interactions.reshape(-1) + 1
#     customer_interactions[customer_interactions > 1] = 0
    
#     rec_vector = customer_vecs[customer_id,:].dot(item_vecs.T).toarray()
    
#     min_max = MinMaxScaler()
#     rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
#     recommend_vector = customer_interactions * rec_vector_scaled

#     item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    
#     descriptions = []
#     scores = []

#     for idx in item_idx:
#         descriptions.append(grouped_df.Description.loc[grouped_df.item_id == idx].iloc[0])
#         scores.append(recommend_vector[idx])

#     recommendations = pd.DataFrame({'description': descriptions, 'score': scores})

#     return recommendations
    
# customer_vecs = sparse.csr_matrix(model.user_factors)
# item_vecs = sparse.csr_matrix(model.item_factors)
# # Create recommendations for customer with id 2
# customer_id = 2
# recommendations = recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs)

# print(recommendations)





# grouped_df.loc[grouped_df['customer_id'] == 2].sort_values('Quantity', ascending=False)[['customer_id', 'Description', 'Quantity']].head(20)

# Build Recommendation Systems: OpenAI’s Embeddings, Matrix Factorization and Deep Learning
- [Artículo](https://medium.com/@chenycy/build-recommendation-systems-openais-embeddings-matrix-factorization-and-deep-learning-0cac62008f0c)

In [6]:
# from sklearn.metrics.pairwise import linear_kernel
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd
# import numpy as np

# #Import data from the clean file 
# df = pd.read_csv('metadata_clean.csv')
# orig_df = pd.read_csv('movies_metadata.csv')
# df['overview'], df['id'] = orig_df['overview'], orig_df['id']
# df




# from openai import OpenAI
# import pandas as pd
# import numpy as np
# from dotenv import load_dotenv


# load_dotenv() 
# #load_dotenv() is a function that loads variables from a .env file into environment variables in a Python script. 
# # We store OPENAI_API_KEY = xxx in .env file

# client = OpenAI()

# MODEL_NAME = "text-embedding-ada-002"

# def get_embedding(text, model=MODEL_NAME):
#     if not isinstance(text, str):
#         text = str(text)
#     text = text.replace("\n", " ")
#     return client.embeddings.create(input=[text], model=model).data[0].embedding

# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# def search_docs(df, user_query, threshold=0.8):
#     embedding = get_embedding(user_query, model=MODEL_NAME)
#     df["similarities"] = df.embedding.apply(lambda x: cosine_similarity(x, embedding))
#     # Filter results based on the threshold
#     filtered_results = df[df["similarities"] > threshold]
#     return filtered_results


# df['embedding'] = df['overview'].apply(lambda x: get_embedding(x, model=MODEL_NAME))
# title = "Toy Story"
# description = df.loc[df["title"] == title, "overview"].iloc[0]
# result = search_docs(df, description, threshold=0.8)
# #remove the search item
# print(result[result["title"] != title]['title'])



# #Define a TF-IDF Vectorizer Object. Remove all english stopwords
# tfidf = TfidfVectorizer(stop_words='english')
# #Replace NaN with an empty string
# df['overview'] = df['overview'].fillna('')
# #Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
# tfidf_matrix = tfidf.fit_transform(df['overview'])
# # Compute the cosine similarity matrix
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
# # Obtain the index of the movie that matches the title
#   idx = indices[title]
#   # Get the pairwise similarity scores of all movies with that movie
#   # And convert it into a list of tuples as described above
#   sim_scores = list(enumerate(cosine_sim[idx]))
#   # Sort the movies based on the cosine similarity scores
#   sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#   # Get the scores of the 10 most similar movies. Ignore the first movie.
#   sim_scores = sim_scores[1:11]
#   # Get the movie indices
#   movie_indices = [i[0] for i in sim_scores]
#   # Return the top 10 most similar movies
#   return df['title'].iloc[movie_indices]

# #Get recommendations
# content_recommender('The Shawshank Redemption')





# from surprise import Dataset, Reader, KNNBasic
# from surprise.model_selection import cross_validate
# import os

# # path to dataset file
# file_path = os.path.expanduser("ml-1m/ratings.dat")

# # As we're loading a custom dataset, we need to define a reader. In the
# # MovieLens 1M (ml-1m) dataset, each line has the following format:
# # 'user item rating timestamp', separated by '::' characters.
# columns = ['user_id', 'item_id', 'rating', 'timestamp']
# reader = Reader(line_format="user item rating timestamp", sep="::")

# data = Dataset.load_from_file(file_path, reader=reader)
# trainset = data.build_full_trainset()

# sim_options = {'name': 'cosine', 'user_based': True}
# knn_model = KNNBasic(sim_options=sim_options)

# knn_model.fit(trainset)
# user_id = str(196)  # Replace with the desired user ID

# # Get items that the user has not rated
# items_to_predict = [(user_id, iid, 4.0) for iid in trainset.all_items() if iid not in trainset.ur[trainset.to_inner_uid(user_id)]]
# # Get top N recommendations for the user
# top_n = knn_model.test(items_to_predict)[0:11]

# # Display the top N recommendations
# for uid, iid, true_r, est, _ in top_n:
#     print(f"User {uid} -> Item {iid} (Predicted rating: {est:.2f})")




# from surprise import Dataset, Reader, KNNBasic
# from surprise.model_selection import cross_validate
# import os

# # path to dataset file
# file_path = os.path.expanduser("ml-1m/ratings.dat")

# # As we're loading a custom dataset, we need to define a reader. In the
# # MovieLens 1M (ml-1m) dataset, each line has the following format:
# # 'user item rating timestamp', separated by '::' characters.
# columns = ['user_id', 'item_id', 'rating', 'timestamp']
# reader = Reader(line_format="user item rating timestamp", sep="::")

# data = Dataset.load_from_file(file_path, reader=reader)
# trainset = data.build_full_trainset()

# sim_options = {'name': 'cosine', 'user_based': False}
# knn_model = KNNBasic(sim_options=sim_options)

# knn_model.fit(trainset)

# user_id = str(196)  # Replace with the desired user ID

# # Get items that the user has not rated
# items_to_predict = [(user_id, iid, 4.0) for iid in trainset.all_items() if iid not in trainset.ur[trainset.to_inner_uid(user_id)]]

# # Get top N recommendations for the user
# top_n = knn_model.test(items_to_predict)[0:11]

# # Display the top N recommendations
# for uid, iid, true_r, est, _ in top_n:
#     print(f"User {uid} -> Item {iid} (Predicted rating: {est:.2f})")







# from surprise import Dataset, Reader, SVD
# from surprise.model_selection import train_test_split
# from surprise.accuracy import rmse

# # path to dataset file
# file_path = os.path.expanduser("ml-1m/ratings.dat")

# # As we're loading a custom dataset, we need to define a reader. In the
# # MovieLens 1M (ml-1m) dataset, each line has the following format:
# # 'user item rating timestamp', separated by '::' characters.
# columns = ['user_id', 'item_id', 'rating', 'timestamp']
# reader = Reader(line_format="user item rating timestamp", sep="::")

# data = Dataset.load_from_file(file_path, reader=reader)
# full_data = data.build_full_trainset()

# train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# # Use SVD (matrix factorization) for collaborative filtering
# svd_model = SVD()  # SVD takes no sim_options; user_based only applies to the KNN models

# # Train the model on the training set
# svd_model.fit(train_set)

# # Make predictions on the test set
# predictions = svd_model.test(test_set)

# # Evaluate the model using RMSE
# accuracy = rmse(predictions)
# print(f"RMSE on the test set: {accuracy:.4f}")

# from collections import defaultdict
# def get_top_n(predictions, n=10):
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))
#     for uid, user_ratings in top_n.items():
#             user_ratings.sort(key=lambda x: x[1], reverse=True)
#             top_n[uid] = user_ratings[:n]
#     return top_n

# top_n = get_top_n(predictions, n=10)
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])





# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.layers import Input, Dense
# from tensorflow.keras.models import Model

# # Load the MovieLens dataset (download it from https://grouplens.org/datasets/movielens/)
# file_path = 'ml-1m/ratings.dat'
# columns = ['user_id', 'item_id', 'rating', 'timestamp']
# df = pd.read_csv(file_path, sep='\t', names=columns)

# # Create user-item interaction matrix
# user_item_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0).values

# # Split the data into training and testing sets
# train_data, test_data = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

# # Build the autoencoder model
# num_users, num_items = user_item_matrix.shape
# latent_dim = 50

# input_layer = Input(shape=(num_items,))
# encoded = Dense(latent_dim, activation='relu')(input_layer)
# decoded = Dense(num_items, activation='sigmoid')(encoded)

# autoencoder = Model(inputs=input_layer, outputs=decoded)
# autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# # Train the autoencoder
# autoencoder.fit(train_data, train_data, epochs=10, batch_size=64, shuffle=True, validation_data=(test_data, test_data))

# # Extract user representations from the encoder part of the autoencoder
# encoder = Model(inputs=input_layer, outputs=encoded)
# user_embeddings = encoder.predict(user_item_matrix)

# # Example: Recommend items for a specific user
# user_id = 1  # Replace with the desired user ID
# user_representation = user_embeddings[user_id - 1]

# # Score every user's embedding against the selected user's embedding (NOTE: this yields one score per user, not a predicted rating per item)
# predicted_ratings = np.dot(user_embeddings, user_representation)

# # Display top N recommendations
# top_n = np.argsort(predicted_ratings)[::-1][:10]
# print(f"Top recommendations for User {user_id}: {top_n + 1}")






# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense
# from tensorflow.keras.models import Model
# from tensorflow.keras.optimizers import Adam

# # Load the MovieLens dataset (download it from https://grouplens.org/datasets/movielens/)
# file_path = 'ml-1m/ratings.dat'
# columns = ['user_id', 'item_id', 'rating', 'timestamp']
# df = pd.read_csv(file_path, sep='\t', names=columns)

# # Create user-item interaction matrix
# user_item_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0).values

# # Binarize the ratings (0 if not rated, 1 if rated)
# user_item_matrix_binary = (user_item_matrix > 0).astype(float)

# # Split the data into training and testing sets
# train_data, test_data = train_test_split(user_item_matrix_binary, test_size=0.2, random_state=42)

# # RBM parameters
# num_visible = num_items = user_item_matrix_binary.shape[1]
# num_hidden = 50
# batch_size = 64
# epochs = 10

# # Build the RBM model
# visible_layer = Input(shape=(num_visible,))
# hidden_layer = Dense(num_hidden, activation='sigmoid')(visible_layer)
# visible_layer_reconstruction = Dense(num_visible, activation='sigmoid')(hidden_layer)

# rbm = Model(inputs=visible_layer, outputs=visible_layer_reconstruction)
# rbm.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# # Train the RBM
# rbm.fit(train_data, train_data, epochs=epochs, batch_size=batch_size, shuffle=True, validation_data=(test_data, test_data))

# # Extract user and item representations from the hidden layer
# user_embeddings = rbm.layers[1].get_weights()[0].T
# item_embeddings = rbm.layers[1].get_weights()[0]

# # Example: Recommend items for a specific user
# user_id = 1  # Replace with the desired user ID
# user_representation = user_embeddings[user_id - 1]

# # Calculate the predicted ratings for all items
# predicted_ratings = np.dot(user_embeddings, user_representation)

# # Display top N recommendations
# top_n = np.argsort(predicted_ratings)[::-1][:10]
# print(f"Top recommendations for User {user_id}: {top_n + 1}")





# Collaborative Filtering Model with Tensorflow

In [None]:
# # The collaborative filtering approach focuses on finding users who have given similar ratings to the same books, thus creating a link between users, who are then recommended books that similar users reviewed positively.
# # In this way, we look for associations between users, not between books.

# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from sklearn.preprocessing import MinMaxScaler




# rating = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# user = pd.read_csv('data/BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# book = pd.read_csv('data/BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# book_rating = pd.merge(rating, book, on='ISBN')
# cols = ['Year-Of-Publication', 'Publisher', 'Book-Author', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
# book_rating.drop(cols, axis=1, inplace=True)
# book_rating.head()



# book_rating.head(3)



# rating_count = (book_rating.
#      groupby(by = ['Book-Title'])['Book-Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Book-Rating': 'RatingCount_book'})
#      [['Book-Title', 'RatingCount_book']]
#     )
# rating_count.head()





# threshold = 25
# rating_count = rating_count.query('RatingCount_book >= @threshold')
# rating_count.head()




# book_rating.head(3)


# user_rating = pd.merge(rating_count, book_rating, left_on='Book-Title', right_on='Book-Title', how='left')

# user_rating.head(3)



# user_count = (user_rating.
#      groupby(by = ['User-ID'])['Book-Rating'].
#      count().
#      reset_index().
#      rename(columns = {'Book-Rating': 'RatingCount_user'})
#      [['User-ID', 'RatingCount_user']]
#     )
# user_count.head()


# threshold = 20
# user_count = user_count.query('RatingCount_user >= @threshold')
# user_count.head()



# combined = user_rating.merge(user_count, left_on = 'User-ID', right_on = 'User-ID', how = 'inner')



# combined.head(3)



# combined.shape


# print('Number of unique books: ', combined['Book-Title'].nunique())
# print('Number of unique users: ', combined['User-ID'].nunique())



# # Normalize the ratings.
# scaler = MinMaxScaler()
# combined['Book-Rating'] = combined['Book-Rating'].values.astype(float)
# rating_scaled = pd.DataFrame(scaler.fit_transform(combined['Book-Rating'].values.reshape(-1,1)))
# combined['Book-Rating'] = rating_scaled


# # And build the user-book matrix.

# combined = combined.drop_duplicates(['User-ID', 'Book-Title'])
# user_book_matrix = combined.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
# user_book_matrix.fillna(0, inplace=True)

# users = user_book_matrix.index.tolist()
# books = user_book_matrix.columns.tolist()

# user_book_matrix = user_book_matrix.as_matrix()


# # tf.placeholder is only available in TensorFlow v1, so we have to work around it.
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()


# # We will initialize the TensorFlow placeholder. Then, weights and biases are randomly initialized. The following code is taken from the book Python Machine Learning Cookbook, Second Edition.
# num_input = combined['Book-Title'].nunique()
# num_hidden_1 = 10
# num_hidden_2 = 5

# X = tf.placeholder(tf.float64, [None, num_input])

# weights = {
#     'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
#     'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
#     'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
#     'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
# }

# biases = {
#     'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
#     'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
#     'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
#     'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
# }





# # Now, we can build the encoder and decoder model, as follows:
# def encoder(x):
#     layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
#     layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
#     return layer_2

# def decoder(x):
#     layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
#     layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
#     return layer_2

# # We will construct the model and the predictions
# encoder_op = encoder(X)
# decoder_op = decoder(encoder_op)
# y_pred = decoder_op
# y_true = X



# # define loss function and optimizer, and minimize the squared error, and define the evaluation metrics
# loss = tf.losses.mean_squared_error(y_true, y_pred)
# optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
# eval_x = tf.placeholder(tf.int32, )
# eval_y = tf.placeholder(tf.int32, )
# pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

# # Initialize the variables. Because TensorFlow uses computational graphs for its operations, placeholders and variables must be initialized.
# init = tf.global_variables_initializer()
# local_init = tf.local_variables_initializer()
# pred_data = pd.DataFrame()

# # We can finally start to train our model.
# # We split training data into batches, and we feed the network with them.
# # We train our model with vectors of user ratings, each vector represents a user and each column a book, and entries are ratings that the user gave to books. 
# # After a few trials, I settled on training the model for 100 epochs with a batch size of 35 (the values set below), which kept memory consumption manageable. This means that the entire training set is fed through our neural network 100 times, in batches of roughly 35 users each.

# with tf.Session() as session:
#     epochs = 100
#     batch_size = 35

#     session.run(init)
#     session.run(local_init)

#     num_batches = int(user_book_matrix.shape[0] / batch_size)
#     user_book_matrix = np.array_split(user_book_matrix, num_batches)
    
#     for i in range(epochs):

#         avg_cost = 0
#         for batch in user_book_matrix:
#             _, l = session.run([optimizer, loss], feed_dict={X: batch})
#             avg_cost += l

#         avg_cost /= num_batches

#         print("epoch: {} Loss: {}".format(i + 1, avg_cost))

#     user_book_matrix = np.concatenate(user_book_matrix, axis=0)

#     preds = session.run(decoder_op, feed_dict={X: user_book_matrix})

#     pred_data = pred_data.append(pd.DataFrame(preds))

#     pred_data = pred_data.stack().reset_index(name='Book-Rating')
#     pred_data.columns = ['User-ID', 'Book-Title', 'Book-Rating']
#     pred_data['User-ID'] = pred_data['User-ID'].map(lambda value: users[value])
#     pred_data['Book-Title'] = pred_data['Book-Title'].map(lambda value: books[value])
    
#     keys = ['User-ID', 'Book-Title']
#     index_1 = pred_data.set_index(keys).index
#     index_2 = combined.set_index(keys).index

#     top_ten_ranked = pred_data[~index_1.isin(index_2)]
#     top_ten_ranked = top_ten_ranked.sort_values(['User-ID', 'Book-Rating'], ascending=[True, False])
#     top_ten_ranked = top_ten_ranked.groupby('User-ID').head(10)


# top_ten_ranked.loc[top_ten_ranked['User-ID'] == 278582]
# book_rating.loc[book_rating['User-ID'] == 278582].sort_values(by=['Book-Rating'], ascending=False)

# ChatGPT

In [None]:
# import pandas as pd

# # EJERCICIOS: Datos de los ejercicios
# ejercicios_data = pd.DataFrame({
#     'oid': [1, 2, 3, 4],
#     'nombre': ['Ejercicio 1', 'Ejercicio 2', 'Ejercicio 3', 'Ejercicio 4'],
#     'h1': [1, 0, 1, 0],
#     'h2': [0, 1, 0, 1],
#     'h3': [0, 0, 1, 1],
#     'h4': [1, 1, 0, 0],
#     's1': [1, 0, 1, 0],
#     's2': [0, 1, 0, 1],
#     's3': [1, 1, 0, 0],
#     's4': [0, 0, 1, 1],
#     'skill': [3, 2, 5, 1],
#     'knowledge': [4, 3, 6, 2],
#     'complexity': [7, 5, 11, 3],  # Ejemplo de complejidad combinada de skill + knowledge
# })

# # EJERCICIOS REALIZADOS: Datos de ejercicios completados por los estudiantes
# ejercicios_realizados_data = pd.DataFrame({
#     'rut': [101, 102, 103],
#     'e0': [1, 0, 1],
#     'e1': [0, 1, 0],
#     'e2': [1, 0, 1],
#     'e3': [0, 1, 0]
# })

# # USUARIOS: Datos de los estudiantes
# usuarios_data = pd.DataFrame({
#     'rut': [101, 102, 103],
#     'programa': ['Ingeniería', 'Ciencias', 'Matemáticas'],
#     'hito1': [3, 4, 5],
#     'hito2': [5, 3, 2],
#     'exitosos': [10, 12, 8],
#     'fallidos': [2, 1, 3]
# })


# import tensorflow as tf
# from tensorflow.keras import layers

# # Dimensiones de los embeddings
# embedding_dim = 8

# # Red para los estudiantes
# def build_student_model():
#     # Entradas
#     student_rut_input = layers.Input(shape=(1,), name="student_rut")
#     student_programa_input = layers.Input(shape=(1,), name="student_programa")
#     student_hito_input = layers.Input(shape=(4,), name="student_hitos")
    
#     # Embedding para el ID del estudiante
#     student_embedding = layers.Embedding(input_dim=1000, output_dim=embedding_dim)(student_rut_input)
#     student_flatten = layers.Flatten()(student_embedding)
    
#     # Embedding para el programa académico
#     programa_embedding = layers.Embedding(input_dim=10, output_dim=embedding_dim)(student_programa_input)
#     programa_flatten = layers.Flatten()(programa_embedding)
    
#     # Concatenar todas las entradas
#     student_concat = layers.Concatenate()([student_flatten, programa_flatten, student_hito_input])
    
#     # Red completamente conectada
#     student_dense = layers.Dense(64, activation='relu')(student_concat)
#     student_dense = layers.Dense(embedding_dim)(student_dense)
    
#     return tf.keras.Model([student_rut_input, student_programa_input, student_hito_input], student_dense)

# student_model = build_student_model()
# student_model.summary()


# # Red para los ejercicios
# def build_exercise_model():
#     # Entradas
#     exercise_id_input = layers.Input(shape=(1,), name="exercise_id")
#     exercise_hito_input = layers.Input(shape=(4,), name="exercise_hito")
#     exercise_skill_input = layers.Input(shape=(4,), name="exercise_skill")
    
#     # Embedding para el ID del ejercicio
#     exercise_embedding = layers.Embedding(input_dim=1000, output_dim=embedding_dim)(exercise_id_input)
#     exercise_flatten = layers.Flatten()(exercise_embedding)
    
#     # Concatenar todas las entradas
#     exercise_concat = layers.Concatenate()([exercise_flatten, exercise_hito_input, exercise_skill_input])
    
#     # Red completamente conectada
#     exercise_dense = layers.Dense(64, activation='relu')(exercise_concat)
#     exercise_dense = layers.Dense(embedding_dim)(exercise_dense)
    
#     return tf.keras.Model([exercise_id_input, exercise_hito_input, exercise_skill_input], exercise_dense)

# exercise_model = build_exercise_model()
# exercise_model.summary()



# # Cálculo de similitud entre estudiantes y ejercicios
# student_embeddings = student_model([student_rut_input, student_programa_input, student_hito_input])
# exercise_embeddings = exercise_model([exercise_id_input, exercise_hito_input, exercise_skill_input])

# # Producto punto
# dot_product = tf.reduce_sum(tf.multiply(student_embeddings, exercise_embeddings), axis=1)

# # Función de similitud
# similarity = layers.Activation('sigmoid')(dot_product)

# # Modelo final para entrenar
# recommendation_model = tf.keras.Model(
#     inputs=[student_rut_input, student_programa_input, student_hito_input, 
#             exercise_id_input, exercise_hito_input, exercise_skill_input],
#     outputs=similarity
# )

# recommendation_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# recommendation_model.summary()



# # Simulación de etiquetas de ejercicios completados (0 o 1)
# y_train = ejercicios_realizados_data.values[:, 1:]

# # Entrenamiento del modelo
# recommendation_model.fit(
#     [students_rut_train, students_programa_train, students_hitos_train,
#      exercises_id_train, exercises_hito_train, exercises_skill_train],
#     y_train,
#     epochs=10,
#     batch_size=32
# )


# # Predicción
# predictions = recommendation_model.predict([student_input, exercise_input])
# top_k_recommendations = tf.argsort(predictions, direction='DESCENDING')[:5]



# import numpy as np
# import pandas as pd

# # EJERCICIOS REALIZADOS: Datos de ejercicios completados por los estudiantes
# ejercicios_realizados_data = pd.DataFrame({
#     'rut': [101, 102, 103],
#     'e0': [1, 0, 1],  # 1 si el ejercicio fue realizado, 0 si no
#     'e1': [0, 1, 0],
#     'e2': [1, 0, 1],
#     'e3': [0, 1, 0]
# })

# # Convertimos los datos en una matriz numpy
# interactions_matrix = ejercicios_realizados_data.drop(columns=['rut']).values
# n_students, n_exercises = interactions_matrix.shape



# import tensorflow as tf
# from tensorflow.keras import layers, Model

# # Definir el número de factores latentes (dimensión de los embeddings)
# latent_dim = 8

# # Definir entradas del modelo
# student_input = layers.Input(shape=(1,), name='student_input')
# exercise_input = layers.Input(shape=(1,), name='exercise_input')

# # Crear embeddings para estudiantes y ejercicios
# student_embedding = layers.Embedding(input_dim=n_students, output_dim=latent_dim, name='student_embedding')(student_input)
# exercise_embedding = layers.Embedding(input_dim=n_exercises, output_dim=latent_dim, name='exercise_embedding')(exercise_input)

# # Aplanar las representaciones (embeddings)
# student_vec = layers.Flatten()(student_embedding)
# exercise_vec = layers.Flatten()(exercise_embedding)

# # Producto punto entre los embeddings de estudiantes y ejercicios
# dot_product = layers.Dot(axes=1)([student_vec, exercise_vec])

# # Definir el modelo completo
# model = Model(inputs=[student_input, exercise_input], outputs=dot_product)

# # Compilar el modelo
# model.compile(optimizer='adam', loss='mean_squared_error')
# model.summary()


# # Crear los datos de entrada para el modelo
# students, exercises = np.where(~np.isnan(interactions_matrix))  # IDs de estudiantes y ejercicios
# labels = interactions_matrix[students, exercises]  # 1 si completado, 0 si no completado

# # Entrenar el modelo
# model.fit([students, exercises], labels, epochs=10, batch_size=16)



# # Ejemplo: Predecir los puntajes para un estudiante
# student_id = 0  # Estudiante con ID 0
# exercises_not_done = np.where(interactions_matrix[student_id] == 0)[0]  # Ejercicios no realizados por el estudiante

# # Predecir el puntaje para cada ejercicio no realizado
# predicted_scores = model.predict([np.array([student_id] * len(exercises_not_done)), exercises_not_done])

# # Ordenar las predicciones para recomendar los mejores ejercicios
# top_k_recommendations = exercises_not_done[np.argsort(predicted_scores, axis=0)[::-1][:5]]
# print(f"Top 5 ejercicios recomendados para el estudiante {student_id}: {top_k_recommendations}")



# # Ejemplo: Predecir los puntajes para un estudiante
# student_id = 0  # Estudiante con ID 0
# exercises_not_done = np.where(interactions_matrix[student_id] == 0)[0]  # Ejercicios no realizados por el estudiante

# # Predecir el puntaje para cada ejercicio no realizado
# predicted_scores = model.predict([np.array([student_id] * len(exercises_not_done)), exercises_not_done])

# # Ordenar las predicciones para recomendar los mejores ejercicios
# top_k_recommendations = exercises_not_done[np.argsort(predicted_scores, axis=0)[::-1][:5]]
# print(f"Top 5 ejercicios recomendados para el estudiante {student_id}: {top_k_recommendations}")




# # Agregar características adicionales al modelo
# exercise_hito_input = layers.Input(shape=(4,), name='exercise_hito_input')  # hito binario de los ejercicios
# exercise_skill_input = layers.Input(shape=(4,), name='exercise_skill_input')  # skill binario de los ejercicios

# # Concatenar las características adicionales con los embeddings
# exercise_concat = layers.Concatenate()([exercise_vec, exercise_hito_input, exercise_skill_input])

# # Modificar el modelo para incluir las nuevas entradas
# new_dot_product = layers.Dense(1)(exercise_concat)
# new_model = Model(inputs=[student_input, exercise_input, exercise_hito_input, exercise_skill_input], outputs=new_dot_product)

# # Compilar y entrenar el modelo
# new_model.compile(optimizer='adam', loss='mean_squared_error')




# import pandas as pd
# import numpy as np

# # Datos de la prueba diagnóstica
# diagnostico_data = pd.DataFrame({
#     'rut': [101, 102, 103],
#     'score': [78, 65, 89],  # Puntaje total
#     'score_a': [23, 15, 28],  # Puntaje abstracción
#     'core_p': [20, 18, 22],  # Puntaje reconocimiento de patrones
#     'score_d': [19, 14, 25],  # Puntaje descomposición
#     'score_s': [16, 18, 14]   # Puntaje algoritmos
# })

# # Datos de los ejercicios con su complejidad
# ejercicios_data = pd.DataFrame({
#     'oid': [0, 1, 2, 3],
#     'complexity': [60, 80, 50, 90],  # Complejidad de los ejercicios
#     'hito': [1, 2, 1, 3],  # Hito asociado al ejercicio
#     'skill': [2, 3, 2, 4],  # Nivel de habilidad requerido
#     'knowledge': [3, 4, 3, 5]  # Nivel de conocimiento requerido
# })


# from sklearn.metrics.pairwise import cosine_similarity

# # Extract the diagnostic-test scores and the exercise skill/knowledge levels as raw arrays (no normalization is applied here)
# diagnostico_scores = diagnostico_data[['score_a', 'core_p', 'score_d', 'score_s']].values
# ejercicio_complexity = ejercicios_data[['skill', 'knowledge']].values

# # Calcular la similaridad entre estudiantes y ejercicios basándose en los puntajes
# similarity_matrix = cosine_similarity(diagnostico_scores, ejercicio_complexity)

# # Crear recomendaciones iniciales para cada estudiante
# def recomendar_ejercicios(similarity_matrix, top_k=3):
#     for student_idx, similarities in enumerate(similarity_matrix):
#         recommended_exercises = np.argsort(similarities)[-top_k:][::-1]
#         print(f"Recomendaciones para estudiante {diagnostico_data['rut'][student_idx]}: {recommended_exercises}")

# recomendar_ejercicios(similarity_matrix)



# import tensorflow as tf
# from tensorflow.keras import layers, Model

# # Input para el estudiante (puntajes diagnósticos)
# diagnostic_input = layers.Input(shape=(4,), name='diagnostic_input')

# # Input para el ejercicio (complejidad del ejercicio: skill + knowledge)
# exercise_input = layers.Input(shape=(2,), name='exercise_input')

# # Concatenar las entradas
# combined_input = layers.Concatenate()([diagnostic_input, exercise_input])

# # Capa oculta
# hidden = layers.Dense(16, activation='relu')(combined_input)
# hidden = layers.Dense(8, activation='relu')(hidden)

# # Capa de salida: Probabilidad de completar exitosamente el ejercicio
# output = layers.Dense(1, activation='sigmoid')(hidden)

# # Definir el modelo
# model = Model(inputs=[diagnostic_input, exercise_input], outputs=output)

# # Compilar el modelo
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.summary()

# Building Recommender System with Surprise

In [None]:
# import pandas as pd
# from surprise import Reader
# from surprise import Dataset
# from surprise.model_selection import cross_validate
# from surprise import NormalPredictor
# from surprise import KNNBasic
# from surprise import KNNWithMeans
# from surprise import KNNWithZScore
# from surprise import KNNBaseline
# from surprise import SVD
# from surprise import BaselineOnly
# from surprise import SVDpp
# from surprise import NMF
# from surprise import SlopeOne
# from surprise import CoClustering
# from surprise.accuracy import rmse
# from surprise import accuracy
# from surprise.model_selection import train_test_split




# user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# user.columns = ['userID', 'Location', 'Age']
# rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
# rating.columns = ['userID', 'ISBN', 'bookRating']




# user.head()
# rating.head()


# df = pd.merge(user, rating, on='userID', how='inner')
# df.drop(['Location', 'Age'], axis=1, inplace=True)


# df.head()
# df.shape
# df.info()


# print('Dataset shape: {}'.format(df.shape))
# print('-Dataset examples-')
# print(df.iloc[::200000, :])



# from plotly.offline import init_notebook_mode, plot, iplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)

# data = df['bookRating'].value_counts().sort_index(ascending=False)
# trace = go.Bar(x = data.index,
#                text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
#                textposition = 'auto',
#                textfont = dict(color = '#000000'),
#                y = data.values,
#                )
# # Create layout
# layout = dict(title = 'Distribution Of {} book-ratings'.format(df.shape[0]),
#               xaxis = dict(title = 'Rating'),
#               yaxis = dict(title = 'Count'))
# # Create plot
# fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)



# # Number of ratings per book
# data = df.groupby('ISBN')['bookRating'].count().clip(upper=50)

# # Create trace
# trace = go.Histogram(x = data.values,
#                      name = 'Ratings',
#                      xbins = dict(start = 0,
#                                   end = 50,
#                                   size = 2))
# # Create layout
# layout = go.Layout(title = 'Distribution Of Number of Ratings Per Book (Clipped at 50)',
#                    xaxis = dict(title = 'Number of Ratings Per Book'),
#                    yaxis = dict(title = 'Count'),
#                    bargap = 0.2)

# # Create plot
# fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)


# df.groupby('ISBN')['bookRating'].count().reset_index().sort_values('bookRating', ascending=False)[:10]



# # Number of ratings per user
# data = df.groupby('userID')['bookRating'].count().clip(upper=50)

# # Create trace
# trace = go.Histogram(x = data.values,
#                      name = 'Ratings',
#                      xbins = dict(start = 0,
#                                   end = 50,
#                                   size = 2))
# # Create layout
# layout = go.Layout(title = 'Distribution Of Number of Ratings Per User (Clipped at 50)',
#                    xaxis = dict(title = 'Ratings Per User'),
#                    yaxis = dict(title = 'Count'),
#                    bargap = 0.2)

# # Create plot
# fig = go.Figure(data=[trace], layout=layout)
# iplot(fig)



# df.groupby('userID')['bookRating'].count().reset_index().sort_values('bookRating', ascending=False)[:10]


# min_book_ratings = 50
# filter_books = df['ISBN'].value_counts() > min_book_ratings
# filter_books = filter_books[filter_books].index.tolist()

# min_user_ratings = 50
# filter_users = df['userID'].value_counts() > min_user_ratings
# filter_users = filter_users[filter_users].index.tolist()

# df_new = df[(df['ISBN'].isin(filter_books)) & (df['userID'].isin(filter_users))]
# print('The original data frame shape:\t{}'.format(df.shape))
# print('The new data frame shape:\t{}'.format(df_new.shape))



# reader = Reader(rating_scale=(0, 9))
# data = Dataset.load_from_df(df_new[['userID', 'ISBN', 'bookRating']], reader)


# benchmark = []
# # Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
#     # Perform cross validation
#     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)



# surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
# surprise_results



# print('Using ALS')
# bsl_options = {'method': 'als',
#                'n_epochs': 5,
#                'reg_u': 12,
#                'reg_i': 5
#                }
# algo = BaselineOnly(bsl_options=bsl_options)
# cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)




# trainset, testset = train_test_split(data, test_size=0.25)
# algo = BaselineOnly(bsl_options=bsl_options)
# predictions = algo.fit(trainset).test(testset)
# accuracy.rmse(predictions)




# trainset = algo.trainset
# print(algo.__class__.__name__)


# def get_Iu(uid):
#     """ return the number of items rated by given user
#     args: 
#       uid: the id of the user
#     returns: 
#       the number of items rated by the user
#     """
#     try:
#         return len(trainset.ur[trainset.to_inner_uid(uid)])
#     except ValueError: # user was not part of the trainset
#         return 0
    
# def get_Ui(iid):
#     """ return number of users that have rated given item
#     args:
#       iid: the raw id of the item
#     returns:
#       the number of users that have rated the item.
#     """
#     try: 
#         return len(trainset.ir[trainset.to_inner_iid(iid)])
#     except ValueError:
#         return 0
    
# df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
# df['Iu'] = df.uid.apply(get_Iu)
# df['Ui'] = df.iid.apply(get_Ui)
# df['err'] = abs(df.est - df.rui)



# df.head()
# best_predictions = df.sort_values(by='err')[:10]
# worst_predictions = df.sort_values(by='err')[-10:]
# best_predictions
# worst_predictions

# df_new.loc[df_new['ISBN'] == '055358264X']['bookRating'].describe()



# ONLINE RETAIL IMPLICIT

In [None]:
# import sys
# import pandas as pd
# import numpy as np
# import scipy.sparse as sparse
# from scipy.sparse.linalg import spsolve
# import random
# from sklearn import metrics

# from sklearn.preprocessing import MinMaxScaler

# import implicit


# retail_df = pd.read_excel('data/Online Retail.xlsx')
# retail_df.head()
# retail_df.info()


# retail_df = retail_df[retail_df['CustomerID'].notna()]
# retail_df.info()


# grouped_df = retail_df[['CustomerID', 'StockCode', 'Description', 'Quantity']].groupby(['CustomerID', 'StockCode', 'Description']).sum().reset_index()
# grouped_df.loc[grouped_df['Quantity'] == 0, ['Quantity']] = 1
# grouped_df = grouped_df.loc[grouped_df['Quantity'] > 0]


# grouped_df.head()
# grouped_df.Quantity.describe()


# import plotly.express as px

# fig = px.histogram(grouped_df, x='Quantity', title='Distribution of the purchase quantity', nbins=500)
# fig.show();






# print(f'Number of unique customers: {grouped_df.CustomerID.nunique()}')
# print(f'Number of unique items: {grouped_df.StockCode.nunique()}')

# print(f'Average purchase quantity per interaction: {int(grouped_df.Quantity.mean())}')
# print(f'Minimum purchase quantity per interaction: {grouped_df.Quantity.min()}')
# print(f'Maximum purchase quantity per interaction: {grouped_df.Quantity.max()}')


# unique_customers = grouped_df.CustomerID.unique()
# customer_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))

# unique_items = grouped_df.StockCode.unique()
# item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

# grouped_df['customer_id'] = grouped_df.CustomerID.apply(lambda i: customer_ids[i])
# grouped_df['item_id'] = grouped_df.StockCode.apply(lambda i: item_ids[i])

# grouped_df.head()


# sparse_item_customer = sparse.csr_matrix((grouped_df['Quantity'].astype(float), (grouped_df['item_id'], grouped_df['customer_id'])))
# sparse_customer_item = sparse.csr_matrix((grouped_df['Quantity'].astype(float), (grouped_df['customer_id'], grouped_df['item_id'])))






# sparse_item_customer
# sparse_customer_item
# model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

# alpha = 15
# data = (sparse_item_customer * alpha).astype('double')

# model.fit(data)
# grouped_df.loc[grouped_df['item_id'] == 1319].head()

# item_id = 1319
# n_similar = 10

# item_vecs = model.item_factors
# customer_vecs = model.user_factors

# item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

# scores = item_vecs.dot(item_vecs[item_id]) / item_norms
# top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
# similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])


# for item in similar:
#     idx, score = item
#     print(grouped_df.Description.loc[grouped_df.item_id == idx].iloc[0])



# def recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs, num_items=10):
    
#     customer_interactions = sparse_customer_item[customer_id,:].toarray()
#     customer_interactions = customer_interactions.reshape(-1) + 1
#     customer_interactions[customer_interactions > 1] = 0
    
#     rec_vector = customer_vecs[customer_id,:].dot(item_vecs.T).toarray()
    
#     min_max = MinMaxScaler()
#     rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
#     recommend_vector = customer_interactions * rec_vector_scaled

#     item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    
#     descriptions = []
#     scores = []

#     for idx in item_idx:
#         descriptions.append(grouped_df.Description.loc[grouped_df.item_id == idx].iloc[0])
#         scores.append(recommend_vector[idx])

#     recommendations = pd.DataFrame({'description': descriptions, 'score': scores})

#     return recommendations


# customer_vecs = sparse.csr_matrix(model.user_factors)
# item_vecs = sparse.csr_matrix(model.item_factors)

# # Create recommendations for customer with id 2
# customer_id = 2

# recommendations = recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs)

# print(recommendations)

# grouped_df.loc[grouped_df['customer_id'] == 2].sort_values('Quantity', ascending=False)[['customer_id', 'Description', 'Quantity']].head(20)







# import random

# def make_train(ratings, pct_test = 0.2):
#     test_set = ratings.copy() # Make a copy of the original set to be the test set. 
#     test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
#     training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
#     nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
#     nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip the item and user indices together into a list of (item, user) pairs

    
#     random.seed(0) # Set the random seed to zero for reproducibility
    
#     num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed up to the next integer
#     samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

#     item_inds = [index[0] for index in samples] # Get the item row indices

#     customer_inds = [index[1] for index in samples] # Get the user column indices

    
#     training_set[item_inds, customer_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
#     training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
#     return training_set, test_set, list(set(customer_inds))



# item_train, item_test, item_customers_altered = make_train(sparse_item_customer, pct_test = 0.2)


# def auc_score(predictions, test):
#     fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
#     return metrics.auc(fpr, tpr)


# def calc_mean_auc(training_set, altered_customers, predictions, test_set):
#     store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
#     popularity_auc = [] # To store popular AUC scores
#     pop_items = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item interactions to find most popular
#     item_vecs = predictions[1]
#     for customer in altered_customers: # Iterate through each user that had an item altered
#         training_column = training_set[:,customer].toarray().reshape(-1) # Get the training set column
#         zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
#         # Get the predicted values based on our user/item vectors
#         customer_vec = predictions[0][customer,:]
#         pred = customer_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        
#         # Get only the items that were originally zero
#         # Select all ratings from the MF prediction for this user that originally had no interaction
#         actual = test_set[:,customer].toarray()[zero_inds,0].reshape(-1)
        
#         # Select the binarized yes/no interaction pairs from the original full data
#         # that align with the same pairs in training 
#         pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        
#         store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
#         popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and store
#     # End users iteration
    
#     return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))


# calc_mean_auc(item_train, item_customers_altered,
#               [customer_vecs, item_vecs.T], item_test)