In [1]:
import pandas as pd
import numpy as np
import os
import gc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler

from evaluation import evaluate_solution
from scipy.sparse import csr_matrix, save_npz


# LightFM
from lightfm import LightFM
from lightfm.data import Dataset as lfmDataset 
from lightfm.evaluation import precision_at_k

from sklearn.preprocessing import OneHotEncoder



## Load the Data

In [2]:
ratings = pd.read_csv("data/BookRatings.csv")
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,99,316748641,7
1,99,446677450,10
2,99,553347594,9
3,99,451166892,3
4,99,671621009,10


In [9]:
items_info = pd.read_csv("data/BooksMetaInfo.csv")
items_info.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,authors,description,pageCount,categories
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,"['Mark P. O. Morford', 'Robert J. Lenardon']",Provides an introduction to classical myths pl...,808.0,['Social Science']
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,['Richard Bruce Wright'],"In a small town in Canada, Clara Callan reluct...",414.0,['Actresses']
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,"[""Carlo D'Este""]","Here, for the first time in paperback, is an o...",555.0,['1940-1949']
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,['Gina Bari Kolata'],"Describes the great flu epidemic of 1918, an o...",330.0,['Medical']
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,['E. J. W. Barber'],A look at the incredibly well-preserved ancien...,240.0,['Design']


In [3]:
users_info = pd.read_csv("data/BooksUsers.csv")
users_info.head()

Unnamed: 0,User-ID,Location,Age
0,2,"stockton, california, usa",18.0
1,8,"timmins, ontario, canada",
2,9,"germantown, tennessee, usa",
3,10,"albacete, wisconsin, spain",26.0
4,12,"fort bragg, california, usa",


In [4]:
test_users = pd.read_csv("data/test_users.csv")
test_users

Unnamed: 0,User-ID
0,114
1,507
2,850
3,3346
4,4092
...,...
584,276681
585,276847
586,277901
587,278137


## Process and clean data

In [5]:
def check_for_nans(df):
    return(np.sum(df.isnull()))

In [6]:
def describe_ratings_data(df, user_col, item_col, rating_col):
    # How many ratings do we have in total?
    print(f"We have {len(df):,} ratings in total.")
    # How many items were rated?
    print(f"We have {df[item_col].unique().size:,} items rated.")
    # How many users rated at least one book?
    print(f"We have {df[user_col].unique().size:,} users that rated at least one item.")

In [7]:
check_for_nans(ratings)

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [10]:
check_for_nans(items_info)

ISBN                       0
Book-Title                 0
Book-Author                0
Year-Of-Publication        0
Publisher                  0
Image-URL-S                0
Image-URL-M                0
Image-URL-L                2
authors                    0
description            14815
pageCount              15197
categories             16914
dtype: int64

In [11]:
check_for_nans(users_info)

User-ID         0
Location        0
Age         24605
dtype: int64

In [12]:
user_col = "User-ID"
item_col = "ISBN"
rating_col = "Book-Rating"

describe_ratings_data(ratings, user_col, item_col, rating_col)

We have 109,209 ratings in total.
We have 47,768 items rated.
We have 5,719 users that rated at least one item.


In [13]:
ratings.describe()

Unnamed: 0,User-ID,Book-Rating
count,109209.0,109209.0
mean,132987.052532,7.734134
std,80051.009272,1.835269
min,99.0,1.0
25%,64946.0,7.0
50%,128835.0,8.0
75%,201768.0,9.0
max,278851.0,10.0


In [14]:
k_top=10

## Identify and separate the Users
- Which users are present in the training data?
- Make sure that you identify which test users are present in the training data and which are not.
- Can you use personalized methologies for all users?

In [15]:
print(f"{np.sum([user in ratings[user_col].unique() for user in test_users[user_col]])} users are present in training data")

489 users are present in training data


## Split train and validation data

In [16]:
# Crate validation set
data_train, data_val = train_test_split(ratings, test_size=0.4, random_state=42)

In [17]:
describe_ratings_data(data_train, user_col, item_col, rating_col)

We have 65,525 ratings in total.
We have 33,509 items rated.
We have 5,687 users that rated at least one item.


In [18]:
describe_ratings_data(data_val, user_col, item_col, rating_col)

We have 43,684 ratings in total.
We have 24,645 items rated.
We have 5,535 users that rated at least one item.


In [19]:
#Select reviews from users with at least k positive ratings.
def select_frequent_reviewers(df: pd.DataFrame, min_nr_reviews: int, min_rating: int, user_col: str, item_col: str, rating_col: str):
    """
    Select reviews from users with at least min_nr_reviews reviews with rating larger than min_rating.
    """
    
    # Select only positive reviews
    df_positive = df.copy().loc[df[rating_col] >= min_rating]

    # Select users with more than min_nr_reviews positive reviews
    user_review_count = df_positive.groupby(by=[user_col])[item_col].count()
    test_users_list = list(user_review_count[user_review_count > min_nr_reviews].index)

    # Select ratings from users specified above
    df_restrict = df_positive.copy().loc[df_positive[user_col].isin(test_users_list)]
    
    return df_restrict



In [20]:
data_val_final = select_frequent_reviewers(data_val, min_nr_reviews=k_top, min_rating=6, user_col="User-ID", item_col="ISBN", rating_col="Book-Rating")
data_val_final.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
77379,189334,891457461,10
48418,112001,440132789,10
39923,97324,679774386,8
31698,76499,321043707,10
82275,203240,590404989,10


In [21]:
val_users = list(data_val_final[user_col].unique())
train_users = list(data_train[user_col].unique())

val_users_in_train = np.sum([user in train_users for user in val_users])
print(f"Validation set has {len(val_users)} users, {val_users_in_train} of those users are also in the train set")

Validation set has 781 users, 781 of those users are also in the train set


In [21]:
#Create the validation recommendations
# nr of recommendations per user

def top_items_per_user(df: pd.DataFrame, user_col: str, item_col:str, rating_col:str, k_top: int = 10):
    df_ = df.copy()
    df_ = df_.set_index(item_col)
    df_users_kbest = df_.groupby(by=[user_col])[rating_col].nlargest(k_top).reset_index()
    df_users_kbest['rank'] = df_users_kbest.groupby(by=[user_col])[rating_col].rank(method="first")
    #df_users_kbest['rank'] = df_users_kbest['rank'].astype(int) - 1
    df_recommendations = df_users_kbest.pivot(index=user_col, columns="rank", values=item_col)
    df_recommendations = df_recommendations.reset_index(drop=False)
    df_recommendations.columns = np.arange(len(df_recommendations.columns))
    return df_recommendations



In [22]:
val_recommendations = top_items_per_user(data_val_final, user_col, item_col, rating_col, k_top=k_top)
val_recommendations.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,638,679442790,0316776963,316769487,316969443,316666343,0743406176,679410430,446531332,0446527793,446523569
1,1424,142001430,031215125X,451526988,743431014,618446877,0767902521,99771519,61015725,033390785X,140255087
2,1733,553107232,0373764146,553562746,373226934,373218400,051513287X,440168724,373240228,059035342X,345447840
3,1903,316441791,0151104212,345409469,394743644,380810336,0812550706,688164080,345333152,0679430946,1573922110
4,2030,62513982,0553255959,1561707236,62513346,1561707244,1563054671,809237601,385484100,0385315287,62508482


In [23]:
def save_recommendations(df: pd.DataFrame, file_name: str):
    """
    Save recommendation dataframe as .csv.
    """
    
    file_path = os.path.join("data", f"{file_name}.csv")
    df.to_csv(file_path, index=False, header=False)
    print(f"Recommendations were saved on file {file_name}.csv.")
    
save_recommendations(val_recommendations, "validation_recommendations")

Recommendations were saved on file validation_recommendations.csv.


## Non-Personalized Recommendations
- Create non-personalized recommendations as a baseline.
- Apply the recommendations to the test users.
- Store results in the required format for submission.
- Submit baseline recommendations.

In [24]:
def non_pers_reco_order(data: pd.DataFrame,
                        item_col: str,
                        rating_col:str,
                        k_top: int = 10,
                        aggregation: list() = ["mean", "count"]):
    """
    Create an ordered list of non-personalized recommendations, from best rated to worst rated.
    """
    non_pers_ratings = data.groupby(by=[item_col])[[rating_col]].agg(aggregation)
    non_pers_ratings.columns = non_pers_ratings.columns.get_level_values(1)
    
    #The resulting column names might be different than the specified with the aggregation parameter.
    try:
        non_pers_ratings = non_pers_ratings.sort_values(by=aggregation, ascending=False).head(k_top)
    except KeyError as e:
        print(e)
        print("Check if aggregation argument results in valid column names.")
        print(f"aggregation = {aggregation}\nrating columns = {non_pers_ratings.columns}")
        raise e
        
    non_pers_reco_list = non_pers_ratings.index.to_list()
    return non_pers_reco_list


In [25]:
non_pers_recommendations = non_pers_reco_order(data_train, item_col, rating_col, k_top=k_top)
print(non_pers_recommendations)

['0060256656', '0836217241', '0394800796', '014034019X', '0141301066', '0375727191', '0671729454', '0002251760', '0044409494', '0060256664']


In [26]:
def non_pers_reco_output(user_id_list:list, non_pers_reco_list:list):
    """
    Creates a non-personalized recommendation dataframe for specified users.
    """
    nr_test_users = len(user_id_list)
    user_id_df = pd.DataFrame(user_id_list, columns = ["user_id"], dtype = int)
    non_pers_reco_repeated =  pd.DataFrame(pd.DataFrame(non_pers_reco_list).T.values.repeat(nr_test_users, axis=0))
    non_pers_reco_output = pd.concat([user_id_df, non_pers_reco_repeated], axis=1)
    
    # Reset columns numbering. Useful later.
    #non_pers_reco_output.columns = np.arange(len(non_pers_reco_output.columns))
    
    return non_pers_reco_output

In [27]:
non_pers_reco_solution_val = non_pers_reco_output(val_users, non_pers_recommendations)
save_recommendations(non_pers_reco_solution_val, "non_personalized_recommendations_VAL")

Recommendations were saved on file non_personalized_recommendations_VAL.csv.


## Evaluate results
- Calculate the evaluation metric on the validation users.
- Compare it later with the personalized recommendations

In [22]:
## Second argument is the recommendation file to compare
evaluate_solution('non_personalized_recommendations_VAL', 'validation_recommendations')

0.0035566936975387677

## Create the Ratings Matrix

In [29]:
#LightFM allows to create the rating matrix (aka interaction matrix) and use that matrix to generate recommendations for our users.
#We start by using lightFM Dataset() function to create the user and item mapping that defines the vectorial space of the rating matrix.

# Notice the alias lfmDataset() instead of the standard Dataset() used to distiguish between lightFM Dataset() and another Dataset() that we use later.
lfmdataset = lfmDataset()
lfmdataset.fit(data_train[user_col], data_train[item_col])


In [30]:
(interactions, weights) = lfmdataset.build_interactions((row for row in data_train.values))

print(repr(interactions))

<5687x33509 sparse matrix of type '<class 'numpy.int32'>'
	with 65525 stored elements in COOrdinate format>


## Personalized Recommendations: Collaborative Filtering
- Compute the user similarities matrix.
- Predict ratings.
- Select the best recommendations.
- Submit recommendations.

In [31]:
#Fit LightFM model
lfmodel = LightFM(loss='warp')
lfmodel.fit(interactions)

<lightfm.lightfm.LightFM at 0x7fd1311bc290>

In [23]:
def lightFM_recommendations(dataset,
                            model,
                            user_id_ext_list,
                            non_pers_reco_list,
                            k_top: int = 50,
                            item_features = None):   
    """
    Create output dataframe with recommendations based on dataset, model and list of users.
    
    This function predicts recommendations for users specified in user_id_ext_list that are present in the lightFM dataset.
    New users are recommended the items in the non-personalized list non_per_reco_list.
    
    Parameters:
    -----------
    dataset: lightFM dataset
    
    model: lightFM trained model
    
    user_id_ext_list: list of user external IDs to predict
    
    non_pers_reco: list of non-personalized recommendations ordered from best to worst rated
    
    k_top: number of recommendations to create per user
    
    item_features: lightFM item features
    
    Returns:
    --------
    final_reco_df: dataframe with users' recommendations
    The first column has the users' ID and the remaining columns have the recommendations
    """
    
    assert len(user_id_ext_list) > 0, "User ID list length must be larger than 0."
    
    # Dataset mappings
    user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
    
    # reverse mapping
    item_id_map_reverse = {v: k for k, v in item_id_map.items()}
    user_id_map_reverse = {v: k for k, v in user_id_map.items()}
    
    
    # item internal ids
    item_id_int_list = list(item_id_map.values())
    
    # Split old users (user_id_int_list) from new users (user_id_ext_excluded)
    # Old users are defined in the ratings vectorial space.
    # New users are not defined in the ratings vectorial space.
    # New users receive non-personalized recommendations.
    user_id_int_list = []
    user_id_ext_excluded = []
    
    for user_id_ext in user_id_ext_list:
        try:
            user_id_int_list.append(user_id_map[user_id_ext])
        except:
            user_id_ext_excluded.append(user_id_ext)
    
    # Dataframe to store model recommendations
    model_reco_df = pd.DataFrame()
    
    # Model recommendations
    for user_id in user_id_int_list:
        scores = model.predict(user_id, item_id_int_list, item_features)
        top_items_ids = np.argsort(-scores)
        top_items_ids = [item_id_map_reverse[ids] for ids in top_items_ids]
         
        # Individual row. Two steps are necessary for the first row to call "user_id"
        user_id_df = pd.DataFrame([user_id_map_reverse[user_id]], columns=["user_id"], dtype = int)
        top_items_ids = pd.DataFrame([top_items_ids[:k_top]])
        user_reco_df = pd.concat([user_id_df, top_items_ids], axis=1)
        
        # Concatenating rows
        model_reco_df = pd.concat([model_reco_df, user_reco_df])
        

        
        
    # Non-personalized recommendations
    non_pers_reco_df = non_pers_reco_output(user_id_ext_excluded, non_pers_reco_list)
    
    # Concatenating all recommendations
    if model_reco_df.shape[0] == 0:
        final_reco_df = non_pers_reco_df
    elif non_pers_reco_df.shape[0] == 0:
        final_reco_df = model_reco_df
    else:
        final_reco_df = pd.concat([model_reco_df, non_pers_reco_df])
    
    return final_reco_df

In [33]:
collab_reco_val = lightFM_recommendations(lfmdataset, lfmodel, val_users, non_pers_recommendations, k_top=k_top)

In [34]:
collab_reco_val.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,9
0,189334,316666343,312195516,059035342X,0452282152,0671027360,043935806X,345370775,0142001740,0671021001,044023722X
0,112001,316666343,312195516,059035342X,0971880107,043935806X,0142001740,786868716,0671027360,0452282152,044023722X
0,97324,316666343,312195516,059035342X,0671027360,0142001740,043935806X,345370775,044023722X,0452282152,0971880107
0,76499,316666343,312195516,059035342X,043935806X,0971880107,0345370775,452282152,0142001740,0671027360,0439139597
0,203240,316666343,312195516,059035342X,0345370775,043935806X,0142001740,971880107,0452282152,044023722X,1400034779


In [35]:
save_recommendations(collab_reco_val, "collaborative_recommendations_VAL")

Recommendations were saved on file collaborative_recommendations_VAL.csv.


## Evaluate results (Again)
- Calculate the evaluation metric on the validation users.

In [36]:
evaluate_solution('collaborative_recommendations_VAL', 'validation_recommendations')

0.315089188328625

In [38]:
del (interactions, non_pers_reco_solution_val)
gc.collect()

42

## Content-based Recommendations

- Compute the item similarities matrix.
- Predict ratings.
- Select the best recommendations.
- Submit recommendations.

### Item Features

#### Clean up items metadata

In [24]:
items_info

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,authors,description,pageCount,categories
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,"['Mark P. O. Morford', 'Robert J. Lenardon']",Provides an introduction to classical myths pl...,808.0,['Social Science']
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,['Richard Bruce Wright'],"In a small town in Canada, Clara Callan reluct...",414.0,['Actresses']
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,"[""Carlo D'Este""]","Here, for the first time in paperback, is an o...",555.0,['1940-1949']
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,['Gina Bari Kolata'],"Describes the great flu epidemic of 1918, an o...",330.0,['Medical']
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,['E. J. W. Barber'],A look at the incredibly well-preserved ancien...,240.0,['Design']
...,...,...,...,...,...,...,...,...,...,...,...,...
112336,1582380805,Tropical Rainforests: 230 Species in Full Colo...,"Allen M., Ph.D. Young",2001,Golden Guides from St. Martin's Press,http://images.amazon.com/images/P/1582380805.0...,http://images.amazon.com/images/P/1582380805.0...,http://images.amazon.com/images/P/1582380805.0...,['Allen M. Young'],A richly illustrated guide to the tropical rai...,160.0,['Nature']
112337,1845170423,Cocktail Classics,David Biggs,2004,Connaught,http://images.amazon.com/images/P/1845170423.0...,http://images.amazon.com/images/P/1845170423.0...,http://images.amazon.com/images/P/1845170423.0...,['David Biggs'],,,
112338,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,"['Robin Wright', 'Doyle McManus']",From two of America's most accomplished journa...,260.0,['Political Science']
112339,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,['Paula Danziger'],"On her own for the first time, fourteen-year-o...",150.0,['Adolescence']


In [25]:
check_for_nans(items_info)

ISBN                       0
Book-Title                 0
Book-Author                0
Year-Of-Publication        0
Publisher                  0
Image-URL-S                0
Image-URL-M                0
Image-URL-L                2
authors                    0
description            14815
pageCount              15197
categories             16914
dtype: int64

In [26]:
items_info.dtypes

ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
Image-URL-S             object
Image-URL-M             object
Image-URL-L             object
authors                 object
description             object
pageCount              float64
categories              object
dtype: object

In [27]:
items_info.describe()

Unnamed: 0,pageCount
count,97144.0
mean,280.954645
std,172.415592
min,1.0
25%,180.0
50%,256.0
75%,352.0
max,5000.0


In [28]:
def get_num_of_unique(df):
    print("UNIQUE ITEMS")
    for column in df.columns:
        print(f"{column}: {df[column].unique().shape[0]}")

In [29]:
get_num_of_unique(items_info)

UNIQUE ITEMS
ISBN: 112341
Book-Title: 106426
Book-Author: 55146
Year-Of-Publication: 174
Publisher: 9452
Image-URL-S: 112256
Image-URL-M: 112256
Image-URL-L: 112255
authors: 57778
description: 94945
pageCount: 1341
categories: 5004


In [30]:
def one_hot_encode(df, column, id_col):
    df_ = df.copy()
    df_ = df_.dropna()
    enc = OneHotEncoder(handle_unknown='ignore')
    encoded = enc.fit_transform(df_[column].values.reshape(-1, 1))
    encoded = pd.DataFrame(encoded.toarray(), columns=enc.get_feature_names())
    
    # filter out categories associated with a single item
    encoded = encoded[encoded.columns[encoded.sum()>1]]
    
    encoded = pd.concat([df_[id_col].reset_index(drop=True), encoded], axis=1)

    return encoded

In [31]:
encoded_publisher = one_hot_encode(items_info, "Publisher", "ISBN")
#encoded_categories = one_hot_encode(items_info, "categories", "ISBN")

In [32]:
encoded_data = encoded_publisher
#encoded_data = encoded_categories.merge(encoded_publisher, on="ISBN")

In [48]:
items_info["Year-Of-Publication"].unique()

array([2002, 2001, 1991, 1999, 2000, 1996, 1988, 1994, 2003, 1998, 2004,
       1997, 1993, 1979, 1995, 1992, 1986, 1978, 1983, 1952, 1987, 1981,
       1990, 1961, 0, 1958, 1989, 1984, 1977, 1982, 1985, 1975, 1965,
       1941, 1970, 1962, 1971, 1972, 1980, 1960, 1974, 1976, 1920, 1973,
       1956, 1959, 1953, 1942, 1963, 1964, 1969, 1950, 1966, 1967, 1957,
       1954, 1940, 1937, 1955, 1968, 1946, 1936, 1925, 1947, 1945, 1943,
       1951, 2005, 1939, 1926, 1938, 1932, 1928, 1949, 1923, 1927, 1930,
       2020, 1911, 1902, 1948, 2038, 1929, 1901, '1995', '1986', '1991',
       '1996', '1971', '2003', '2002', '1993', '1997', '1998', '2001',
       '1994', '1999', '1989', '1984', '2004', '1988', '1978', '1982',
       '1987', '2000', '1990', '1981', '1973', '1992', '1985', '1979',
       '1977', '0', '1969', '1976', '1975', '1980', '1983', '1951',
       '1972', '1974', '2005', '1950', '1964', '1908', '1940', '1959',
       '1970', '1966', '1953', '1962', '1963', '1955', '1901', '194

In [49]:
items_info[items_info["Year-Of-Publication"]== 'DK Publishing Inc']

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,authors,description,pageCount,categories
87542,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,,['Michael Teitelbaum'],Provides information on such original characte...,48.0,['Juvenile Fiction']
93024,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,,"['Michael Teitelbaum', 'James Buckley']",Provides information on the artists and writer...,48.0,['Juvenile Nonfiction']


In [33]:
items_info.loc[items_info["Year-Of-Publication"]== 'DK Publishing Inc', "Year-Of-Publication"]=2000

In [34]:
items_info["Year-Of-Publication"] = items_info["Year-Of-Publication"].astype(int)
items_info.describe()

Unnamed: 0,Year-Of-Publication,pageCount
count,112341.0,97144.0
mean,1971.186468,280.954645
std,213.433162,172.415592
min,0.0,1.0
25%,1990.0,180.0
50%,1996.0,256.0
75%,2000.0,352.0
max,2038.0,5000.0


In [35]:
# set impossible years as nan
items_info.loc[items_info["Year-Of-Publication"] == 0, "Year-Of-Publication"] = np.nan
items_info.loc[items_info["Year-Of-Publication"] > 2022, "Year-Of-Publication"] = np.nan


In [36]:
items_info.describe()

Unnamed: 0,Year-Of-Publication,pageCount
count,111038.0,97144.0
mean,1994.262874,280.954645
std,8.004294,172.415592
min,1378.0,1.0
25%,1990.0,180.0
50%,1996.0,256.0
75%,2000.0,352.0
max,2021.0,5000.0


In [37]:
def scale_numeric_data(df, columns, id_col, scaler=StandardScaler()):
    df_ = df.copy()
    id_col_aslist = [id_col]
    id_col_aslist.extend(columns)
    df_ = df_[id_col_aslist]

    df_[columns] = scaler.fit_transform(df_[columns])
    return df_

In [38]:
scaled_numeric_data = scale_numeric_data(items_info, ["Year-Of-Publication", "pageCount"], "ISBN", scaler=StandardScaler())


In [39]:
scaled_numeric_data 

Unnamed: 0,ISBN,Year-Of-Publication,pageCount
0,0195153448,0.966626,3.056847
1,0002005018,0.841693,0.771659
2,0060973129,-0.407642,1.589455
3,0374157065,0.591826,0.284462
4,0393045218,0.591826,-0.237536
...,...,...,...
112336,1582380805,0.841693,-0.701533
112337,1845170423,1.216493,
112338,0449906736,-0.157775,-0.121536
112339,0440400988,-0.782443,-0.759533


In [40]:
items_features_df = scaled_numeric_data.merge(encoded_data, on="ISBN")

In [41]:
items_features_df

Unnamed: 0,ISBN,Year-Of-Publication,pageCount,x0_10-18,x0_1stBooks Library,x0_2.13.61 Publications,x0_21st Century,x0_29th Street Press,x0_A Golden Book,x0_A. Michel,...,x0_Zondervan Publishing Company,x0_Zondervan Publishing House,x0_Zone Books,x0_Zsolnay,x0_Zzdap Publishing,x0_btb,x0_eReads.com,x0_iUniverse,"x0_iUniverse, Inc.",x0_scholastic
0,0195153448,0.966626,3.056847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0002005018,0.841693,0.771659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0060973129,-0.407642,1.589455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0374157065,0.591826,0.284462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0393045218,0.591826,-0.237536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95420,0520242335,1.216493,0.226462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95421,0762412119,0.966626,2.082452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95422,1582380805,0.841693,-0.701533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95423,0449906736,-0.157775,-0.121536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
def prepare_data_for_lightfm(df, id_col):
    df_ = df.copy()
    df_.set_index(id_col, drop=True, inplace=True)
    df_.columns = [str(i) for i in range(len(df_.columns))]
    generator = df_.itertuples(index=True, name=None)
    return generator

In [43]:
item_generator = prepare_data_for_lightfm(items_features_df, "ISBN")

In [44]:
item_generator

<zip at 0x7fe7831f2e60>

In [None]:
items_content_dataset = lfmDataset()
items_content_dataset.fit(data_train[user_col], data_train[item_col], item_features=item_generator)