In [12]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import math

# Reading the dataset

In [3]:
users = pd.read_csv('Users.csv', sep=';')
users

Unnamed: 0,User-ID,Age
0,1,
1,2,18
2,3,
3,4,17
4,5,
...,...,...
278854,278854,
278855,278855,50
278856,278856,
278857,278857,


In [24]:
books = pd.read_csv('Books.csv', sep=';')
books 

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [28]:
ratings = pd.read_csv('Ratings.csv', sep=';')
ratings 

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


# Data Cleaning

In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User-ID  278859 non-null  object
 1   Age      168627 non-null  object
dtypes: object(2)
memory usage: 4.3+ MB


In [6]:
# cleaning Age column
users['Age'] = pd.to_numeric(users['Age'], errors='coerce')
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   User-ID  278859 non-null  object 
 1   Age      167151 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.3+ MB


In [25]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ISBN       271379 non-null  object
 1   Title      271379 non-null  object
 2   Author     271377 non-null  object
 3   Year       271379 non-null  int64 
 4   Publisher  271377 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.4+ MB


In [29]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   User-ID  1149780 non-null  int64 
 1   ISBN     1149780 non-null  object
 2   Rating   1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [30]:
# Dropping duplicates
users.drop_duplicates(inplace = True)
ratings.drop_duplicates(inplace = True)
books.drop_duplicates(inplace = True)

# Splitting the dataset

In [10]:
users_train = users[~users.Age.isna()]
users_test  = users[users.Age.isna()]

In [32]:
ratings_train = ratings[ratings['User-ID'].isin(users_train['User-ID'])]
ratings_test = ratings[ratings['User-ID'].isin(users_test['User-ID'])]

# Reading Libsvm files

In [13]:
X_train, y_train = load_svmlight_file('train.libsvm')
X_test,  y_test = load_svmlight_file('test.libsvm')

# Utility Functions

In [15]:
# Function to calculate RMSE
def rmse(a, b):
    return math.sqrt(mean_squared_error(a, b))

In [17]:
# K Fold object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Linear Regression

In [53]:
pipe_lr = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("lr", LinearRegression())
])

pred_lr = cross_val_predict(pipe_lr, X_train, y_train, cv=kf)
rmse_lr = mean_squared_error(y_train, pred_lr)
print("Linear Regression RMSE:", rmse_lr)

pipe_lr.fit(X_train, y_train)

Linear Regression RMSE: 710.2510889525794


# Logistic Regression

In [52]:
pipe_log = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("log", LogisticRegression(multi_class="ovr", max_iter=1000, n_jobs=-1, verbose=0))
])

pred_log = cross_val_predict(pipe_log, X_train, y_train, cv=kf)
rmse_log = mean_squared_error(y_train, pred_log)
print("Linear Regression RMSE:", rmse_log)

pipe_log.fit(X_train, y_train)

KeyboardInterrupt: 

# Decision Tree

In [50]:
tree = DecisionTreeRegressor(random_state=42)
pred_tree = cross_val_predict(tree, X_train, y_train, cv=kf)

rmse_tree = mean_squared_error(y_train, pred_tree)
print("Decision Tree RMSE:", rmse_tree)

tree.fit(X_train, y_train)

Decision Tree RMSE: 272.93749381326705


# Selecting Best Model

In [None]:
rmse_scores = {
    "Linear": rmse_lr,
    "Logistic": rmse_log,
    "Tree": rmse_tree
}

best_model = min(rmse_scores, key=rmse_scores.get)
print("\nBest Model:", best_model)

# Predicting age of Test Users

In [None]:
if best_model == "Linear":
    preds = pipe_lr.predict(X_test)

elif best_model == "Tree":
    preds = tree.predict(X_test)

else:  # Logistic
    preds = pipe_log.predict(X_test)
    # preds = np.array([
    #     bin_mean_age.get(int(b), np.mean(y_train)) 
    #     for b in test_bins
    # ])

# Preparing CSV Output

In [48]:
def createBookList(df):
    s = ''
    df = df.sort_values("ISBN")
    return ','.join(list(df["Title"]))

In [49]:
# preparing the data
merged_df = pd.merge(ratings_test,  books, how='inner', on="ISBN")
results = merged_df.groupby(['User-ID'])[["ISBN","Title"]].apply(lambda x: createBookList(x)).reset_index()
results.rename(columns={0: "List of Book Titles read by the user"}, inplace=True)
results.head(5)
