In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

# Reading the dataset

In [2]:
users = pd.read_csv('Users.csv', sep=';')
users

Unnamed: 0,User-ID,Age
0,1,
1,2,18
2,3,
3,4,17
4,5,
...,...,...
278854,278854,
278855,278855,50
278856,278856,
278857,278857,


In [3]:
ratings = pd.read_csv('Ratings.csv', sep=';')
ratings 

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


# Data Cleaning

In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User-ID  278859 non-null  object
 1   Age      168627 non-null  object
dtypes: object(2)
memory usage: 4.3+ MB


In [5]:
# cleaning Age column
users['Age'] = pd.to_numeric(users['Age'], errors='coerce')
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   User-ID  278859 non-null  object 
 1   Age      167151 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.3+ MB


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   User-ID  1149780 non-null  int64 
 1   ISBN     1149780 non-null  object
 2   Rating   1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [7]:
# Dropping duplicates
users.drop_duplicates(inplace = True)
ratings.drop_duplicates(inplace = True)

In [8]:
# mapping ISBN numbers from string to int data
isbn_mapping = {}
for idx, isbn in enumerate(list(ratings.ISBN.unique())):
    isbn_mapping[isbn] = idx+1
ratings['ISBN'] = ratings['ISBN'].map(isbn_mapping)

# Splitting the dataset

In [9]:
users_train = users[~users.Age.isna()]
users_test  = users[users.Age.isna()]

In [10]:
ratings_train = ratings[ratings['User-ID'].isin(users_train['User-ID'])]
ratings_test = ratings[ratings['User-ID'].isin(users_test['User-ID'])]

# Transforming the data into libsvm format

In [11]:
def createString(df):
    s = ''
    df = df.sort_values("ISBN")
    for index, row in df.iterrows():
        s += str(row.ISBN) + ':' + str(row.Rating) + ' '
    return s.strip()

In [12]:
def createlibsvmFile(df, file_name, target_col = "Age"):
    with open(f"{file_name}.libsvm", "w") as f:
        for i, row in df.iterrows():
            f.write(f"{row[target_col]} {row['Features']}\n")
    

In [13]:
# Preparing train data 
train = ratings_train.groupby(['User-ID'])[["ISBN","Rating"]].apply(lambda x: createString(x)).reset_index()
train = pd.merge(train,  users_train, how='inner', on="User-ID")
train.rename(columns={0: "Features"}, inplace=True)
print(train.head(2))
print("Null values count:\n", train.isna().sum())
createlibsvmFile(train, "train")

  User-ID       Features   Age
0       2         8968:0  18.0
1      10  8989:0 8990:6  26.0
Null values count:
 User-ID     0
Features    0
Age         0
dtype: int64


In [14]:
# Preparing test data 
test = ratings_test.groupby(['User-ID'])[["ISBN","Rating"]].apply(lambda x: createString(x)).reset_index()
test['Age'] = [0] * test.shape[0] # Sets the target variable to a default value (0) for LibSVM compatibility
test.rename(columns={0: "Features"}, inplace=True)
print(test.head(2))
print("Null values count:\n", test.isna().sum())
createlibsvmFile(test, "test")

   User-ID                                           Features  Age
0        7                                             8969:0    0
1        8  8970:5 8971:0 8972:0 8973:0 8974:0 8975:0 8976...    0
Null values count:
 User-ID     0
Features    0
Age         0
dtype: int64
