In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Extracting Data

In [2]:
def load_data(file_name):
  '''
    Read a CSV file into a pandas DataFrame.

      Parameters:
        file_name (str): Path of the CSV file to read

      Return:
        DataFrame: The parsed contents of the file
  '''
  # Parsing is delegated entirely to pandas with its default options
  return pd.read_csv(file_name)

In [3]:
# Read the fashion products CSV (the path is relative to the working directory)
df = load_data('data/fashion_products.csv')
# Show the first five rows as the cell output
df.head()

Unnamed: 0,User ID,Product ID,Product Name,Brand,Category,Price,Rating,Color,Size
0,19,1,Dress,Adidas,Men's Fashion,40,1.043159,Black,XL
1,97,2,Shoes,H&M,Women's Fashion,82,4.026416,Black,L
2,25,3,Dress,Adidas,Women's Fashion,44,3.337938,Yellow,XL
3,57,4,Shoes,Zara,Men's Fashion,23,1.049523,White,S
4,79,5,T-shirt,Adidas,Men's Fashion,79,4.302773,Black,M


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,User ID,Product ID,Price,Rating
count,1000.0,1000.0,1000.0,1000.0
mean,50.419,500.5,55.785,2.993135
std,28.78131,288.819436,26.291748,1.153185
min,1.0,1.0,10.0,1.000967
25%,26.0,250.75,33.0,1.992786
50%,50.0,500.5,57.0,2.984003
75%,75.0,750.25,78.25,3.985084
max,100.0,1000.0,100.0,4.987964


In [5]:
# Column names, non-null counts, dtypes and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   User ID       1000 non-null   int64  
 1   Product ID    1000 non-null   int64  
 2   Product Name  1000 non-null   object 
 3   Brand         1000 non-null   object 
 4   Category      1000 non-null   object 
 5   Price         1000 non-null   int64  
 6   Rating        1000 non-null   float64
 7   Color         1000 non-null   object 
 8   Size          1000 non-null   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 70.4+ KB


# Check Data

In [6]:
def check_data(data):
  '''
    Report missing values and duplicated rows.

      Parameters:
        data: Object to inspect; it must provide isnull() and duplicated()
              (a pandas DataFrame in this notebook)

      Return:
        None. The null counts and the number of duplicated rows are
        printed to standard output.
  '''
  # Both figures are computed up front, then emitted with a single print
  duplicated_data = data.duplicated().sum()
  null_data = data.isnull().sum()

  report = f"Null Data:\n{null_data}\n\nDuplicated Data: {duplicated_data}"
  print(report)

In [7]:
# Print the null count of every column and the number of duplicated rows
check_data(df)

Null Data:
User ID         0
Product ID      0
Product Name    0
Brand           0
Category        0
Price           0
Rating          0
Color           0
Size            0
dtype: int64

Duplicated Data: 0


# Cleaning Data

In [8]:
# Round ratings to whole numbers. This overwrites the 'Rating' column in
# place, so the original float ratings are no longer available in df.
df['Rating'] = df['Rating'].round(0)

In [9]:
# Display the frame to confirm 'Rating' now holds whole numbers
df

Unnamed: 0,User ID,Product ID,Product Name,Brand,Category,Price,Rating,Color,Size
0,19,1,Dress,Adidas,Men's Fashion,40,1.0,Black,XL
1,97,2,Shoes,H&M,Women's Fashion,82,4.0,Black,L
2,25,3,Dress,Adidas,Women's Fashion,44,3.0,Yellow,XL
3,57,4,Shoes,Zara,Men's Fashion,23,1.0,White,S
4,79,5,T-shirt,Adidas,Men's Fashion,79,4.0,Black,M
...,...,...,...,...,...,...,...,...,...
995,20,996,Shoes,Zara,Women's Fashion,55,2.0,Black,M
996,42,997,Sweater,Nike,Kids' Fashion,13,2.0,Green,L
997,9,998,Sweater,Zara,Men's Fashion,47,4.0,White,L
998,8,999,T-shirt,Zara,Women's Fashion,68,3.0,Blue,S


### Utility Matrix (COO)

In [10]:
# Load library
from scipy.sparse import coo_matrix

In [11]:
# Prepare the data: user IDs are the row indices, product IDs the column
# indices, and ratings the stored values of the utility matrix
row = df['User ID'].values
col = df['Product ID'].values
data = df['Rating'].values

In [12]:
# Create the utility matrix in COO format; no shape is passed, so scipy
# infers it from the largest row and column index
coo_data = coo_matrix((data, (row, col)))
# Printing lists the stored entries as "(row, col) value"
print(coo_data)

  (19, 1)	1.0
  (97, 2)	4.0
  (25, 3)	3.0
  (57, 4)	1.0
  (79, 5)	4.0
  (98, 6)	1.0
  (16, 7)	1.0
  (63, 8)	4.0
  (96, 9)	4.0
  (36, 10)	4.0
  (69, 11)	1.0
  (87, 12)	3.0
  (9, 13)	2.0
  (50, 14)	3.0
  (31, 15)	4.0
  (37, 16)	1.0
  (41, 17)	1.0
  (15, 18)	4.0
  (84, 19)	3.0
  (56, 20)	4.0
  (87, 21)	3.0
  (56, 22)	5.0
  (60, 23)	4.0
  (64, 24)	2.0
  (65, 25)	1.0
  :	:
  (82, 976)	2.0
  (4, 977)	4.0
  (23, 978)	4.0
  (77, 979)	2.0
  (28, 980)	5.0
  (35, 981)	3.0
  (50, 982)	4.0
  (94, 983)	4.0
  (70, 984)	3.0
  (57, 985)	3.0
  (1, 986)	5.0
  (77, 987)	3.0
  (80, 988)	3.0
  (67, 989)	5.0
  (51, 990)	3.0
  (60, 991)	5.0
  (42, 992)	5.0
  (89, 993)	4.0
  (59, 994)	4.0
  (34, 995)	4.0
  (20, 996)	2.0
  (42, 997)	2.0
  (9, 998)	4.0
  (8, 999)	3.0
  (91, 1000)	3.0


In [13]:
# Stored rating values and the number of stored entries
coo_data.data, len(coo_data.data)

(array([1., 4., 3., 1., 4., 1., 1., 4., 4., 4., 1., 3., 2., 3., 4., 1., 1.,
        4., 3., 4., 3., 5., 4., 2., 1., 5., 3., 3., 5., 3., 3., 3., 5., 4.,
        4., 1., 5., 3., 2., 2., 3., 2., 4., 2., 5., 3., 1., 2., 3., 3., 4.,
        4., 1., 2., 3., 2., 4., 2., 3., 2., 1., 2., 4., 2., 3., 5., 4., 4.,
        4., 3., 5., 3., 3., 3., 3., 3., 2., 1., 1., 3., 2., 5., 1., 3., 1.,
        3., 4., 3., 4., 4., 3., 2., 5., 4., 2., 5., 4., 4., 4., 2., 2., 4.,
        4., 5., 2., 3., 3., 4., 3., 5., 2., 2., 1., 1., 3., 3., 4., 4., 3.,
        3., 2., 4., 3., 1., 4., 5., 5., 2., 2., 1., 3., 3., 2., 4., 4., 1.,
        3., 2., 4., 2., 4., 4., 5., 1., 5., 3., 3., 3., 4., 1., 2., 5., 2.,
        2., 2., 1., 4., 3., 2., 3., 5., 3., 4., 3., 2., 5., 4., 4., 1., 3.,
        2., 5., 3., 3., 2., 2., 5., 2., 1., 2., 5., 4., 3., 3., 2., 2., 3.,
        4., 3., 4., 3., 4., 3., 4., 3., 1., 2., 3., 5., 4., 4., 3., 2., 2.,
        3., 4., 3., 3., 1., 2., 3., 3., 5., 2., 4., 4., 1., 3., 2., 5., 3.,
        1., 

In [14]:
def get_utility_matrix(df, shape=None):
    """
    Get a COO format utility matrix

    Parameters
    ----------
    df : pandas DataFrame
        Rating data with the columns 'User ID', 'Product ID' and 'Rating'

    shape : tuple of (int, int), default=None
        Shape (n_rows, n_cols) of the matrix. When None, scipy infers it
        from the largest 'User ID' and 'Product ID' present in `df`, so
        different subsets of the same data can yield different shapes;
        pass an explicit shape to keep them aligned.

    Returns
    -------
    coo_data : scipy COO format
        The utility matrix in COO format. 'User ID' values are used
        directly as row indices and 'Product ID' values as column indices.
    """
    # Prepare the data; the IDs are used as-is as coordinates (no re-indexing)
    row = df['User ID'].values
    col = df['Product ID'].values
    data = df['Rating'].values

    # Create the utility matrix in COO format; shape=None keeps scipy's
    # default behaviour of inferring the dimensions from the indices
    coo_data = coo_matrix((data, (row, col)), shape=shape)

    return coo_data

In [15]:
# Rebuild the utility matrix through the helper and show its summary repr
coo_data = get_utility_matrix(df = df)
coo_data

<101x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in COOrdinate format>

### Split Train Test

In [16]:
def split_train_test(df, test_size=0.2, random_state=42):
    """
    Function to create train & test utility matrix in COO format

    The rows of `df` are shuffled and split in two; each part is converted
    with `get_utility_matrix`, and both results are resized to the shape of
    the utility matrix built from the full `df`, so that train and test
    share the same dimensions.

    Parameters
    ----------
    df : pandas DataFrame
        Rating data accepted by `get_utility_matrix`

    test_size : float, default=0.2
        Fraction of the rows assigned to the test set, strictly between
        0 and 1

    random_state : int, default=42
        Seed for the shuffle, for reproducibility

    Returns
    -------
    coo_data_train : scipy COO format
        Utility matrix built from the train rows

    coo_data_test : scipy COO format
        Utility matrix built from the test rows, same shape as the train one

    Raises
    ------
    ValueError
        If `test_size` is not strictly between 0 and 1
    """
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be strictly between 0 and 1, got {test_size}"
        )

    # Generate random seed for reproducibility
    # NOTE: this reseeds numpy's global random state, which also affects
    # any later np.random call made outside this function
    np.random.seed(random_state)

    # Shuffle rating data (tolist() already returns a fresh list)
    raw_index = df.index.tolist()
    np.random.shuffle(raw_index)

    # Define the threshold
    threshold = int((1-test_size) * len(raw_index))

    # Split the index
    train_index = raw_index[:threshold]
    test_index = raw_index[threshold:]

    # Shape of the utility matrix of the whole data set; without it each
    # split would get a shape inferred from its own largest IDs
    full_shape = get_utility_matrix(df = df).shape

    # Next, extract the train & test data based on the split index
    coo_data_train = get_utility_matrix(df = df.loc[train_index])
    coo_data_test = get_utility_matrix(df = df.loc[test_index])

    # Grow both matrices to the common shape; no stored entry is dropped
    # because the full shape is at least as large as either split's shape
    coo_data_train.resize(full_shape)
    coo_data_test.resize(full_shape)

    # validate (the printed numbers are the stored-entry counts, nnz)
    print('Train, Test shape:', (coo_data_train.nnz, coo_data_test.nnz))

    return coo_data_train, coo_data_test

In [17]:
# Split the data into train and test utility matrices (80% / 20% of the rows,
# seeded for reproducibility)
coo_data_train, coo_data_test = split_train_test(df = df,
                                                 test_size = 0.2,
                                                 random_state = 42)

Train, Test shape: (800, 200)
