In [1]:
import time
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# movielens, column order: user id, item id, ratings and timestamp
# the fourth column is the timestamp, exclude it
train = pd.read_csv( 'data/u1.base', sep = '\t', header = None )
train = train.iloc[ :, 0:3 ]
test  = pd.read_csv( 'data/u1.test', sep = '\t', header = None )
test  = test.iloc[ :, 0:3 ]
column_names  = [ 'user_ids', 'item_ids', 'ratings' ]
train.columns = column_names
test.columns  = column_names

# make sure all the items and users that are in the testing data
# has been seen in training 
contain_items = test['item_ids'].isin( train['item_ids'].unique() )
contain_users = test['user_ids'].isin( train['user_ids'].unique() )
test = test[ contain_users & contain_items ]
print(train.shape)
print(test.shape)
train.head()

(80000, 3)
(19968, 3)


Unnamed: 0,user_ids,item_ids,ratings
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [3]:
# only retain users that are frequently rating 
# change the quantile for percentage that are retained
"""
histogram_counts = train['user_ids'].value_counts()
df_value_counts  = pd.DataFrame( histogram_counts > histogram_counts.quantile(0.5) )
df_value_counts  = df_value_counts.reset_index()
frequent_users   = df_value_counts.loc[ df_value_counts['user_ids'], 'index' ]

train = train[ train['user_ids'].isin(frequent_users) ]
test  = test[ test['user_ids'].isin(frequent_users) ]
"""
print(train.shape)
print(test.shape)

(80000, 3)
(19968, 3)


In [19]:
from recommender import BIKNN, GABIKNN

In [20]:
# this runs a single BIKNN
# the fit stage is about 646 seconds, 10.7 minute
start1 = time.time()

biknn1 = BIKNN( K = 20, B1 = 25, B2 = 25, iterations = 100000 )
biknn1.fit( data = train, column_names = [ 'user_ids', 'item_ids', 'ratings' ] )

# a boolean value indicating whether you have fitted the model
# biknn1.is_fitted

end1 = time.time()
elapse1 = end1 - start1
elapse1

639.8552379608154

In [5]:
# test around 140 seconds, 2.3 minutes
start2 = time.time()

pred = biknn1.predict(test)

end2 = time.time()
elapse2 = end2 - start2
elapse2

142.879625082016

In [7]:
biknn1.evaluate( pred, test['ratings'] )

0.7558124882518507

In [22]:
start3 = time.time()

ga1 = GABIKNN( 
    generation = 2,
    pop_size = 5,
    low = 0, 
    high = 100, 
    retain_rate = 0.5, 
    mutate_rate = 0.2,
    BIKNN = biknn1,
    verbose = True
)
ga1.fit(test)

end3 = time.time()
elapse3 = end3 - start3
elapse3

2206.8340170383453

In [25]:
ga1.generation_history

[info(cost=0.6403400368509364, chromo=[42, 65]),
 info(cost=0.6328918704783106, chromo=[31, 65])]

In [23]:
ga1.convergence_plot()

AttributeError: 'GABIKNN' object has no attribute 'convergence_plot'