In [1]:
import numpy as np
import operator

Recommendation system for shop web-site.
Processing data with unique sessions where users' interacted with shop's web-site. For each session viewed items' indices and bought items' indices are given.
Goals:
- Build recommendation system based on a) viewing popluraty of items b) a) purchasing popluraty of items. 
- Amount of recommended items for user is equal or less than amount of viewed items. 
- Show unique items.
- Evaluate quality by using AP@k and AR@k for k=1 and k=5 where amount of recommended items does not exceed k

Baseline:
    1. Define import functions
    2. Define function that creates dicts for sorting
    3. Define fuction that transforms and sorts sample with dict
    4. Define metrics functions

In [37]:
def import_train_data():
    '''
    Imports train data by-line
    '''
    with open('sessions_train.txt') as file:
        return [i.replace('\n', '').split(';') for i in file.readlines()]

In [39]:
def import_test_data():
    '''
    Imports test data by-line
    '''
    with open('sessions_test.txt') as file:
        return [i.replace('\n', '').split(';') for i in file.readlines()]

Define function that creates two dicts by which samples will be sorted. 

First where keys are viewed items' unique indeces in train sample and values are corresponding quantities. 

Second where keys are bought items' unique indeces in train sample and values are corresponding quantities 

In [42]:
def dicts_creator():

# load train data
    train_data = import_train_data()
    
# create bag of words from all viewed items

    bagow_views = []
    for i in train_data:
        z = i[0].split(',')
        for j in z:
            bagow_views.append(j)

# create bag of words from all bought items (empty sessions excluded)
       
    bagow_purchases = []
    for i in train_data:
        if len(i[1]) > 0:
            z = i[1].split(',')
        for j in z:
            bagow_purchases.append(j)

# create dict of {'unique viewed item index': quantity of views} kind

    counts_views = np.unique(bag_of_words()[0], return_counts=True)
    dictfreq_views = {}
    for i, j in zip(counts_views[0], counts_views[1]):
        dictfreq_views[i] = j
    
# create dict of {'unique purchased item index': quantity of purchases} kind
    
    counts_purchases = np.unique(bag_of_words()[1], return_counts=True)
    dictfreq_purchases = {}
    for i, j in zip(counts_purchases[0], counts_purchases[1]):
        dictfreq_purchases[i] = j
    return dictfreq_views, dictfreq_purchases

Define function that sorts a single line:

In [60]:
def string_sorter(string, dict):
    import operator
    """
    Sorts indeces of items in one session
    """
    z = []
    for i in string:
        try:
            z.append((i, dict[i]))
        except KeyError:
            z.append((i, 0))
    return [i[0] for i in sorted(z, key=operator.itemgetter(1), reverse=True)]

In [6]:
def string_sorter_tester(string_sorter):
    base = {'1':1, '2':2, '3':3, '4':4}
    string = (['1']+['2']+['3']+['4'], ['6']+['2']+['8'])
    ans1 = ['4', '3', '2', '1']
    if string_sorter(string[0], base) != ans1:
        print('test 1 error')
    ans2 = ['2', '6', '8']
    if string_sorter(string[1], base) != ans2:
        print('test 2 error')
    base = {'1':1, '2':2, '3':2, '4':4}
    string = (['1']+['3']+['2']+['4']+['6']+['5'], ['6']+['2']+['8'])
    ans3 = ['4', '3', '2', '1', '6', '5']
    if string_sorter(string[0], base) != ans3:
        print('test 3 error')

In [7]:
string_sorter_tester(string_sorter)

Define function that sorts whole massive:

In [51]:
def massive_sorter(data, dict):
    ''' Sorts viewed indices in massive by-line according to theirs frequencies. Massive must consist
    of lines with a tuple each. Each tuple must contain two lists, both with items indices.
    '''
    
    # Make each line in massive a tuple of 2 lists: unique viewed items indices 
    # and purchased items indices respectively. 
    
    data_prepared = [(np.unique(i[0].split(',')), i[1].split(',')) 
                       for i in data if len(i[1])>0]
    
    # Sort massive by-line
    
    sorted = []
    for i in data_prepared:
        sorted.append((string_sorter(i[0], dict), i[1]))
    return sorted

Metrics. Average precision and average recall are used.

Define functions that count presicion, recall for one session and average precision, recall for whole massive.

In [53]:
def precision_line(line, km):
    '''Counts presicion for a line
    '''
    l = [1 if i in line[1] else 0 for i in line[0]]
    return sum(l[0:km])/km
def precision(massive, km):
    '''Counts average presicion for a massive
    '''
    l = []
    for line in massive:
        l.append(precision_line(line, km))
    return sum(l)/len(l)

In [54]:
def recall_line(line, km):
    '''Counts recall for a line
    '''
    l = [1 if i in line[1] else 0 for i in line[0]]
    return sum(l[0:km])/len(line[1])
def recall(massive, km):
    '''Counts average recall for a massive
    '''
    l = []
    for line in massive:
        l.append(recall_line(line, km))
    return sum(l)/len(l)

In [55]:
def recalltest(recall):
    x1 = [([1, 2, 3], [0, 2]), ([4, 5, 6], [1])]
    if recall(x1, 3) != 0.25:
        print('test1 error')
        return recall(x1, 3)
    x2 = [([1, 2, 3], [0, 2]), ([4, 5], [1, 2, 3])]
    if recall(x2, 3) != 0.25:
        print('test2 error')

In [56]:
recalltest(recall)

Get metrics on train sample:

In [None]:
train_sorted_views = massive_sorter(import_train_data(), dicts_creator()[0])
train_sorted_purchases = massive_sorter(import_train_data(), dicts_creator()[1])

In [58]:
# train sample. Recommendations by views rating
k = 1
r1 = recall(train_sorted_views, k)
print('recall on most viewed items on train sample k=1:', r1)
p1 = precision(train_sorted_views, k)
print('presicion on most viewed items on train sample k=1:', p1)
k = 5
r5 = recall(train_sorted_views, k)
print('recall on most viewed items on train sample k=5:', r5)
p5 = precision(train_sorted_views, k)
print('presicion on most viewed items on train sample k=5:', p5)
with open('quality_views_train.txt', 'w') as f:
    f.write((' ').join(list(map(lambda x: str(round(x, 2)), [r1, p1, r5, p5]))))

recall on most viewed items on train sample k=1: 0.438292114082025
presicion on most viewed items on train sample k=1: 0.5069290465631929
recall on most viewed items on train sample k=5: 0.8238064465406484
presicion on most viewed items on train sample k=5: 0.21213968957872026


In [32]:
# train sample. Recommendations by purchasing rating
k = 1
r1 = recall(train_sorted_purchases, k)
print('recall on most purchased items on train sample k=1:', r1)
p1 = precision(train_sorted_purchases, k)
print('presicion on most purchased items on train sample k=1:', p1)
k = 5
r5 = recall(train_sorted_purchases, k)
print('recall on most purchased items on train sample k=5:', r5)
p5 = precision(train_sorted_purchases, k)
print('presicion on most purchased items on train sample k=5:', p5)
with open('quality_purchases_train.txt', 'w') as f:
    f.write((' ').join(list(map(lambda x: str(round(x, 2)), [r1, p1, r5, p5]))))

recall on most purchased items on train sample k=1: 0.6666722789713365
presicion on most purchased items on train sample k=1: 0.7732815964523282
recall on most purchased items on train sample k=5: 0.9253956378679217
presicion on most purchased items on train sample k=5: 0.25232815964524263


Get metrics on test sample:

In [64]:
test_sorted_views = massive_sorter(import_test_data(), dicts_creator()[0])
test_sorted_purchases = massive_sorter(import_test_data(), dicts_creator()[1])

In [65]:
# test sample. Recommendations by views rating
k = 1
r1 = recall(test_sorted_views, k)
print('recall on most viewed items on test sample k=1:', r1)
p1 = precision(test_sorted_views, k)
print('presicion on most viewed items on test sample k=1:', p1)
k = 5
r5 = recall(test_sorted_views, k)
print('recall on most viewed items on test sample k=5:', r5)
p5 = precision(test_sorted_views, k)
print('presicion on most viewed items on test sample k=5:', p5)
with open('quality_views_test.txt', 'w') as f:
    f.write((' ').join(list(map(lambda x: str(round(x, 2)), [r1, p1, r5, p5]))))

recall on most viewed items on test sample k=1: 0.41399478117759125
presicion on most viewed items on test sample k=1: 0.47748976807639837
recall on most viewed items on test sample k=5: 0.7990271151422681
presicion on most viewed items on test sample k=5: 0.20381991814461664


In [66]:
# test sample. Recommendations by purchasing rating
k = 1
r1 = recall(test_sorted_purchases, k)
print('recall on most purchased items on test sample k=1:', r1)
p1 = precision(test_sorted_purchases, k)
print('presicion on most purchased items on test sample k=1:', p1)
k = 5
r5 = recall(test_sorted_purchases, k)
print('recall on most purchased items on test sample k=5:', r5)
p5 = precision(test_sorted_purchases, k)
print('presicion on most purchased items on test sample k=5:', p5)
with open('quality_purchases_test.txt', 'w') as f:
    f.write((' ').join(list(map(lambda x: str(round(x, 2)), [r1, p1, r5, p5]))))

recall on most purchased items on test sample k=1: 0.4229841486389914
presicion on most purchased items on test sample k=1: 0.48894952251023194
recall on most purchased items on test sample k=5: 0.7964858594403513
presicion on most purchased items on test sample k=5: 0.20414733969986873
