In [1]:
import pandas as pd
import numpy as py

# Data

In [2]:
# Read the three competition tables (transactions, article metadata, customer
# metadata) in one go; each CSV is parsed with pandas' default options.
transections, articles, customers = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
        '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
        '../input/h-and-m-personalized-fashion-recommendations/customers.csv',
    )
)

In [3]:
# Purchase history reduced to the two columns the recommender uses:
# who bought (customer_id) and what was bought (article_id).
history_columns = ['customer_id', 'article_id']
transection_hist = transections.loc[:, history_columns]
transection_hist.head()

In [4]:
# Customer table narrowed to the id and the age column.
age_columns = ['customer_id', 'age']
customer_age = customers.loc[:, age_columns]
customer_age.head()

In [5]:
# Summary statistics of the age column.
customer_age.loc[:, 'age'].describe()

In [6]:
# Article id paired with its colour group code, exposed under the shorter
# column name `color_code`.
article_color = (
    articles
    .loc[:, ['article_id', 'colour_group_code']]
    .rename(columns={'colour_group_code': 'color_code'})
)
article_color.head()

In [7]:
# Distinct colour codes present in the article table.
article_color.loc[:, 'color_code'].unique()

In [8]:
# Sample submission file; it is used below as the source of customer ids.
submission = pd.read_csv(
    filepath_or_buffer='../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
)

In [9]:
# Peek at the first five rows of the sample submission.
submission.head(n=5)

# Collaborative Filtering

In [10]:
# Two inverted indexes over the purchase history, filled in by a later cell:
#   itemsPerUser: customer id -> list of purchased article ids
#   usersPerItem: article id  -> list of purchasing customer ids
itemsPerUser, usersPerItem = dict(), dict()

In [None]:
# Fill both purchase indexes in a single pass over the history.
# Iterating the two columns directly avoids `iloc[i][col]`, which builds a
# throwaway row Series twice per transaction and makes this loop impractically
# slow on a large table. `setdefault` creates the list on first sight of a key,
# so rows are appended in the same order as before (repeat purchases are kept).
for customer, article in zip(transection_hist['customer_id'],
                             transection_hist['article_id']):
    itemsPerUser.setdefault(customer, []).append(article)
    usersPerItem.setdefault(article, []).append(customer)
    

In [17]:
# compute jaccard similarity between sets s1 and s2
def jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2|.

    Args:
        s1: a set.
        s2: a set (any iterable accepted by ``set.union`` works).

    Returns:
        A float between 0 and 1. When both inputs are empty the union is
        empty as well; 0.0 is returned instead of dividing by zero.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        # Two empty sets share nothing; treat them as not similar.
        return 0.0
    return len(s1.intersection(s2)) / union_size

In [18]:
# find similar customers across entire customer set
def similar_customers(c):
    """Return the customers whose purchase history is most similar to c's.

    Every other key of ``itemsPerUser`` is scored against ``c`` with
    ``jaccard`` on the two item sets; all customers tied for the highest
    score are returned.

    Args:
        c: customer id; must be a key of ``itemsPerUser``.

    Returns:
        List of customer ids (never containing ``c``) sharing the best score,
        ordered by id descending. Empty when ``itemsPerUser`` holds no other
        customer. Note that if the best score is 0.0, every other customer
        ties and all of them are returned.

    Raises:
        KeyError: if ``c`` is not in ``itemsPerUser``.
    """
    # Loop-invariant: build the target's item set once, not per candidate.
    target_items = set(itemsPerUser[c])
    scored = [
        (jaccard(set(items), target_items), other)
        for other, items in itemsPerUser.items()
        if other != c
    ]
    if not scored:
        # No candidate to compare against.
        return []
    best_similarity = max(similarity for similarity, _ in scored)
    # Descending id order matches what a reverse sort of (score, id) pairs
    # yields among equal scores.
    return sorted(
        (other for similarity, other in scored if similarity == best_similarity),
        reverse=True,
    )

In [27]:
# find similar items across the entire item set
def similar_items(i):
    """Return the articles whose buyer set is most similar to that of i.

    Every other key of ``usersPerItem`` is scored against ``i`` with
    ``jaccard`` on the two customer sets; all articles tied for the highest
    score are returned.

    Args:
        i: article id; must be a key of ``usersPerItem``.

    Returns:
        List of article ids (never containing ``i``) sharing the best score,
        ordered by id descending. Empty when ``usersPerItem`` holds no other
        article. If the best score is 0.0, every other article ties and all
        of them are returned.

    Raises:
        KeyError: if ``i`` is not in ``usersPerItem``.
    """
    # Loop-invariant: build the target's buyer set once, not per candidate.
    target_buyers = set(usersPerItem[i])
    scored = [
        (jaccard(set(buyers), target_buyers), other)
        for other, buyers in usersPerItem.items()
        if other != i
    ]
    if not scored:
        # No candidate to compare against.
        return []
    best_similarity = max(similarity for similarity, _ in scored)
    # Descending id order matches what a reverse sort of (score, id) pairs
    # yields among equal scores.
    return sorted(
        (other for similarity, other in scored if similarity == best_similarity),
        reverse=True,
    )

In [20]:
# find the subset of s2 that is similar to s1
def similar_subset(s1, s2):
    """For each article in s1, pick the article(s) of s2 most similar to it.

    Similarity is ``jaccard`` over the buyer sets stored in ``usersPerItem``.

    Args:
        s1: iterable of article ids to match from.
        s2: iterable of candidate article ids.

    Returns:
        A list built by concatenating, for every article of ``s1`` in
        iteration order, the candidates tied for the best score (ordered by
        id descending). The same candidate can therefore appear more than
        once. Empty when ``s1`` or ``s2`` is empty.

    Raises:
        KeyError: if an id from ``s1`` or ``s2`` is missing from
            ``usersPerItem``.
    """
    # Build each candidate's buyer set once instead of once per (i, j) pair.
    candidates = [(j, set(usersPerItem[j])) for j in s2]
    items = []
    if not candidates:
        # Nothing to choose from.
        return items
    for i in s1:
        buyers = set(usersPerItem[i])
        scored = [
            (jaccard(buyers, candidate_buyers), j)
            for j, candidate_buyers in candidates
        ]
        best_similarity = max(similarity for similarity, _ in scored)
        # Descending id order matches a reverse sort of (score, id) pairs.
        items.extend(sorted(
            (j for similarity, j in scored if similarity == best_similarity),
            reverse=True,
        ))
    return items

In [67]:
# recommend items to customer 'c' based on similarities between purchase history of c and s
def recommend(c, s):
    """Recommend articles to customer c using the history of customer s.

    Let ``inter`` be the articles both customers bought and ``s_diff`` the
    articles only ``s`` bought.

    * If ``s_diff`` is non-empty, recommend the articles of ``s_diff`` that
      ``similar_subset`` judges closest to the shared articles.
    * Otherwise fall back to ``similar_items`` of every shared article,
      searched over the whole catalogue.

    Args:
        c: id of the customer receiving recommendations.
        s: id of the customer whose history is mined.

    Returns:
        List of distinct article ids in first-seen order (may be empty).

    Raises:
        KeyError: if ``c`` or ``s`` is not in ``itemsPerUser``.

    NOTE(review): in the fallback branch the results of ``similar_items`` are
    not filtered against c's own purchases - confirm whether that is intended.
    """
    s_items = set(itemsPerUser[s])
    inter = s_items.intersection(set(itemsPerUser[c]))
    s_diff = s_items.difference(inter)  # bought by s only

    if s_diff:
        candidates = similar_subset(inter, s_diff)
    else:
        candidates = [j for i in inter for j in similar_items(i)]

    # dict.fromkeys drops repeats while keeping first-seen order, without the
    # linear scan of `x not in list` for every candidate.
    return list(dict.fromkeys(candidates))

In [68]:
# Take the value in the first row / first column of the submission frame as
# the customer to demo on. `iloc[0, 0]` is purely positional; the previous
# `iloc[0][0]` relied on integer-position lookup through `Series[...]`,
# which pandas 2.x deprecates.
c = submission.iloc[0, 0]
# Customers tied for the highest Jaccard similarity to c.
s = similar_customers(c)
s

In [70]:
# Gather recommendations from the similar customers, in order, until the list
# is full. `recommend` only de-duplicates within one neighbour, so the same
# article can come back from several of them; skip anything already collected
# so the final list has no repeats.
MAX_RECOMMENDATIONS = 12  # cap on the length of the recommendation list
recommendations = []
for customer in s:
    for article in recommend(c, customer):
        if article in recommendations:
            continue
        recommendations.append(article)
        if len(recommendations) >= MAX_RECOMMENDATIONS:
            break
    if len(recommendations) >= MAX_RECOMMENDATIONS:
        break
recommendations

In [71]:
# Look up the article metadata for every recommended id, keeping the order of
# `recommendations`. The per-article frames are collected first and
# concatenated once: `DataFrame.append` no longer exists in pandas 2.x and
# growing a frame inside a loop copies it on every step.
matching_rows = [
    articles[articles['article_id'] == article] for article in recommendations
]
if matching_rows:
    recommend_details = pd.concat(matching_rows, ignore_index=True)
else:
    # No recommendations: show an empty frame with the same columns instead
    # of failing on recommendations[0].
    recommend_details = articles.iloc[0:0]
recommend_details

# Submission

In [73]:
# submission = submission.drop(['prediction'], axis=1)
# submission.head()

In [72]:
# predictions = []
# for c in submission['customer_id'].values:
#     s = similar_customers(c)[0]
#     recommendations = recommend(c, s)
#     predictions.append(' '.join(str(i) for i in recommendations))        

In [None]:
# submission['prediction'] = predictions
# submission.head()