<H3>Load libraries</H3>

In [1]:
import numpy as np
import os
import random
import matplotlib.pyplot as plt
import pandas as pd
import gc
from sklearn.cross_validation import KFold

import graphlab as gl

<H3>Load submissions data</H3>

In [2]:
# Load the raw submission log from the working directory. Later cells read its
# hacker_id, contest_id, challenge_id, solved and created_at columns.
submissions = pd.read_csv('submissions.csv')

In [3]:
submissions.head()

Unnamed: 0,hacker_id,contest_id,challenge_id,language,solved,created_at
0,56d47667c6a9242e,c8ff662c97d345d2,6d95afeb4d968b91,java,0,2011-08-07 00:36:53 UTC
1,56d47667c6a9242e,c8ff662c97d345d2,6d95afeb4d968b91,java,0,2011-08-07 00:38:45 UTC
2,56d47667c6a9242e,c8ff662c97d345d2,6d95afeb4d968b91,java,0,2011-08-07 00:43:49 UTC
3,56d47667c6a9242e,c8ff662c97d345d2,6d95afeb4d968b91,java,0,2011-08-07 00:52:34 UTC
4,56d47667c6a9242e,c8ff662c97d345d2,6d95afeb4d968b91,java,0,2011-08-07 00:56:52 UTC


In [4]:
# Keep only the columns needed to work out which challenges were left unsolved.
attempt_columns = ['hacker_id', 'contest_id', 'challenge_id', 'solved']
sub1 = submissions[attempt_columns]

In [5]:
# Restrict the attempts to the one contest that recommendations are made for.
in_target_contest = sub1['contest_id'] == 'c8ff662c97d345d2'
sub1 = sub1[in_target_contest]

In [6]:
# Every remaining row belongs to the same contest, so the column is redundant.
sub1 = sub1.drop('contest_id', axis=1)

In [7]:
# For every (hacker, challenge) pair, broadcast the maximum `solved` value of
# the pair onto each of its rows: non-zero as soon as one submission succeeded.
solved_per_pair = sub1.groupby(['hacker_id', 'challenge_id'])['solved']
sub1['final_solved'] = solved_per_pair.transform('max')

In [8]:
# The per-submission flag is superseded by the per-pair `final_solved` column.
sub1 = sub1.drop('solved', axis=1)

In [9]:
# Keep only the pairs where no submission was ever accepted.
never_solved = sub1['final_solved'] == 0
sub1 = sub1[never_solved]

In [10]:
# Repeated submissions to the same challenge now yield identical rows;
# collapse them so each (hacker, challenge) pair appears once.
sub1 = sub1.drop_duplicates()

In [11]:
sub1.shape

(13247, 3)

<H3>Taking a subset of challenges in the contest c8ff662c97d345d2 which were attempted but not solved by hackers</H3>

In [12]:
sub1.head()

Unnamed: 0,hacker_id,challenge_id,final_solved
0,56d47667c6a9242e,6d95afeb4d968b91,0
10,661657184c3d9218,6d95afeb4d968b91,0
11,05d5253080e43d9f,a4d04413ee4b0b24,0
12,661657184c3d9218,a4d04413ee4b0b24,0
14,661657184c3d9218,127df5e318d03214,0


In [13]:
# Submission timestamps, keyed by hacker and challenge, taken from the full log.
timestamp_columns = ['hacker_id', 'challenge_id', 'created_at']
sub2 = submissions[timestamp_columns]

In [14]:
# Attach every submission timestamp to its unsolved (hacker, challenge) pair.
sub1 = sub1.merge(sub2, how='left', on=['hacker_id', 'challenge_id'])

In [16]:
# `time` is the largest `created_at` value seen for each (hacker, challenge) pair.
created_per_pair = sub1.groupby(['hacker_id', 'challenge_id'])['created_at']
sub1['time'] = created_per_pair.transform('max')

In [18]:
# Drop `created_at` and `final_solved`; only the per-pair `time` is needed now.
unsolved_columns = ['hacker_id', 'challenge_id', 'time']
sub1 = sub1[unsolved_columns]

In [20]:
# The merge produced one row per submission; collapse back to one row per pair
# and renumber the rows from zero.
sub1 = sub1.drop_duplicates()
sub1 = sub1.reset_index(drop=True)

In [21]:
sub1.shape

(13247, 3)

<H3>Sorting the unsolved challenges based on time</H3>

In [22]:
# Order by hacker (descending id) and, within a hacker, latest attempt first.
# `DataFrame.sort` is deprecated (it is the source of the warning printed under
# this cell) and removed in later pandas releases; `sort_values` is the
# supported equivalent and takes the same arguments.
# NOTE(review): `time` is a string column. The sample timestamps shown earlier
# use a fixed "YYYY-MM-DD HH:MM:SS UTC" layout, for which string order equals
# chronological order; confirm that holds for the whole file.
sub1 = sub1.sort_values(['hacker_id', 'time'], ascending=[False, False])

  if __name__ == '__main__':


In [23]:
sub1.head(10)

Unnamed: 0,hacker_id,challenge_id,time
11231,fffcfd13f3e0969d,7b6a8414a1e8273c,2016-07-01 03:43:06 UTC
10522,fffcfd13f3e0969d,659f2ceb67eca1ef,2016-06-10 13:06:39 UTC
5352,ffe228e656e5cbbd,659f2ceb67eca1ef,2015-09-21 14:46:39 UTC
5350,ffe228e656e5cbbd,10fbfe48f770e0e9,2015-09-21 14:07:12 UTC
10609,ffe0dff4db6deb6e,7de92abeb686999e,2016-06-13 16:47:44 UTC
10473,ffe0dff4db6deb6e,9aba48565107d1ce,2016-06-09 04:48:06 UTC
12891,ffcae6424b43fbb2,59ec55c0e537bfcf,2016-08-13 04:54:17 UTC
1283,ffc55c6b4fd6753d,5f6525508e7e07b7,2014-07-02 01:34:11 UTC
1260,ffc55c6b4fd6753d,1e5b724075cd3d1e,2014-06-25 13:35:42 UTC
5389,ffb87a9ca8233500,daaa4f5cca620546,2015-09-23 17:38:11 UTC


In [24]:
# Distinct hackers with at least one unsolved challenge, in order of appearance.
unsolve_hacker_list = sub1['hacker_id'].unique().tolist()

In [25]:
len(unsolve_hacker_list)

5809

<H3>Load challenges data</H3>

In [26]:
# Challenge metadata; later used only for its challenge_id -> contest_id mapping.
challenges = pd.read_csv('challenges.csv')

In [27]:
# Every hacker that appears anywhere in the submission log.
user_list = submissions['hacker_id'].unique().tolist()

<H3>Create an SFrame</H3>

In [28]:
# Train on successful submissions only: a solved challenge is the positive signal.
solved_rows = submissions[submissions['solved'] == 1]
train_data = gl.SFrame(solved_rows)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\ERNEST~1.S\AppData\Local\Temp\graphlab_server_1473014713.log.0


This non-commercial license of GraphLab Create for academic use is assigned to ernest.kirubakaran@gmail.com and will expire on August 31, 2017.


<H3>Train the model</H3>

In [29]:
# Item-item recommender over (hacker, challenge) solves using Jaccard similarity.
model = gl.item_similarity_recommender.create(
    train_data,
    user_id='hacker_id',
    item_id='challenge_id',
    similarity_type='jaccard'
)

<H3>Get recommendations for users</H3>

In [30]:
# Request 50 candidates per hacker. The list is later cut down to challenges of
# the target contest, so presumably the surplus is there to leave at least 10
# after that filter -- TODO confirm the intent behind k = 50.
recommendation = model.recommend(users = user_list, k = 50)

In [31]:
# Lookup table: which contest each challenge belongs to.
lookup_columns = ['challenge_id', 'contest_id']
challenge_contest = challenges[lookup_columns]

In [32]:
# One row per distinct (challenge, contest) pair, so the later merge does not
# multiply recommendation rows.
challenge_contest = challenge_contest.drop_duplicates(keep='first')

In [33]:
# Convert the recommender output to pandas for the filtering and grouping below.
recommendation_df = recommendation.to_dataframe()

In [34]:
# Tag each recommended challenge with the contest it belongs to.
recommendation_df = recommendation_df.merge(challenge_contest, how='left', on='challenge_id')

<H3>Keep only valid challenges in the recommendations</H3>

In [35]:
# Only challenges of the target contest are valid recommendations.
is_valid_challenge = recommendation_df['contest_id'] == 'c8ff662c97d345d2'
recommendation_df = recommendation_df[is_valid_challenge]

In [36]:
recommendation_df.shape

(484380, 5)

In [37]:
# `contest_id` is constant after the filter, and `rank` is not used by any
# later cell (ordering is done on `score`).
recommendation_df = recommendation_df.drop(['contest_id', 'rank'], axis=1)

In [38]:
# Number of valid recommendations each hacker has left, repeated on every row.
recs_per_hacker = recommendation_df.groupby(['hacker_id'])['challenge_id']
recommendation_df['hacker_count'] = recs_per_hacker.transform('count')

In [39]:
recommendation_df['hacker_count'].describe()

count    484380.000000
mean         49.212098
std           3.791646
min           2.000000
25%          50.000000
50%          50.000000
75%          50.000000
max          50.000000
Name: hacker_count, dtype: float64

In [40]:
# How many hackers each challenge was recommended to, repeated on every row.
scores_per_challenge = recommendation_df.groupby(['challenge_id'])['score']
recommendation_df['challenge_count'] = scores_per_challenge.transform('count')

In [41]:
# Distinct recommendation counts observed across challenges.
chal_counts = set(recommendation_df['challenge_count'])

<H3>Get top 10 challenges in the prediction</H3>

In [42]:
# The 10 most frequently recommended challenges, ordered from least to most
# popular (the padding cell below indexes this list from the front, so the
# orientation matters).
#
# Rank the challenges themselves rather than the distinct count values: going
# through the set of counts keeps only one challenge per count, so challenges
# tied on popularity were silently skipped in favour of less popular ones.
challenge_popularity = (
    recommendation_df[['challenge_id', 'challenge_count']]
    .drop_duplicates()
    # mergesort is stable, so tied challenges keep their first-seen order
    .sort_values('challenge_count', kind='mergesort')
)
top_challenges = list(challenge_popularity['challenge_id'].tail(10))

In [43]:
top_challenges

['9580a525da7618e9',
 '34386eca594b7bc2',
 'a8fa4ef7a8bc43dc',
 '32863af4fa5c200b',
 'b49bb8ec9c2b8481',
 '286001f675d5dc00',
 '76fba8ff25f765b6',
 '425f241df843cd89',
 '26c95b75d8237f39',
 '14494ad91c883fa1']

<H3>Find the hackers who have fewer than 10 valid recommendations</H3>

In [44]:
# Rows of hackers who cannot fill all 10 slots from the model alone.
has_too_few = recommendation_df['hacker_count'] < 10
less_hackers = recommendation_df[has_too_few]

In [45]:
# Distinct ids of those hackers, in order of appearance.
less_hacker_list = less_hackers['hacker_id'].unique().tolist()

In [46]:
len(less_hacker_list)

48

In [47]:
# Every hacker with at least one valid recommendation.
pred_hacker_list = recommendation_df['hacker_id'].unique().tolist()

In [48]:
len(pred_hacker_list)

9998

In [49]:
# Every hacker that must receive a line in the output file.
total_hacker_list = submissions['hacker_id'].unique().tolist()

<H3>Hackers for whom no recommendation was made</H3>

In [50]:
# Hackers present in the submissions but absent from the filtered recommendations.
# Membership is tested against a set: testing against the list made this
# comprehension quadratic in the number of hackers. Result order is unchanged.
predicted_hackers = set(pred_hacker_list)
missing_hacker_list = [i for i in total_hacker_list if i not in predicted_hackers]

In [51]:
missing_hacker_list

['14128435fc0eb297', 'd639fbb774b9d862']

<H3>Hackers for whom at least 10 recommendations are available</H3>

In [52]:
# Keep only hackers with at least 10 valid recommendations; the ones with
# fewer are handled separately. A set makes each membership test constant-time
# instead of a scan of the list. Result order is unchanged.
short_hackers = set(less_hacker_list)
pred_hacker_list = [i for i in pred_hacker_list if i not in short_hackers]

In [53]:
len(pred_hacker_list)

9950

In [54]:
# The helper count columns have served their purpose.
recommendation_df = recommendation_df.drop(['hacker_count', 'challenge_count'], axis=1)

In [55]:
# Best-scoring recommendation first within each hacker. `DataFrame.sort` is
# deprecated (hence the warning printed under this cell) and removed in later
# pandas releases; `sort_values` accepts the same arguments.
recommendation_df = recommendation_df.sort_values(['hacker_id', 'score'], ascending=[False, False])

  if __name__ == '__main__':


<H3>Saving results for users having 10 recommendations</H3>
<H4>Up to two of each user's most recently attempted unsolved challenges are listed first. The remaining slots (at least 8) are filled from the model's recommendations.</H4>

In [56]:
# Build one "hacker,challenge1,...,challenge10" line per hacker that has at
# least 10 model recommendations.
#
# Each line starts with up to MAX_UNSOLVED of the hacker's unsolved challenges
# (in the row order of `sub1`), followed by the model's recommendations in the
# row order of `recommendation_df`, skipping any challenge already listed.
N_RECOMMENDATIONS = 10
MAX_UNSOLVED = 2

# Group once instead of re-filtering the full frames for every hacker; groupby
# keeps the original row order inside each group, so rankings are preserved.
recs_by_hacker = {
    hacker: list(group['challenge_id'])
    for hacker, group in recommendation_df.groupby('hacker_id', sort=False)
}
unsolved_by_hacker = {
    hacker: list(group['challenge_id'].unique())
    for hacker, group in sub1.groupby('hacker_id', sort=False)
}

result = []
for user in pred_hacker_list:
    unsolved = unsolved_by_hacker.get(user, [])[:MAX_UNSOLVED]
    picks = list(unsolved)
    for challenge in recs_by_hacker.get(user, []):
        if len(picks) >= N_RECOMMENDATIONS:
            break
        if challenge not in unsolved:
            picks.append(challenge)
    if len(picks) < N_RECOMMENDATIONS:
        # Previously this surfaced as an opaque KeyError from positional lookup.
        raise ValueError('not enough recommendations to fill %d slots for hacker %s'
                         % (N_RECOMMENDATIONS, user))
    result.append(','.join([user] + picks))

<H3>For users with fewer than 10 recommendations, the most frequently recommended challenges are added</H3>

In [57]:
# Hackers with fewer than 10 model recommendations: keep their own
# recommendations and pad the line up to 10 with globally popular challenges.
#
# `top_challenges` is ordered least -> most popular, so padding is taken from
# the END of the list. Challenges the hacker already has are skipped first;
# the previous positional padding could repeat a challenge on the same line.
# When there is no overlap the output is identical to the old behaviour.
own_recs_by_hacker = {
    hacker: list(group['challenge_id'])
    for hacker, group in recommendation_df.groupby('hacker_id', sort=False)
}
for user in less_hacker_list:
    own = own_recs_by_hacker.get(user, [])
    needed = max(10 - len(own), 0)
    fillers = [c for c in top_challenges if c not in own]
    if len(fillers) < needed:
        raise ValueError('not enough popular challenges to pad hacker %s to 10' % user)
    res = [user] + own + fillers[len(fillers) - needed:]
    result.append(','.join(res[:11]))

In [58]:
# Hackers with no valid recommendation at all simply get the popular challenges.
for hacker in missing_hacker_list:
    fields = [hacker] + top_challenges
    # Indexing positions 0..10 explicitly keeps the requirement that the
    # line has the hacker id plus ten challenges.
    line = ','.join(fields[pos] for pos in range(11))
    result.append(line)

In [59]:
# Wrap the list of pre-joined lines in a single-column frame (column label 0),
# one row per hacker.
result = pd.DataFrame(result)

In [60]:
result.shape

(10000, 1)

<H3>Saving the result</H3>

In [61]:
# Each row of `result` holds one pre-joined "hacker,challenge1,...,challenge10"
# string. Written as a single column, to_csv quotes every line because the field
# contains the separator, so split the lines back into real columns first.
submission = pd.DataFrame([line.split(',') for line in result[0]])
submission.to_csv('recommendation.csv', header=False, index=False)