In [9]:
import requests
import pprint
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import random
import networkx as nx
import matplotlib.pyplot as plt
from typing import List
from typing import Dict
from itertools import combinations
import time

## Old

In [10]:
def filter_handles(df: pd.DataFrame, handles=[]) -> pd.DataFrame:
  df = df[df['handle'].isin(handles)]
  return df

def sample_random_users(df: pd.DataFrame, N:int = 5) -> pd.DataFrame:
  handles = list(df['handle'].unique())
  return random.sample(handles, N)

In [11]:
from functools import lru_cache

URL_USER_SUBMISSIONS = 'https://codeforces.com/api/user.status'

@lru_cache(maxsize = 128)
def get_all_problems(df):
    return df['problemID'].unique().tolist()

@lru_cache(maxsize = 128)
def get_all_user_problems(user_handle):
    res = requests.get(URL_USER_SUBMISSIONS,
                       params={'handle': user_handle, 'from': 1,
                       'to': 10000})
    df = pd.DataFrame.from_dict(res.json()['result'])
    df = df[(df['verdict'] == 'OK') | (df['verdict'] == 'WRONG_ANSWER')]
    df = df[['problem', 'verdict']]
    df['problem'] = df['problem'].apply(lambda x: str(x['contestId']) \
            + x['index'])
    df.drop_duplicates(subset=['problem'])
    return df

@lru_cache(maxsize = 128)
def get_solved_problems(df, user_handle):
  return df[df['handle'] == user_handle] 

@lru_cache(maxsize = 128)
def get_pending_problems(user_handle):
    all_problems = get_all_problems()
    solved_problems = get_solved_problems(user_handle)
    df = all_problems[~all_problems['problem'].isin(solved_problems['problem'])]
    return df

## Loading and Sampling Dataset (Diego Canez)

In [12]:
def get_all_user_problems(user_handle: str):
    res = requests.get(URL_USER_SUBMISSIONS,
                       params={'handle': user_handle})
    df = pd.DataFrame.from_dict(res.json()['result'])
    df = df[df['verdict'] == 'OK']
    df = df[['problem']]
    df['problem'] = df['problem'].apply(lambda x: str(x['contestId']) + x['index'] if 'contestId' in x else None)
    df.dropna(inplace=True)
    df.drop_duplicates(subset=['problem'], inplace=True)
    return df['problem'].tolist()

def prepare_dataset(user_handles: List[str]):
  data = {}
  for handle in user_handles:
    data[handle] = get_all_user_problems(handle)
    time.sleep(1)
  return data

In [13]:
def get_users(k=None):
  URL = 'https://codeforces.com/api/user.ratedList?activeOnly=true'
  res = requests.get(URL)
  df = pd.DataFrame.from_dict(res.json()['result'])
  ans = df['handle'].tolist()
  if k is None:
    k = len(ans)
  return random.sample(ans, k)


## Creation of the Graph (Jorge Rebosio)

In [14]:
def add_edges(G: nx.Graph, user_problems_ix: List[int]):
  for u, v in combinations(user_problems_ix, 2):
    if (G.has_edge(u , v)):
      G.edges[u, v]['weight'] += 1 
    else:
      G.add_edge(u, v, weight=1)

def create_graph(user_problems_ix: Dict[str, List[int]]) -> nx.Graph:
  G = nx.Graph()
  for user, problems in user_problems_ix.items():
    add_edges(G, problems)
  return G

### Plotting Subgraph

In [15]:
# Sampling 10 nodes from graph G
def draw_subgraph(G: nx.Graph, N = 10, edge_attribute='weight'):
  H = G.subgraph(list(G.nodes)[:N])
  pos = nx.spring_layout(H)
  nx.draw(H, pos)
  edge_labels = nx.get_edge_attributes(H,edge_attribute)
  nx.draw_networkx_edge_labels(H, pos, edge_labels=edge_labels)
  plt.show()

## Building similarity matrix (Diego Canez)

### Similarity Metrics

#### Edge Weights (EW)
$$
EW(x, y) = A_{x, y}
$$

In [16]:
def edge_weights(G: nx.Graph, attr='weight'):
    N = len(G)
    # EW = defaultdict(lambda: defaultdict(int))
    EW = np.zeros((N, N))
    for (x, y) in tqdm(G.edges()):
      EW[x, y] = G[x][y][attr]
    return EW

#### Weighted Common Neighbors (WCN)
$$
WCN(x, y) = \Sigma_{z \in N(x) \cap N(y)} A_{x, z} + A_{z, y}
$$

In [17]:
def weighted_common_neighbors(G: nx.Graph, attr='weight'):
  N = len(G)
  # WCN = defaultdict(lambda: defaultdict(int))
  WCN = np.zeros((N, N))
  for (x, y) in tqdm(G.edges()):
    for z in nx.common_neighbors(G, x, y):
      val = G[x][z][attr] + G[z][y][attr]
      WCN[x, y] += val
      WCN[y, x] += val
  return WCN

## Produce Recommendations (Maria Lovaton)

Source: https://sci-hub.se/https://doi.org/10.1007/978-3-319-61030-6_7

In [26]:
def recommend(user_handle, sim, user_problems_ix, problems_ix, problems, k=None):
  N = len(problems_ix)
  solved_problems = user_problems_ix[user_handle]
  pending_problems = list(set(range(N)) - set(problems_ix))
  P = [0.0] * N
  for pi in solved_problems:
    for pj in pending_problems:
      if P[pj] < sim[pi, pj]:
        P[pj] = sim[pi, pj]
  ans = list(enumerate(P))
  ans.sort(key=lambda index_score: index_score[1], reverse=True)
  ans = [(problems[ix], score) for ix, score in ans]
  if k is None:
    return ans
  else:
    k = min(len(ans), max(0, k))
    return ans[:k]

## Execution

In [19]:
users = ['dgcnz'] + get_users(k=20)
print(users) 

['dgcnz', 'whdywjd', 'RKdvt28', 'bruhascended', 'ognjentesic', 'ayush.goyal017', 'vaibhav_2003', 'skaty082', 'abdo123', 'uninterested_programmer', 'lalala07', 'mishanya_stulov', 'Accidentally_Coder', 'dcnidan', 'singhshivsm20001', 'ChongJH', 'MD.Abdullah', 'shkoosha', 'BRABUS_666', 'iit2019025', 'rafi1234']


In [20]:
users = ['dgcnz', 'Yinch', 'ginious', 'Gornak40', 'VLADOSIO', 'Elkhiat']

In [21]:
%%time
user_problems = prepare_dataset(users)
problems = list({problem for problem_list in user_problems.values() for problem in problem_list})
problems_ix = {problemId : index for index, problemId in enumerate(problems)}
user_problems_ix = {user : [problems_ix[problem] for problem in problems] for user, problems in user_problems.items()}

CPU times: user 236 ms, sys: 6.85 ms, total: 243 ms
Wall time: 13.8 s


In [22]:
len(problems)

1373

In [23]:
%%time
G = create_graph(user_problems_ix)
sim = weighted_common_neighbors(G)
# sim = edge_weights(G)

100%|████████| 592189/592189 [30:22<00:00, 324.90it/s]

CPU times: user 30min 14s, sys: 6.7 s, total: 30min 21s
Wall time: 30min 23s





In [27]:
%%time
recommend('dgcnz', sim, user_problems_ix, problems_ix, problems, k=5)

CPU times: user 200 ms, sys: 6.52 ms, total: 206 ms
Wall time: 205 ms


[('1610A', 2995.0),
 ('791A', 2995.0),
 ('1393A', 2993.0),
 ('1420C1', 2993.0),
 ('1509C', 2993.0)]