In [1]:
"""
Uses the historical list of Kaggle competitions from here:
https://github.com/EliotAndres/kaggle-past-solutions/blob/master/competitions.yaml 

And outputs a CSV file with same information.
"""
import yaml
import pandas as pd
import json


stream = open('competitions.yaml', 'r')
cs = json.dumps(yaml.load(stream))
competitions = pd.read_json(cs, orient='records')
del competitions['userHasEntered'] # Relative to the user getting the data, irrelevant.
competitions.to_csv('competitions.csv', index=False)


In [2]:
"""
Takes a file with Kaggle competition data and outputs a file with the private leaderboards.
It uses competitionId and the internal API used in Kaggle web site (not scraping or the official API).
"""

import pandas as pd
import requests as rq
import time
import json

url = 'https://kaggle.com/c/{}/leaderboard.json?includeBeforeUser=true&includeAfterUser=false&type=private'
competitions = pd.read_csv('competitions.csv')

t = []
for _, row in competitions.iterrows():
    print("Getting data for competition:" + str(row['competitionId']))
    r = rq.get(url.format(row['competitionId']))
    res = r.json()
    if isinstance(res, dict) and res.get("beforeUser", False):
        t.extend(map(lambda x: x.update({'competitionId':row['competitionId']}) or x, res["beforeUser"]))
    if isinstance(res, dict) and res.get("afterUser", False):
        t.extend(map(lambda x: x.update({'competitionId':row['competitionId']}) or x, res["afterUser"]))
    time.sleep(1) # just to be polite to Kaggle.

teams = pd.read_json(json.dumps(t), orient='records')

#----
# Getting a new column to identify teams as (teamName attribute is not unique for members)
# the set of users.
#-----
def getUniqueTeamMembers(teamMembers):
    return sorted(set([t['profileUrl'] for t in teamMembers]))

teams["realTeam"] = teams['teamMembers'].map(getUniqueTeamMembers)


teams.to_csv('teams.csv', index=False)


# ---- Alternative scraping code (finally it was not necessary).
# from requests_html import HTMLSession
#session = HTMLSession()
#r = session.get(url + row['competitionUrl']+ '/leaderboard') 
#r.html.render(sleep=3) # Load as a Web browser will do
#teams = r.html.xpath("//tr[contains(@class, 'competition-leaderboard__row')]")
#for t in teams:
#    print(t.xpath("//td[contains(@class, 'competition-leaderboard__td-team')]/text()"))
#session.close()
#time.sleep(5)


Getting data for competition:6004
Getting data for competition:4699
Getting data for competition:4571
Getting data for competition:4657
Getting data for competition:4494
Getting data for competition:4504
Getting data for competition:4704
Getting data for competition:4521
Getting data for competition:4594
Getting data for competition:4523
Getting data for competition:4487
Getting data for competition:4493
Getting data for competition:4488
Getting data for competition:4481
Getting data for competition:4467
Getting data for competition:4471
Getting data for competition:4453
Getting data for competition:4438
Getting data for competition:4104
Getting data for competition:4407
Getting data for competition:4366
Getting data for competition:4280
Getting data for competition:4272
Getting data for competition:4066
Getting data for competition:3978
Getting data for competition:4031
Getting data for competition:4120
Getting data for competition:3973
Getting data for competition:3984
Getting data f

In [3]:
"""
"""
import pandas as pd
import numpy as np
import networkx as nx
import ast


competitions = pd.read_csv('competitions.csv', parse_dates=['deadline'])
teams = pd.read_csv('teams.csv')

G = nx.Graph()
G.add_nodes_from(competitions['competitionId'])
for key, n in G.nodes.items():
   n["type"] = "competition"
   n["name"] = competitions[competitions.competitionId == key]["competitionTitle"].values[0]
   n["url"] = competitions[competitions.competitionId == key]["competitionUrl"].values[0]


#----
# Getting a new column to identify teams as (teamName attribute is not unique for members)
# the set of users.
#-----
def getUniqueTeamMembers(teamMembers):
    """Gets a Series of strings, evaluates them as Python objects,
    gets the profile URL and then return the sorted sequence.
    """
    return sorted([t["profileUrl"] for t in ast.literal_eval(teamMembers)])

teams["realTeam"] = teams['teamMembers'].map(getUniqueTeamMembers)
print("Number of different teams:"+ str(len(teams.realTeam.transform(tuple).unique())))


G.add_nodes_from(teams.realTeam.transform(tuple).unique(), type="team")


comp_teams = np.column_stack((teams['competitionId'].values, teams['realTeam'].transform(tuple).values))
print(comp_teams)
G.add_edges_from(comp_teams)
nx.write_graphml(G, 'kaggle.xml')




Number of different teams:77566
[[6004 ('/badrobot', '/grt123', '/liaofz')]
 [6004 ('/dhammack', '/juliandewit')]
 [6004 ('/gerbenvv', '/markjan', '/timsalimans')]
 ...
 [8076 ('/ayanmaity',)]
 [8076 ('/msafi04',)]
 [8076 ('/george96uoi',)]]


In [4]:
import matplotlib.pyplot as plt
nx.draw_networkx(G, with_labels=True)
plt.show()


  distance = np.sqrt((delta**2).sum(axis=0))


MemoryError: 

In [6]:
print('Número de nodos:')
print(len(list(G.nodes())))
print('Número de aristas:')
print(len(list(G.edges())))
print('Densidad:')
print(nx.density(G))


Número de nodos:
77715
Número de aristas:
148039
Densidad:
4.9023241493296234e-05


In [7]:
degree_list = []
for v in G:
    degree_list.append(G.degree(v))

print('Max degree:')
print(max(degree_list))
print('Min degree:')
print(min(degree_list))
print('Average degree:')
print(sum(degree_list)/len(degree_list))

Max degree:
7188
Min degree:
0
Average degree:
3.809792189410024


In [8]:
import networkx as nx

In [9]:
print(nx.is_connected(G))

False


In [10]:
print(nx.number_connected_components(G))

3


In [11]:
print(nx.is_directed(G))

False


In [13]:
sorted(G.degree, key=lambda x: x[1], reverse=True)

[(9120, 7188),
 (7082, 5162),
 (4986, 5119),
 (8076, 4547),
 (9717, 4469),
 (8540, 3951),
 (6565, 3829),
 (6649, 3777),
 (4280, 3511),
 (7380, 3339),
 (4594, 3303),
 (6277, 3300),
 (6392, 3269),
 (4852, 2922),
 (6644, 2623),
 (4699, 2616),
 (7559, 2382),
 (5261, 2266),
 (4272, 2258),
 (4471, 2234),
 (4487, 2225),
 (7277, 2153),
 (4853, 2126),
 (5874, 2071),
 (5056, 1973),
 (5260, 1966),
 (8586, 1871),
 (3887, 1788),
 (5558, 1784),
 (4657, 1759),
 (3338, 1688),
 (5340, 1684),
 (7391, 1673),
 (4120, 1604),
 (3788, 1568),
 (4031, 1528),
 (5048, 1440),
 (5357, 1372),
 (2496, 1353),
 (4407, 1327),
 (4467, 1323),
 (7634, 1315),
 (4366, 1307),
 (4704, 1127),
 (4481, 1076),
 (3978, 1050),
 (5497, 979),
 (3897, 951),
 (6322, 937),
 (8310, 935),
 (2551, 925),
 (5144, 924),
 (4504, 833),
 (8089, 743),
 (6927, 735),
 (2780, 699),
 (4488, 674),
 (4104, 662),
 (2748, 658),
 (6049, 655),
 (7878, 652),
 (3951, 634),
 (7115, 627),
 (3526, 626),
 (4862, 598),
 (8078, 580),
 (2564, 571),
 (3445, 554),
 (