In [5]:
import numpy as np
import pandas as pd
import networkx as nx

In [6]:
# Load only the question-id columns; every row is one (qid1, qid2) question pair.
df_train = pd.read_csv("data/train.csv", usecols=["qid1", "qid2"])
df_test = pd.read_csv("data/test_with_ids.csv", usecols=["qid1", "qid2"])
df_all = pd.concat([df_train, df_test])
print("df_all.shape:", df_all.shape) # df_all.shape: (2750086, 2)
df = df_all

# Undirected question graph: one node per question id, one edge per pair.
# The qid1 nodes are inserted first so that node order (and therefore the row
# order of the output table) starts with the qid1 values in file order.
g = nx.Graph()
g.add_nodes_from(df.qid1)
edges = list(df[['qid1', 'qid2']].itertuples(index=False, name=None))
g.add_edges_from(edges)

# Self-loops must go before the core computation (NetworkX's core routines
# reject graphs that contain them). The module-level nx.selfloop_edges() is
# used because the Graph.selfloop_edges() method no longer exists in current
# NetworkX, and the result is materialised into a list so the graph is not
# mutated while its own edge generator is being consumed.
g.remove_edges_from(list(nx.selfloop_edges(g)))
print(len(set(df.qid1)), g.number_of_nodes()) # 4789604
print(len(df), g.number_of_edges()) # 2743365 (after self-edges)

df_output = pd.DataFrame(data=list(g.nodes()), columns=["qid"])
print("df_output.shape:", df_output.shape)

NB_CORES = 20

# A node belongs to the k-core exactly when its core number is >= k, so the
# core numbers are computed once here instead of rebuilding nx.k_core(g, k)
# from scratch for every k in the loop.
core_of_qid = df_output["qid"].map(nx.core_number(g))

for k in range(2, NB_CORES + 1):
    fieldname = "kcore{}".format(k)
    print("fieldname = ", fieldname)
    in_core = core_of_qid >= k
    print("len(ck) = ", int(in_core.sum()))
    # Column holds k for members of the k-core and 0 for everyone else.
    df_output[fieldname] = 0
    df_output.loc[in_core, fieldname] = k

df_output.to_csv("question_kcores.csv", index=False)


df_all.shape: (2750086, 2)
2454493 4789604
2750086 2743365
df_output.shape: (4789604, 1)
fieldname =  kcore2
len(ck) =  104340
fieldname =  kcore3
len(ck) =  37019
fieldname =  kcore4
len(ck) =  20580
fieldname =  kcore5
len(ck) =  13552
fieldname =  kcore6
len(ck) =  9949
fieldname =  kcore7
len(ck) =  7674
fieldname =  kcore8
len(ck) =  6156
fieldname =  kcore9
len(ck) =  5124
fieldname =  kcore10
len(ck) =  4190
fieldname =  kcore11
len(ck) =  3606
fieldname =  kcore12
len(ck) =  2979
fieldname =  kcore13
len(ck) =  2529
fieldname =  kcore14
len(ck) =  2167
fieldname =  kcore15
len(ck) =  1807
fieldname =  kcore16
len(ck) =  1633
fieldname =  kcore17
len(ck) =  1523
fieldname =  kcore18
len(ck) =  1270
fieldname =  kcore19
len(ck) =  1125
fieldname =  kcore20
len(ck) =  1024


In [7]:
# Reload the k-core membership table: one row per question id, one kcore{k}
# column per k.
df_cores = pd.read_csv("question_kcores.csv", index_col="qid")

df_cores.index.names = ["qid"]

# Each kcore{k} column stores either k or 0, so the row maximum is the largest
# k whose core contains the question (0 if it is in none). DataFrame.max is
# vectorised; the previous row-wise apply(max) called Python once per row.
df_cores['max_kcore'] = df_cores.max(axis=1)

df_cores[['max_kcore']].to_csv("question_max_kcores.csv") # with index

In [8]:
# Preview only the first rows; the full table has one row per question id.
df_cores.head(10)

Unnamed: 0_level_0,kcore2,kcore3,kcore4,kcore5,kcore6,kcore7,kcore8,kcore9,kcore10,kcore11,kcore12,kcore13,kcore14,kcore15,kcore16,kcore17,kcore18,kcore19,kcore20,max_kcore
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Lookup table {qid: max_kcore}, loaded from the CSV that is indexed by qid.
cores_dict = pd.read_csv("question_max_kcores.csv", index_col="qid")["max_kcore"].to_dict()
def gen_qid1_max_kcore(row):
    """Return the max k-core value stored for the question id in ``row["qid1"]``.

    The id is looked up in the module-level ``cores_dict``; an id that is not
    a key of that mapping raises ``KeyError``.
    """
    first_qid = row["qid1"]
    return cores_dict[first_qid]
def gen_qid2_max_kcore(row):
    """Return the max k-core value stored for the question id in ``row["qid2"]``.

    The id is looked up in the module-level ``cores_dict``; an id that is not
    a key of that mapping raises ``KeyError``.
    """
    second_qid = row["qid2"]
    return cores_dict[second_qid]

#def gen_max_kcore(row):
#    return max(row["qid1_max_kcore"], row["qid2_max_kcore"])

# Attach each question's max k-core to both pair tables. Series.map does the
# dictionary lookup column-wise, which avoids building a row object and making
# a Python call for every row as DataFrame.apply(axis=1) does.
for pairs in (df_train, df_test):
    for qid_col in ("qid1", "qid2"):
        feature = pairs[qid_col].map(cores_dict)
        # map() yields NaN for ids absent from cores_dict; fail loudly with a
        # KeyError, as a direct dictionary lookup would, instead of silently
        # writing NaNs into the feature column.
        unknown = feature.isna()
        if unknown.any():
            raise KeyError(
                "{} contains ids missing from cores_dict, e.g. {!r}".format(
                    qid_col, pairs.loc[unknown, qid_col].iloc[0]
                )
            )
        pairs[qid_col + "_max_kcore"] = feature.astype("int64")