In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import sklearn
import sys
import catboost



In [None]:
# Prerequisite: Bridges.java, PathsCount.java and ShortestPaths.java must be compiled and run first
# (presumably they produce bridges.csv and the per-vertex shortest_paths_*/paths_count_* files
# that this notebook reads - TODO confirm).
vertices = pd.read_csv("data/vertices.csv")

# The two code columns are forced to strings; later cells pass them to CatBoost as categorical features.
vertices = vertices.astype({"main_okved": str, "company_type": str})

# Number of vertices; later cells size their per-vertex arrays with it.
N = len(vertices)

edges = pd.read_csv("data/edges.csv")
ids = pd.read_csv("data/ids.csv")
bridges = pd.read_csv("data/bridges.csv")

In [None]:
# degree[v] counts how many edge endpoints equal vertex id v. The array has N + 1 slots so it
# can be indexed directly by id (assumes ids are integers in 0..N - TODO confirm they are 1..N).
degree = np.zeros(N + 1)

# np.add.at accumulates correctly when the same id occurs many times in the index array.
np.add.at(degree, edges.id_1.to_numpy(), 1)
np.add.at(degree, edges.id_2.to_numpy(), 1)

vertices['degree'] = degree[vertices.id.to_numpy()]

In [None]:
# Label every vertex with the number of its connected component (BFS over the undirected graph).
# Components are numbered from 1 in order of their smallest vertex id, so component 1 is the
# one that contains vertex id 1.

# Adjacency sets built from the edge list. The BFS below reads `neighborSet`, but no cell of the
# notebook defined it, so the cell failed with NameError after Restart & Run All.
neighborSet = defaultdict(set)
for endpointA, endpointB in zip(edges.id_1, edges.id_2):
    neighborSet[endpointA].add(endpointB)
    neighborSet[endpointB].add(endpointA)

# connectComp[k] holds the component of vertex id k + 1 (ids are treated as 1-based).
connectComp = np.zeros(N)
visited = set()
curComp = 1
for i in range(1, N + 1):
    if (i % 10000 == 0):
        print(i)  # progress indicator
    if i in visited:
        continue

    # Breadth-first search from i; `q` is a list used as a queue with `head` as the read cursor.
    q = [i]
    visited.add(i)
    head = 0

    while head < len(q):
        u = q[head]
        head += 1
        connectComp[u - 1] = curComp
        # .get() keeps isolated vertices from being inserted into the defaultdict.
        for v in neighborSet.get(u, ()):
            if v not in visited:
                visited.add(v)
                q.append(v)
    curComp += 1

# Positional assignment: row k of `vertices` receives the component of id k + 1
# (assumes `vertices` is ordered by id 1..N - TODO confirm).
vertices['connectedComp'] = connectComp

In [None]:
def addValue(vector, position, value):
    """Add ``value`` to the entry of ``vector`` addressed by the 1-based ``position``.

    ``vector`` is modified in place; nothing is returned.
    """
    slot = position - 1
    vector[slot] = vector[slot] + value

In [None]:
# size_value[k]: sum of `value` over every edge endpoint equal to vertex id k + 1.
# All id_1 endpoints are accumulated first, then all id_2 endpoints.
size_value = np.zeros(N)
for endpointColumn in ("id_1", "id_2"):
    for vertexId, edgeValue in zip(edges[endpointColumn], edges["value"]):
        addValue(size_value, int(vertexId), edgeValue)
vertices['size_value'] = size_value

In [None]:
# size_n_transactions[k]: sum of `n_transactions` over every edge endpoint equal to vertex id k + 1.
# All id_1 endpoints are accumulated first, then all id_2 endpoints.
size_n_transactions = np.zeros(N)
for endpointColumn in ("id_1", "id_2"):
    for vertexId, edgeCount in zip(edges[endpointColumn], edges["n_transactions"]):
        addValue(size_n_transactions, int(vertexId), edgeCount)
vertices['size_n_transactions'] = size_n_transactions

In [None]:
def checkValidity(filename):
    """Print sanity statistics for a submission CSV with ``id_1`` / ``id_2`` columns.

    Prints, in order: the rows that are self-loops (``id_1 == id_2``), the total number of
    rows, and the number of distinct undirected pairs. If the last two numbers differ, the
    file contains the same pair more than once (possibly with the endpoints swapped).
    """
    submission = pd.read_csv(filename)

    selfLoops = submission[submission.id_1 == submission.id_2]
    print(selfLoops)
    print(len(submission.index))

    def _pairKey(row):
        # Orientation-independent key: the smaller of the two "a * 10^7 + b" encodings.
        forward = 10000000 * row.id_1 + row.id_2
        backward = 10000000 * row.id_2 + row.id_1
        return min(forward, backward)

    submission['ttt'] = submission.apply(_pairKey, axis = 1)
    print(len(submission.ttt.unique()))
    
def saveSubmission(res, filename):
    """Write a two-column edge list to ``filename`` as a submission CSV.

    Parameters:
        res: DataFrame with exactly two columns. Its columns are renamed in place to
            ``id_1`` / ``id_2``, so the caller's frame is modified.
        filename: path of the CSV file to write (no index column is written).

    Prints the frame's ``info()`` summary after writing.
    """
    res.columns = ['id_1', 'id_2']
    res.to_csv(filename, index = False)
    # DataFrame.info() prints its report itself and returns None; wrapping it in print()
    # added a stray "None" line to the output.
    res.info()

In [None]:
# Marker in the notebook output: reaching this cell means the initialization cells above have run.
print("Init complete")

In [None]:
############################################################################################################
# End of initialization
############################################################################################################

In [None]:
#20200527 - 2nd attempt
#catboost as classifier
#PU Learning by bagging
#selecting top 100k globally by probability
#500 classifiers for each vertex
#score: 8083 on public

import catboost as cat
from catboost import CatBoostClassifier, Pool

def setValue(vector, position, value):
    """Overwrite the entry of ``vector`` addressed by the 1-based ``position`` with ``value``.

    ``vector`` is modified in place; nothing is returned.
    """
    slot = position - 1
    vector[slot] = value

def setOne(y, x):
    """Mark the 1-based position ``x`` of ``y`` with a 1 (delegates to ``setValue``)."""
    marker = 1
    return setValue(y, x, marker)
    

# Attempt 2 - PU learning by bagging.
# For each query vertex: its existing neighbours are the positives, every other vertex is
# unlabeled. NUM_MODELS CatBoost classifiers are trained, each on all positives plus an equally
# sized sample (with replacement) of the unlabeled vertices. The models are summed and the summed
# model scores every vertex. The best-scoring non-neighbours of every query vertex are collected
# in `allSim` for the global selection step that follows.

np.random.seed(239)  # seeds NumPy's global RNG

NUM_ITERATIONS = 100    # boosting iterations per CatBoost model
NUM_MODELS = 500        # bagged models per query vertex
toProcess = 100         # number of ids (from the start of ids.csv) to process
TOP_PER_VERTEX = 200000 # candidates kept per query vertex

#not sure why I kept graphFeature for the final solution - it is typically least important
#should have removed it...
featuresList = ['main_okved', 'region_code', 'company_type', 'graphFeature', \
                'adjDegree', \
                 'adjValue', 'adjNTransactions', 'shortestPath', 'pathsCount']
# Positions within featuresList of the categorical features.
cat_features = [0, 1, 2, 3]

# Per-vertex candidate frames are collected here and concatenated once after the loop.
# (DataFrame.append, used previously, was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.)
topCandidates = []

for cntProcessed, curId in enumerate(ids.id[0:toProcess], start = 1):
    print(curId, cntProcessed, 'of', toProcess)

    # Edges incident to curId; 'id' is the endpoint on the other side. np.where also works
    # when the vertex has no edges, where a row-wise apply on the empty frame does not
    # produce a usable column.
    curEdges = edges[(edges.id_1 == curId) | (edges.id_2 == curId)].copy()
    curEdges['id'] = np.where(curEdges.id_1 != curId, curEdges.id_1, curEdges.id_2).astype(int)
    curEdges = curEdges.merge(vertices, on = 'id')

    # NOTE(review): for bridge rows where curId is in id_2, the rename below makes 'id' equal to
    # curId itself and the id_1 endpoint is ignored. That is only harmless if bridges.csv lists
    # every bridge in both directions - confirm against Bridges.java.
    curBridges = bridges[(bridges.id_1 == curId) | (bridges.id_2 == curId)].copy()
    curBridges = curBridges.rename(columns = {'id_2' : 'id'})
    curBridges = curBridges.merge(vertices, on = 'id')

    curSim = vertices.copy()
    # 0 for vertices in component number 1, 1 for all other components.
    curSim['nonFirstComponent'] = (curSim.connectedComp != 1.0).astype(int)

    # Per-vertex arrays are addressed by (id - 1).
    isBridge = np.zeros(N)
    isBridge[curBridges.id.to_numpy(dtype = int) - 1] = 1
    curSim['bridge'] = isBridge

    # value / n_transactions of the edge between curId and each neighbour. An explicit loop is
    # used so that, when several rows point at the same neighbour, the last row wins.
    adjValue = np.zeros(N)
    adjNTransactions = np.zeros(N)
    for neighborId, edgeValue, edgeCount in zip(curEdges['id'], curEdges['value'], curEdges['n_transactions']):
        adjValue[neighborId - 1] = edgeValue
        adjNTransactions[neighborId - 1] = edgeCount

    curSim['graphFeature'] = (curSim.bridge + curSim.nonFirstComponent).astype(int)

    # target = 1 for vertices that already share an edge with curId.
    target = np.zeros(N, dtype = int)
    target[curEdges.id.to_numpy(dtype = int) - 1] = 1
    curSim['target'] = target
    curSim['degree'] = curSim.degree.astype(int)

    # Vertex totals with the contribution of the edge to curId removed, so the features do
    # not reveal the label.
    curSim['adjDegree'] = curSim.degree - curSim.target
    curSim['adjValue'] = curSim.size_value - adjValue
    curSim['adjNTransactions'] = curSim.size_n_transactions - adjNTransactions

    # Pre-computed per-vertex files; their rows are matched to curSim by index.
    shortest_paths = pd.read_csv("data/shortest_paths_{curId}.csv".format(curId = curId))
    curSim['shortestPath'] = shortest_paths.shortest_path

    paths_count = pd.read_csv("data/paths_count_{curId}.csv".format(curId = curId))
    curSim['pathsCount'] = paths_count.paths_count.apply(lambda x : np.log(x) if x > 0 else -1)

    pos = curSim[curSim.target == 1]
    neg = curSim[(curSim.target == 0) & (curSim.id != curId)]

    models = []
    for i in range(0, NUM_MODELS):
        if i % 10 == 0:
            print("    ", i)
        # Balanced training set: all positives plus as many sampled unlabeled vertices.
        curNeg = neg.sample(n = len(pos.index), replace = True)
        curData = pd.concat([pos, curNeg])

        X = curData[featuresList]
        y = curData['target']

        model = CatBoostClassifier(iterations = NUM_ITERATIONS, verbose = False)
        model.fit(X, y, cat_features)
        models.append(model)

    model = cat.sum_models(models)
    print("Summary model:", model.get_feature_importance())

    # Score every vertex, then keep the best candidates that are neither existing neighbours
    # nor curId itself.
    curSim["Similarity"] = model.predict(curSim[featuresList])
    curSim = curSim.sort_values(by = 'Similarity', ascending = False)
    curSim = curSim[(curSim.target != 1) & (curSim.id != curId)]
    curSim = curSim.assign(id_2 = curId)

    topCandidates.append(curSim[0 : TOP_PER_VERTEX])

allSim = pd.concat(topCandidates) if topCandidates else None
    
# Global selection: walk all candidates from highest to lowest score and keep the first 100000
# distinct undirected pairs (curId, v).
allSim = allSim.sort_values(by = 'Similarity', ascending = False)

res = []
resEdges = set()

cnt = 0
# Only the two id columns are needed, so they are converted once and zipped instead of
# materialising every row with iterrows().
for v, curId in zip(allSim.id.astype(int), allSim.id_2.astype(int)):
    if (cnt == 100000):
        break
    if (v == curId):
        continue
    # Orientation-independent key for the pair.
    hashCode = min(10000000 * curId + v, 10000000 * v + curId)
    # Existing neighbours were already removed per vertex (target != 1) when allSim was built,
    # so the former test against `alreadyExist` - a name no cell defines - is dropped.
    if hashCode not in resEdges:
        res.append([curId, v])
        cnt += 1
        resEdges.add(hashCode)

print()

# Explicit columns keep res[0] valid even when no pair was selected.
res = pd.DataFrame(res, columns = [0, 1])
# Number of selected pairs per query vertex.
print(res[0].value_counts())

In [None]:
#score: 8083 on public
# Write the selected pairs held in `res` to disk, then re-read the file and print sanity statistics.
name = 'submission_final_1.csv'
saveSubmission(res, name)
checkValidity(name)

In [None]:
#20200527 - 3rd attempt
#catboost
#baseline with additional features
#selecting top 100k globally by probability
#score: 8118 on public


import catboost as cat
from catboost import CatBoostClassifier, Pool

def setValue(vector, position, value):
    """Store ``value`` in ``vector`` at the 1-based ``position`` (in place, returns None)."""
    vector.__setitem__(position - 1, value)
    return None

def setOne(y, x):
    """Set the entry of ``y`` at the 1-based position ``x`` to 1 via ``setValue``."""
    one = 1
    return setValue(y, x, one)
    
# Attempt 3 - single CatBoost baseline with additional features.
# For each query vertex one classifier is trained on all vertices (existing neighbours are the
# positives, everything else except the vertex itself is negative) and is then used to score
# every vertex. The best-scoring non-neighbours are collected in `allSim`.

np.random.seed(239)  # seeds NumPy's global RNG

NUM_ITERATIONS = 100    # boosting iterations of the CatBoost model
NUM_MODELS = 1          # one model per query vertex in this attempt
toProcess = 100         # number of ids (from the start of ids.csv) to process
TOP_PER_VERTEX = 200000 # candidates kept per query vertex

featuresList = ['main_okved', 'region_code', 'company_type', \
                'adjDegree', \
                 'adjValue', 'adjNTransactions', 'shortestPath', 'pathsCount']
# Positions within featuresList of the categorical features.
cat_features = [0, 1, 2]

# Per-vertex candidate frames are collected here and concatenated once after the loop.
# (DataFrame.append, used previously, was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.)
topCandidates = []

for cntProcessed, curId in enumerate(ids.id[0:toProcess], start = 1):
    print(curId, cntProcessed, 'of', toProcess)

    # Edges incident to curId; 'id' is the endpoint on the other side. np.where also works
    # when the vertex has no edges, where a row-wise apply on the empty frame does not
    # produce a usable column.
    curEdges = edges[(edges.id_1 == curId) | (edges.id_2 == curId)].copy()
    curEdges['id'] = np.where(curEdges.id_1 != curId, curEdges.id_1, curEdges.id_2).astype(int)
    curEdges = curEdges.merge(vertices, on = 'id')

    curSim = vertices.copy()
    # 0 for vertices in component number 1, 1 otherwise (not part of featuresList in this attempt).
    curSim['nonFirstComponent'] = (curSim.connectedComp != 1.0).astype(int)

    # value / n_transactions of the edge between curId and each neighbour, addressed by (id - 1).
    # An explicit loop is used so that, when several rows point at the same neighbour, the
    # last row wins.
    adjValue = np.zeros(N)
    adjNTransactions = np.zeros(N)
    for neighborId, edgeValue, edgeCount in zip(curEdges['id'], curEdges['value'], curEdges['n_transactions']):
        adjValue[neighborId - 1] = edgeValue
        adjNTransactions[neighborId - 1] = edgeCount

    # target = 1 for vertices that already share an edge with curId.
    target = np.zeros(N, dtype = int)
    target[curEdges.id.to_numpy(dtype = int) - 1] = 1
    curSim['target'] = target
    curSim['degree'] = curSim.degree.astype(int)

    # Vertex totals with the contribution of the edge to curId removed, so the features do
    # not reveal the label.
    curSim['adjDegree'] = curSim.degree - curSim.target
    curSim['adjValue'] = curSim.size_value - adjValue
    curSim['adjNTransactions'] = curSim.size_n_transactions - adjNTransactions

    # Pre-computed per-vertex files; their rows are matched to curSim by index.
    shortest_paths = pd.read_csv("data/shortest_paths_{curId}.csv".format(curId = curId))
    curSim['shortestPath'] = shortest_paths.shortest_path

    paths_count = pd.read_csv("data/paths_count_{curId}.csv".format(curId = curId))
    curSim['pathsCount'] = paths_count.paths_count.apply(lambda x : np.log(x + 1))

    pos = curSim[curSim.target == 1]
    neg = curSim[(curSim.target == 0) & (curSim.id != curId)]

    # No sub-sampling here: the model sees every positive and every negative.
    curData = pd.concat([pos, neg])

    X = curData[featuresList]
    y = curData['target']

    model = CatBoostClassifier(iterations = NUM_ITERATIONS, verbose = False)
    model.fit(X, y, cat_features)

    # The cell used to call cat.sum_models(models), but `models` is never assigned in this cell:
    # it was whatever list an earlier cell left behind, so the model trained just above was
    # discarded (or the call failed with NameError on a fresh kernel). Summing a one-element
    # list keeps the same kind of model object the rest of the cell expects
    # (presumably one whose predict() returns raw scores suitable for ranking - TODO confirm).
    model = cat.sum_models([model])
    print("Features importance:", model.get_feature_importance())

    # Score every vertex, then keep the best candidates that are neither existing neighbours
    # nor curId itself.
    curSim["Similarity"] = model.predict(curSim[featuresList])
    curSim = curSim.sort_values(by = 'Similarity', ascending = False)
    curSim = curSim[(curSim.target != 1) & (curSim.id != curId)]
    curSim = curSim.assign(id_2 = curId)

    topCandidates.append(curSim[0 : TOP_PER_VERTEX])

allSim = pd.concat(topCandidates) if topCandidates else None
    
# Global selection: walk all candidates from highest to lowest score and keep the first 100000
# distinct undirected pairs (curId, v).
allSim = allSim.sort_values(by = 'Similarity', ascending = False)

res = []
resEdges = set()

cnt = 0
# Only the two id columns are needed, so they are converted once and zipped instead of
# materialising every row with iterrows().
for v, curId in zip(allSim.id.astype(int), allSim.id_2.astype(int)):
    if (cnt == 100000):
        break
    if (v == curId):
        continue
    # Orientation-independent key for the pair.
    hashCode = min(10000000 * curId + v, 10000000 * v + curId)
    # Existing neighbours were already removed per vertex (target != 1) when allSim was built,
    # so the former test against `alreadyExist` - a name no cell defines - is dropped.
    if hashCode not in resEdges:
        res.append([curId, v])
        cnt += 1
        resEdges.add(hashCode)

print()

# Explicit columns keep res[0] valid even when no pair was selected.
res = pd.DataFrame(res, columns = [0, 1])
# Number of selected pairs per query vertex.
print(res[0].value_counts())


In [None]:
#score: 8118 on public
# Write the selected pairs held in `res` to disk, then re-read the file and print sanity statistics.
name = 'submission_final_2.csv'
saveSubmission(res, name)
checkValidity(name)

In [None]:
#Since both solutions have similar predictive power but, as it appears, produce significantly different predictions,
#we can merge them to get a better solution

def mergeSolutions(f1, f2, limit = 100000):
    """Interleave two submission files into one de-duplicated edge list.

    Rows are taken alternately (row 0 of ``f1``, row 0 of ``f2``, row 1 of ``f1``, ...).
    Self-loops are skipped, and so is any pair already taken, regardless of endpoint order.
    Merging stops as soon as ``limit`` pairs have been collected. Only the first ``limit``
    rows of each file are consulted.

    Parameters:
        f1, f2: paths of CSV files with ``id_1`` and ``id_2`` columns.
        limit: maximum number of pairs to return (default 100000).

    Returns:
        DataFrame with columns ``0`` and ``1``. Column 0 holds the ``id_2`` value and column 1
        the ``id_1`` value of each accepted input row. The two columns are present even when
        no pair was accepted, so the result can always be passed on to be renamed and saved.
    """
    s1 = pd.read_csv(f1)
    s2 = pd.read_csv(f2)

    def _endpointPairs(frame):
        # Pull (id_1, id_2) out as plain ints once, instead of slicing the frame row by row.
        head = frame.iloc[:limit]
        return list(zip(head.id_1.astype(int).tolist(), head.id_2.astype(int).tolist()))

    rankedLists = (_endpointPairs(s1), _endpointPairs(s2))

    res = []
    resEdges = set()

    for i in range(max(len(pairs) for pairs in rankedLists)):
        for pairs in rankedLists:
            if len(res) >= limit:
                break
            if i >= len(pairs):
                continue  # this file has fewer rows than the other one
            v, curId = pairs[i]
            if (v == curId):
                continue
            # Orientation-independent key for the pair.
            hashCode = min(10000000 * curId + v, 10000000 * v + curId)
            if (hashCode not in resEdges):
                res.append([curId, v])
                resEdges.add(hashCode)
        if len(res) >= limit:
            break

    return pd.DataFrame(res, columns = [0, 1])


In [None]:
#score: 8767 on public, 8757 on private
# Interleave the two saved submissions, write the merged list to disk, then re-read the file
# and print sanity statistics.
res = mergeSolutions("submission_final_1.csv", "submission_final_2.csv")
name = 'submission_final_mix.csv'
saveSubmission(res, name)
checkValidity(name)