Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions assets/zuordnung.json

Large diffs are not rendered by default.

1,136 changes: 1,136 additions & 0 deletions cora_sota.ipynb

Large diffs are not rendered by default.

202 changes: 109 additions & 93 deletions interstate94.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion kaggle_notebooks/cora_getml_vs_gnn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
],
"source": [
"# You might need to restart the kernel after the installs\n",
"%pip install -q \"getml==1.4.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
"%pip install -q \"getml==1.5.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
"\n",
"# Download and extract getML software\n",
"!wget -q https://static.getml.com/download/1.4.0/getml-1.4.0-x64-linux.tar.gz\n",
Expand Down
4 changes: 2 additions & 2 deletions kaggle_notebooks/epilepsy_recognition.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@
}
],
"source": [
"%pip install -q \"getml==1.4.0\" \"numpy<2.0.0\" \"matplotlib~=3.9\" \"seaborn~=0.13\""
"%pip install -q \"getml==1.5.0\" \"numpy<2.0.0\" \"matplotlib~=3.9\" \"seaborn~=0.13\""
]
},
{
Expand Down Expand Up @@ -4559,4 +4559,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
2 changes: 1 addition & 1 deletion kaggle_notebooks/getml-and-gnns-a-natural-symbiosis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
],
"source": [
"# You might need to restart the kernel after the installs\n",
"%pip install -q \"getml==1.4.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
"%pip install -q \"getml==1.5.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
"\n",
"# Download and extract getML software\n",
"!wget -q https://static.getml.com/download/1.4.0/getml-1.4.0-x64-linux.tar.gz\n",
Expand Down
14,046 changes: 7,023 additions & 7,023 deletions loans.ipynb

Large diffs are not rendered by default.

11,419 changes: 5,705 additions & 5,714 deletions movie_lens.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
jupyterlab==4.1.1
getml==1.4.0
getml==1.5.0
featuretools==1.31.0
tsfresh==0.20.3
pyspark==3.5.0
seaborn==0.13.2
ipywidgets==8.1.2
plotly==5.18.0
prophet==1.1.5
matplotlib==3.8.2
matplotlib==3.8.2
56 changes: 56 additions & 0 deletions utils/zuordnung.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

import json
import numpy as np
from torch_geometric.datasets import Planetoid


def run_zuordnung(getml_word_data):
    """
    Match every getML paper to its row in the Planetoid "Cora" dataset.

    The matching is based on the word matrix of the abstracts' content. That
    data is stored differently in the data of Izadi et al's GNN paper
    (hereinafter referred to as GNN data) and in getML's data source. In the
    GNN data, words are stored one-hot-encoded in a matrix (e.g.
    [0,0,1,0,1]), while the getML data source simply lists the words together
    with their index in the one-hot-encoded word matrix (e.g. [word2, word4]).

    Due to different offsets, the word indices of both sources do not align.
    Therefore the differences between adjacent (sorted) word indices are
    computed for each paper and compared across sources. If the difference
    patterns are equal, the two observations are considered a match and their
    indices are stored as a pair.

    According to the original author, there is a perfect match between both
    sources: every observation in one source finds its counterpart in the
    other source.

    Args:
        getml_word_data: Object exposing ``to_pandas()`` that yields a frame
            with the columns ``paper_id`` and ``word_cited_id``. The first
            four characters of every ``word_cited_id`` are stripped and the
            remainder is parsed as an integer (presumably values look like
            ``"word123"`` -- confirm against the data source).

    Returns:
        A list of ``(getml_paper_id, gnn_row_index)`` tuples, in order of first
        appearance of ``paper_id``. The same list is written to
        ``assets/zuordnung.json`` and printed. Papers without a matching
        pattern are omitted.
    """
    getml_word_data = getml_word_data.to_pandas()

    gnn_word_data = Planetoid(name="Cora", root="")

    # The GNN patterns do not depend on the getML paper being matched, so
    # they are computed exactly once instead of once per getML paper.
    # ``setdefault`` keeps the lowest row index for a pattern, which is the
    # row the original first-match-then-break scan would have selected.
    first_gnn_idx_by_pattern = {}
    for gnn_idx, gnn_row in enumerate(gnn_word_data[0].x.tolist()):
        gnn_positions = [
            position for position, value in enumerate(gnn_row) if int(value) == 1
        ]
        gnn_words_pattern = tuple(
            j - i for i, j in zip(gnn_positions[:-1], gnn_positions[1:])
        )
        first_gnn_idx_by_pattern.setdefault(gnn_words_pattern, gnn_idx)

    zuordnung = []
    # ``sort=False`` keeps the papers in order of first appearance, i.e. the
    # same order ``unique()`` would produce.
    words_by_paper = getml_word_data.groupby("paper_id", sort=False)[
        "word_cited_id"
    ]
    for getml_idx, word_ids in words_by_paper:
        # Drop the four-character prefix to obtain the numeric word index.
        getml_positions = sorted(int(ele[4:]) for ele in word_ids.values)
        getml_words_pattern = tuple(
            j - i for i, j in zip(getml_positions[:-1], getml_positions[1:])
        )

        gnn_idx = first_gnn_idx_by_pattern.get(getml_words_pattern)
        if gnn_idx is not None:
            zuordnung.append((int(getml_idx), gnn_idx))

    with open('assets/zuordnung.json', 'w') as file:
        print("Writing to file")
        json.dump(zuordnung, file)
    print('Done')

    print(zuordnung)
    return zuordnung