getml · alxn4 · Dec 4, 2024 · Sep 30, 2024 · Oct 14, 2024 · Nov 19, 2024
diff --git a/assets/zuordnung.json b/assets/zuordnung.json
diff --git a/cora_sota.ipynb b/cora_sota.ipynb
diff --git a/interstate94.ipynb b/interstate94.ipynb
diff --git a/kaggle_notebooks/cora_getml_vs_gnn.ipynb b/kaggle_notebooks/cora_getml_vs_gnn.ipynb
@@ -81,7 +81,7 @@
    ],
    "source": [
     "# You might need to restart the kernel after the installs\n",
-    "%pip install -q \"getml==1.4.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
+    "%pip install -q \"getml==1.5.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
     "\n",
     "# Download and extract getML software\n",
     "!wget -q https://static.getml.com/download/1.4.0/getml-1.4.0-x64-linux.tar.gz\n",

diff --git a/kaggle_notebooks/epilepsy_recognition.ipynb b/kaggle_notebooks/epilepsy_recognition.ipynb
@@ -199,7 +199,7 @@
                 }
             ],
             "source": [
-                "%pip install -q \"getml==1.4.0\" \"numpy<2.0.0\" \"matplotlib~=3.9\" \"seaborn~=0.13\""
+                "%pip install -q \"getml==1.5.0\" \"numpy<2.0.0\" \"matplotlib~=3.9\" \"seaborn~=0.13\""
             ]
         },
         {
@@ -4559,4 +4559,4 @@
     },
     "nbformat": 4,
     "nbformat_minor": 4
-}
+}
diff --git a/kaggle_notebooks/getml-and-gnns-a-natural-symbiosis.ipynb b/kaggle_notebooks/getml-and-gnns-a-natural-symbiosis.ipynb
@@ -110,7 +110,7 @@
    ],
    "source": [
     "# You might need to restart the kernel after the installs\n",
-    "%pip install -q \"getml==1.4.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
+    "%pip install -q \"getml==1.5.0\" \"torch-geometric~=2.5\" \"pandas~=2.2\" \"matplotlib~=3.9\" \"seaborn~=0.13\" \"numpy~=1.26\" \"torch~=2.4\"\n",
     "\n",
     "# Download and extract getML software\n",
     "!wget -q https://static.getml.com/download/1.4.0/getml-1.4.0-x64-linux.tar.gz\n",

diff --git a/loans.ipynb b/loans.ipynb
diff --git a/movie_lens.ipynb b/movie_lens.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,10 @@
 jupyterlab==4.1.1
-getml==1.4.0
+getml==1.5.0
 featuretools==1.31.0
 tsfresh==0.20.3
 pyspark==3.5.0
 seaborn==0.13.2
 ipywidgets==8.1.2
 plotly==5.18.0
 prophet==1.1.5
-matplotlib==3.8.2
+matplotlib==3.8.2
diff --git a/utils/zuordnung.py b/utils/zuordnung.py
@@ -0,0 +1,56 @@
+
+import json
+import numpy as np
+from torch_geometric.datasets import Planetoid
+
+
+def run_zuordnung(getml_word_data):
+    """
+    The matching process is based on the word matrix of the abstracts' content. That data is stored differently in the data of Izadi et al's GNN paper (hereinafter referred to as GNN paper or GNN data) and getML's data source. In the GNN's case, words are stored one-hot-encoded in a matrix (e.g. [0,0,1,0,1]), while getML data source simply lists the words and their associated index in the on-hot-encoded word matrix (e.g.: [word2, word4]). The following routine first retrieves the index of the GNN matrix. Due to different offsets, the word indices between both data source do not align. Therefore, we compute the difference between adjacent word indices and compare them across sources. If the patterns match, we have found a match between both sources and save their associated dataframe indices.
+
+    It turns out there is a perfect match between both sources and every observation in one source finds its counterpart in the other source.
+    """
+
+    getml_word_data = getml_word_data.to_pandas()
+
+    gnn_word_data = Planetoid(name="Cora", root="")
+
+    zuordnung = []
+    for getml_idx in getml_word_data["paper_id"].unique():
+        getml_positions = [
+            int(ele[4:])
+            for ele in getml_word_data[getml_word_data["paper_id"] == getml_idx][
+                "word_cited_id"
+            ].values
+        ]
+        getml_positions = np.sort(getml_positions)
+
+        getml_words_pattern = [
+            j - i for i, j in zip(getml_positions[:-1], getml_positions[1:])
+        ]
+
+        for gnn_idx in range(len(gnn_word_data[0].x)):
+            gnn_positions = [
+                i
+                for i, x in enumerate(
+                    [int(x) for x in list(gnn_word_data[0].x[gnn_idx])]
+                )
+                if x == 1
+            ]
+            gnn_words_pattern = [
+                j - i for i, j in zip(gnn_positions[:-1], gnn_positions[1:])
+            ]
+
+            if gnn_words_pattern == getml_words_pattern:
+                match = (int(getml_idx), gnn_idx)
+                zuordnung.append(match)
+                break
+
+
+    with open('assets/zuordnung.json', 'w') as file:
+        print("Writing to file")
+        json.dump(zuordnung, file)
+        print('Done')
+
+    print(zuordnung)
+