In [1]:
import pandas as pd
import numpy as np
from neo4j import GraphDatabase

In [2]:
# Reading back the saved file
csv_file_path = "data/madelon_full.csv"
data = pd.read_csv(csv_file_path)

# Displaying the first few rows of the combined data
print(data)

        0    1    2    3    4    5    6    7    8    9  ...  491  492  493  \
0     485  477  537  479  452  471  491  476  475  473  ...  481  477  485   
1     483  458  460  487  587  475  526  479  485  469  ...  478  487  338   
2     487  542  499  468  448  471  442  478  480  477  ...  481  492  650   
3     480  491  510  485  495  472  417  474  502  476  ...  480  474  572   
4     484  502  528  489  466  481  402  478  487  468  ...  479  452  435   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2595  493  458  503  478  517  479  472  478  444  477  ...  475  485  443   
2596  481  484  481  490  449  481  467  478  469  483  ...  485  508  599   
2597  485  485  530  480  444  487  462  475  509  494  ...  474  502  368   
2598  477  469  528  485  483  469  482  477  494  476  ...  476  453  638   
2599  482  453  515  481  500  493  503  477  501  475  ...  478  487  694   

      494  495  496  497  498  499  target  
0     511  485  48

In [3]:
np.random.seed(42)
data = pd.DataFrame(np.random.rand(2600, 20), columns=[f'feature_{i}' for i in range(20)])
data['target'] = np.random.choice([0, 1], size=(2600,), p=[0.5, 0.5])  # Balanced classes

# Separate the data by class
class_0 = data[data['target'] == 0]
class_1 = data[data['target'] == 1]

# Create the subdataset with 800 samples (400 samples per class)
subdataset_8000 = pd.concat([class_0.sample(400, random_state=42), 
                            class_1.sample(400, random_state=42)])

# Create the subdataset with 1600 samples (800 samples per class)
subdataset_14000 = pd.concat([class_0.sample(800, random_state=42), 
                             class_1.sample(800, random_state=42)])

# The full dataset is the entire Madelon dataset (2600 samples)
subdataset_full = data

# Display the class distributions for each subdataset
print("Subdataset 800 samples class distribution:\n", subdataset_800['target'].value_counts())
print("Subdataset 1600 samples class distribution:\n", subdataset_1600['target'].value_counts())
print("Full dataset class distribution:\n", subdataset_full['target'].value_counts())

Subdataset 800 samples class distribution:
 target
0    400
1    400
Name: count, dtype: int64
Subdataset 1600 samples class distribution:
 target
0    800
1    800
Name: count, dtype: int64
Full dataset class distribution:
 target
1    1310
0    1290
Name: count, dtype: int64


In [4]:
# "First Create an empty database, within a project"

uri = "bolt://localhost:7687"
username = "neo4j" # username for Neo4j, most probably is Neo4j, if you did not change while installing Neo4j Desktop
password = "eigen1234" # Password for database.
database_name = "d3.madelon"  # Database name

driver = GraphDatabase.driver(uri, auth=(username, password))

In [5]:
def check_connection():
    try:
        # Establish a session with the specified database
        with driver.session(database=database_name) as session:
            # Run a simple query to check the connection
            result = session.run("RETURN 'Connection to database successful' AS message")
            for record in result:
                print(record["message"])
    except Exception as e:
        print("Error connecting to the database:", e)

# Call the check_connection function
check_connection()

Connection to database successful


In [6]:
# Function to create nodes in the specified database with a dynamic label
def create_nodes(data, driver, label):
    try:
        # Establish a session with the specified database
        with driver.session(database=database_name) as session:
            for _, row in data.iterrows():
                # Create a dynamic dictionary for properties
                properties = row.to_dict()
                # Prepare the Cypher query with a dynamic label
                query = f"CREATE (n:{label} {{"
                query += ', '.join([f"{key}: ${key}" for key in properties.keys()])
                query += "})"
                
                # Run the query with properties
                session.run(query, **properties)

    except Exception as e:
        print("Error during node creation:", e)
    finally:
        # Close the driver
        driver.close()

In [7]:
# For subdataset_800
create_nodes(subdataset_800, driver, 'Dataset3_800_madelon')

# For subdataset_1600
create_nodes(subdataset_1600, driver, 'Dataset3_1600_madelon')

# For the full dataset (2600 samples)
create_nodes(subdataset_full, driver, 'Dataset3_full_madelon')


  with driver.session(database=database_name) as session:
