In [1]:
import csv
import math
import os
import re
import warnings
from collections import Counter

import mysql.connector
import numpy as np
import pandas as pd
from IPython.display import display
from mysql.connector import Error
from neo4j import GraphDatabase
from pandas.errors import ParserError
warnings.filterwarnings('ignore') # suppress every warning for the rest of this kernel session

In [2]:
# Connection settings for the MySQL source and the Neo4j target.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into the notebook; when a variable is
# not set, the value this notebook was originally run with is used.
host = os.environ.get('PUBS_MYSQL_HOST', 'relational.fel.cvut.cz')
schema = os.environ.get('PUBS_MYSQL_SCHEMA', 'pubs')
user = os.environ.get('PUBS_MYSQL_USER', 'guest')
password = os.environ.get('PUBS_MYSQL_PASSWORD', 'ctu-relational')
neo_uri = os.environ.get('PUBS_NEO4J_URI', 'bolt://localhost:7687')
neo_user = os.environ.get('PUBS_NEO4J_USER', 'neo4j')
neo_pass = os.environ.get('PUBS_NEO4J_PASSWORD', 'P@SSw0rd')
neo_db = os.environ.get('PUBS_NEO4J_DATABASE', 'pubs')

In [3]:
class Neo4jDB:
    """Small convenience wrapper around a Neo4j driver and one default database.

    Driver errors are printed instead of raised: a failed constructor leaves
    the driver unset (later calls then fail their ``assert``), and a failed
    query returns ``None``.
    """

    def __init__(self, uri, user, pwd='', default_db=None):
        """Create the driver and, when ``default_db`` is given, ensure it exists.

        uri : str
            Connection URI handed to ``GraphDatabase.driver``.
        user, pwd : str
            Credentials used for authentication.
        default_db : str or None
            Database that ``query`` runs against.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        self.__default_db = default_db
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
            if self.__default_db:
                self.ensure_database(self.__default_db)
        except Exception as e:
            print("Failed to create the driver:", e)

    @staticmethod
    def _quote_name(name):
        """Return ``name`` as a backtick-quoted Cypher identifier.

        Embedded backticks are doubled so the name cannot terminate the quoted
        identifier and inject further Cypher.
        """
        return "`{}`".format(str(name).replace("`", "``"))

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None):
        """Executes a query on the default database.

        Returns the records as a list, or ``None`` when the query failed
        (the error is printed, not raised).
        """
        assert self.__driver is not None, "Driver not initialized!"
        assert self.__default_db is not None, "Default database not set!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=self.__default_db)
            # Materialise the result before the session is closed.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

    def ensure_database(self, db_name):
        """Ensures the specified database exists and sets it as the default.

        The database is created when ``SHOW DATABASES`` does not list it; the
        default database is only switched after a second listing confirms the
        name. Failures are printed, not raised.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        try:
            session = self.__driver.session(database="system")
            # Check if the database exists
            existing_databases = [record["name"] for record in session.run("SHOW DATABASES")]
            if db_name not in existing_databases:
                # Quote the identifier: names such as "my-db" are invalid
                # unquoted, and an unescaped name could inject Cypher.
                session.run(f"CREATE DATABASE {self._quote_name(db_name)}")
                print(f"Database '{db_name}' created successfully.")
            else:
                print(f"Database '{db_name}' already exists.")

            # Verify that the database exists
            existing_databases = [record["name"] for record in session.run("SHOW DATABASES")]
            if db_name in existing_databases:
                self.__default_db = db_name
                print(f"Switched to database '{db_name}' as the default.")
            else:
                raise Exception(f"Database '{db_name}' could not be verified after creation.")
        except Exception as e:
            print("Failed to ensure database existence or switch to it:", e)
        finally:
            if session is not None:
                session.close()

# Notebook-wide Neo4j connection used by the comparison cell; because a default
# database is passed, the constructor also checks/creates it and prints the status.
neoConn = Neo4jDB(uri=neo_uri, user=neo_user, pwd=neo_pass, default_db=neo_db)

Database 'pubs' already exists.
Switched to database 'pubs' as the default.


In [4]:
import mysql.connector
from mysql.connector import Error

# Open the MySQL connection that the later cells reuse, then print the server
# version and the currently selected schema as a sanity check.
try:
    connection = mysql.connector.connect(host=host, database=schema, user=user, password=password)
    
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()
        cursor.execute("select database();")
        # Single-row result: a tuple holding the name of the active schema.
        record = cursor.fetchone()
        print("You're connected to database: ", record)

except Error as e:
    # NOTE(review): the error is only printed. If connect() itself raised,
    # `connection` is never bound here, so cells that use it fail later.
    print("Error while connecting to MySQL", e)

Connected to MySQL Server version  5.5.5-10.6.12-MariaDB-1:10.6.12+maria~ubu2004-log
You're connected to database:  ('pubs',)


In [5]:
import math
import mmh3
from bitarray import bitarray

class BloomFilter(object):

    '''
    Bloom filter backed by a bitarray, using the murmur3 (mmh3) hash function.

    The bit-array size and the number of hash functions are derived from the
    expected item count and the target false-positive probability. Both are
    clamped to at least 1 so that an empty or tiny filter stays usable
    (no modulo-by-zero, and ``check`` never succeeds vacuously).
    '''

    def __init__(self, items_count, fp_prob):
        '''
        items_count : int
            Number of items expected to be stored in bloom filter
        fp_prob : float
            False Positive probability in decimal
        '''
        # False positive probability in decimal
        self.fp_prob = fp_prob

        # Size of bit array to use
        self.size = self.get_size(items_count, fp_prob)

        # number of hash functions to use
        self.hash_count = self.get_hash_count(self.size, items_count)

        # Bit array of given size, with all bits initialised to 0
        self.bit_array = bitarray(self.size)
        self.bit_array.setall(0)

    def _positions(self, item):
        '''
        Yield the bit positions for an item, one per hash function.
        The loop index is used as the seed of mmh3.hash(), so every
        iteration produces a different digest.
        '''
        for seed in range(self.hash_count):
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        '''
        Add an item in the filter
        '''
        for position in self._positions(item):
            self.bit_array[position] = True

    def check(self, item):
        '''
        Check for existence of an item in filter.
        Returns False when the item is definitely absent (some bit is unset)
        and True when it is probably present.
        '''
        return all(self.bit_array[position] for position in self._positions(item))

    @classmethod
    def get_size(cls, n, p):
        '''
        Return the size of bit array(m) to used using
        following formula
        m = -(n * lg(p)) / (lg(2)^2)
        The result is truncated to an int and is never smaller than 1.
        n : int
            number of items expected to be stored in filter
        p : float
            False Positive probability in decimal
        '''
        m = -(n * math.log(p)) / (math.log(2) ** 2)
        return max(1, int(m))

    @classmethod
    def get_hash_count(cls, m, n):
        '''
        Return the hash function(k) to be used using
        following formula
        k = (m/n) * lg(2)
        The result is truncated to an int and is never smaller than 1;
        for n <= 0 (nothing expected) a single hash function is used.

        m : int
            size of bit array
        n : int
            number of items expected to be stored in filter
        '''
        if n <= 0:
            # Avoid dividing by zero for an empty data set.
            return 1
        k = (m / n) * math.log(2)
        return max(1, int(k))

In [6]:
def check_bf(data_mysql, data_neo4j, bloomf):
    """Check that every Neo4j row is also present among the MySQL rows.

    All items of ``data_mysql`` are added to ``bloomf``; each item of
    ``data_neo4j`` is then tested against the filter. A filter hit is
    confirmed against the exact set of MySQL items so that false positives
    are reported as mismatches. Every mismatch is printed.

    Only this direction is checked: MySQL rows that are missing from
    ``data_neo4j`` are not detected here.

    data_mysql : iterable of str
        Reference rows (inserted into the filter).
    data_neo4j : iterable of str
        Rows to look up.
    bloomf : object with ``add(item)`` and ``check(item)``
        The Bloom filter to populate and query.

    Returns True when no mismatch was found, otherwise False.
    """
    # Build the exact-membership set in the same pass that fills the filter,
    # so confirming a hit is a constant-time lookup instead of a list scan.
    known = set()
    for item in data_mysql:
        bloomf.add(item)
        known.add(item)

    match = True
    for d in data_neo4j:
        if not bloomf.check(d):
            print("'{}' is found in Neo4j but not in MySQL!".format(d))
            match = False
        elif d not in known:
            print("'{}' is false positive (it does not actually in MySQL)!".format(d))
            match = False

    return match

In [7]:
def _solve_fields(table, field_names):
    s = ''
    first = True
    for f in field_names:
        if first:
            first = False
        else:
            s += ','            
        s +='{}.{}'.format(table,f)
    return s

In [8]:
def _read_csv(table, s, e):
    """Load the CSV export of one table into a DataFrame.

    The file is read from ``db.localhost/<schema>/<table>.csv``, where
    ``schema`` is the notebook-level setting.

    table : str
        Table name (also the CSV file name without extension).
    s : str
        Field separator passed to ``pd.read_csv``.
    e : str
        File encoding passed to ``pd.read_csv``.
    """
    csv_path = 'db.localhost/{}/{}.csv'.format(schema, table)
    frame = pd.read_csv(csv_path, sep=s, encoding=e)
    return frame

In [9]:
from pandas import DataFrame
import pandas as pd
import csv
import time

def _row_signature(values):
    """Collapse one row into the string that is fed to the Bloom filter.

    Every value is converted with ``str`` and the pieces are concatenated
    without a separator; the substrings 'None', 'nan' and '.0' are then
    removed so that missing values and integer-valued floats are rendered the
    same way on both sides.

    NOTE(review): the removal is purely textual, so it also strips those
    substrings from inside real values (e.g. '10.05' becomes '105'). It is
    applied identically to both data sources.
    """
    text = ''.join(str(value) for value in values)
    for token in ('None', 'nan', '.0'):
        text = text.replace(token, '')
    return text


start_time = time.time()

q1 = ("SHOW TABLES FROM " + schema)
c1 = connection.cursor(dictionary=True, buffered=True)
c1.execute(q1)

table_list = c1.fetchall()
# NOTE(review): c2 is not used anywhere in the visible cells; kept in case a
# later cell relies on it.
c2 = connection.cursor()
p = 0.01  # target false-positive probability of every Bloom filter
summary_rows = []
for entry in table_list:
    st = time.time()
    # Each entry is a one-item dict; its value is the table name.
    _, table = entry.popitem()
    # The reference rows come from the table's CSV export, not from a live
    # SELECT on the MySQL connection.
    records = _read_csv(table, ';', 'utf-8')
    field_names = list(records.columns)
    fields = _solve_fields(table, field_names)

    data_mysql = [_row_signature(row) for _, row in records.iterrows()]

    q3 = "MATCH ({}:{}) RETURN {}".format(table, table.capitalize(), fields)
    result2 = neoConn.query(q3)
    if result2 is None:
        # query() returns None after printing the driver error; stop with a
        # clear message instead of failing on iteration over None.
        raise RuntimeError("Neo4j query failed for table '{}': {}".format(table, q3))
    data_neo4j = [_row_signature(node_row) for node_row in result2]

    n = len(data_mysql)
    # Size the filter for at least one item so an empty table cannot cause a
    # division by zero while the filter parameters are computed.
    bloomf = BloomFilter(max(n, 1), p)
    result3 = check_bf(data_mysql, data_neo4j, bloomf)
    if result3:
        print("====> All nodes in Neo4j and records in MySQL are matched!")
    else:
        print("====> Nodes in Neo4j and records in MySQL DO NOT matched!")

    et = time.time() - st
    mn = "{:.5f}".format(et / 60)

    # Collect plain dicts and build the frame once after the loop;
    # DataFrame.append no longer exists in pandas 2.x.
    summary_rows.append({'Table':table.capitalize(), 'Records':n, 'p':bloomf.fp_prob, 'm':bloomf.size, 'k':bloomf.hash_count, 'Time(mins)':mn})

df = pd.DataFrame(summary_rows, columns=['Table', 'Records', 'p', 'm', 'k', 'Time(mins)'])

elapsed_time = time.time() - start_time
mins = "{:.5f}".format(elapsed_time / 60)
print("Elapsed Time: {} mins".format(mins))
df

====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
====> All nodes in Neo4j and records in MySQL are matched!
Elapsed Time: 0.02793 mins


Unnamed: 0,Table,Records,p,m,k,Time(mins)
0,Authors,23,0.01,220,6,0.00167
1,Discounts,3,0.01,28,6,0.00117
2,Employee,43,0.01,412,6,0.00184
3,Jobs,14,0.01,134,6,0.00208
4,Pub_info,8,0.01,76,6,0.00144
5,Publishers,8,0.01,76,6,0.00141
6,Roysched,86,0.01,824,6,0.00138
7,Sales,21,0.01,201,6,0.00137
8,Stores,6,0.01,57,6,0.00127
9,Titleauthor,25,0.01,239,6,0.0012
