In [1]:
# !pip install PyQt5
# ! git clone --recursive https://github.com/dmlc/xgboost
# !pip install pandastable
# !pip install -U numpy scipy py_entitymatching
# !pip install py_stringmatching
# ! pip install  -U "numpy<1.24.0"
# !pip install javaobj-py3

In [2]:
import importlib.metadata

# Show which numpy release is installed in this kernel's environment
# (the commented-out install cell above mentions "numpy<1.24.0").
numpy_version = importlib.metadata.version('numpy')
print(numpy_version)

1.23.5


In [3]:
# ------------- PARAMETER -------------

# Root data folder; every path below is derived from it and keeps its
# trailing '/' because later code builds file paths by concatenation.
FOLDER_DATA = 'data/'
# Input folders holding the serialized datasets.
FOLDER_DATA_cleaner = f'{FOLDER_DATA}cleanCleanErDatasets/'
FOLDER_DATA_dirty = f'{FOLDER_DATA}dirtyErDatasets/'
# Output folder for the generated CSV files.
FOLDER_DATA_csv = f'{FOLDER_DATA}csv/'

In [7]:
import javaobj.v2
import csv
import pandas as pd
class JSOFileReader:
    """Load a Java-serialized object file and expose it as a pandas DataFrame / CSV.

    Typical use::

        reader = JSOFileReader(path)
        frame = reader.to_pandas_df()   # also stored on ``reader.df``
        reader.to_csv()                 # writes ``<FOLDER_DATA_csv>/<file name>.csv``

    Attributes:
        filename: Path of the serialized file, as given to the constructor.
        df: Result of the last ``to_pandas_df()`` call (empty until then).
    """

    def __init__(self, filename):
        self.filename = filename
        self.df = pd.DataFrame()

    @staticmethod
    def _is_scalar(value):
        """Return True for the scalar kinds that become a column: int or JavaString."""
        # Two separate checks (not a tuple) so plain ints never touch javaobj.
        return isinstance(value, int) or isinstance(value, javaobj.v2.beans.JavaString)

    @staticmethod
    def _set_to_record(attr):
        """Flatten a set of name/value objects into a ``{name: value}`` dict.

        Each element must expose ``classdesc.fields``; elements without a
        ``value`` field are ignored. The key is the element's ``name`` field
        when it declares one (regardless of field order), otherwise ``''``.
        """
        record = {}
        for item in attr:
            declared = [field.name for field in item.classdesc.fields]
            if 'value' not in declared:
                continue
            key = getattr(item, 'name') if 'name' in declared else ''
            record[key] = getattr(item, 'value')
        return record

    def __read_data_set__(self, attr):
        """Convert a set of name/value objects to a one-row DataFrame.

        Args:
            attr: A ``set`` whose elements expose ``classdesc.fields`` plus
                ``name`` / ``value`` attributes.

        Returns:
            A one-row DataFrame (index ``[0]``) with one column per name, or
            ``None`` (after printing a message) when ``attr`` is not a set.
        """
        if not isinstance(attr, set):
            print("Not a dictionary data")
            return None
        return pd.DataFrame(self._set_to_record(attr), index=[0])

    def __read_data_scala__(self, attr_value, attr_name):
        """Convert a single scalar field to a one-row, one-column DataFrame.

        Args:
            attr_value: The field value; must be an ``int`` or a JavaString.
            attr_name: Column name to use for the value.

        Returns:
            A one-row DataFrame (index ``[0]``), or ``None`` (after printing a
            message) when ``attr_value`` is not a supported scalar.
        """
        if not self._is_scalar(attr_value):
            print("Not a scala data")
            return None
        return pd.DataFrame({attr_name: attr_value}, index=[0])

    def read_file(self):
        """Deserialize ``self.filename`` with ``javaobj.v2.load`` and return the result."""
        with open(self.filename, 'rb') as f:
            data = javaobj.v2.load(f)
        return data

    def to_pandas_df(self):
        """Build a DataFrame with one row per deserialized object.

        For every object yielded by ``read_file()`` that has a ``classdesc``,
        each declared field contributes columns: a ``set`` field is flattened
        into one column per contained name, an int / JavaString field becomes
        a column named after the field, and any other field type is skipped.
        Objects that contribute no columns produce no row.

        Returns:
            The resulting DataFrame, which is also stored on ``self.df``.
        """
        # Collect plain dicts and build the frame once; concatenating a
        # DataFrame per object is quadratic in the number of objects.
        records = []
        for obj in self.read_file():
            class_desc = getattr(obj, 'classdesc', None)
            if class_desc is None:
                continue
            record = {}
            for field in class_desc.fields:
                value = getattr(obj, field.name)
                if isinstance(value, set):
                    record.update(self._set_to_record(value))
                elif self._is_scalar(value):
                    record[field.name] = value
            if record:
                records.append(record)
        self.df = pd.DataFrame(records)
        return self.df

    def to_csv(self, output_dir=None):
        """Write ``self.df`` to ``<output_dir>/<base name of self.filename>.csv``.

        The row index is written as a leading ``id`` column. ``self.df`` itself
        is left untouched, so the method can be called repeatedly.

        Args:
            output_dir: Destination folder, created if missing. Defaults to the
                notebook-level ``FOLDER_DATA_csv``.

        Returns:
            None. When ``self.df`` is empty a message is printed and nothing
            is written.
        """
        import os

        if self.df.empty:
            error = ValueError('Please call to_pandas_df() before to_csv function!')
            print('Error with file' + self.filename + ' ' + str(error.args))
            return

        if output_dir is None:
            output_dir = FOLDER_DATA_csv
        # Create the real output folder (the variable, not a folder literally
        # named after it).
        os.makedirs(output_dir, exist_ok=True)

        base_name = os.path.basename(self.filename)
        # Add the id column on a copy so the caller's frame is not mutated.
        exported = self.df.copy()
        exported.insert(0, 'id', exported.index)
        exported.to_csv(os.path.join(output_dir, base_name + '.csv'), index=False)

In [29]:
class ProcessAllJSO:
    """Convert every file located directly inside a folder to CSV.

    The conversion runs immediately on construction: each regular file in
    ``folder_path`` is loaded with ``JSOFileReader`` and written out through
    its ``to_csv()`` method. Sub-directories are skipped.

    Args:
        folder_path: Folder containing the serialized files, with or without
            a trailing path separator.
    """

    def __init__(self, folder_path):
        self.folder_path = folder_path
        self.__process__()

    def __process__(self):
        """Run the conversion for every regular file in ``self.folder_path``."""
        import os

        # Sorted so the processing order does not depend on the OS listing order.
        files = sorted(os.listdir(self.folder_path))
        print('Process folder ' + self.folder_path)
        for file_name in files:
            # join() works whether or not folder_path ends with a separator.
            file_path = os.path.join(self.folder_path, file_name)
            if not os.path.isfile(file_path):
                # A directory cannot be opened as a serialized file.
                continue
            print('Processing: ' + file_name)
            reader = JSOFileReader(file_path)
            reader.to_pandas_df()
            reader.to_csv()

# Convert the JSO files in FOLDER_DATA_cleaner to CSV

In [30]:
# Constructing ProcessAllJSO runs the conversion right away (its __init__
# calls __process__), printing one "Processing: ..." line per file.
# As the last expression of the cell, the instance repr is echoed as well.
ProcessAllJSO(FOLDER_DATA_cleaner)

Process folder data/cleanCleanErDatasets/
Processing: abtBuyIdDuplicates
Processing: abtProfiles
Processing: acmProfiles
Processing: amazonGpIdDuplicates
Processing: amazonProfiles
Processing: amazonProfiles2
Processing: amazonWalmartIdDuplicates
Processing: buyProfiles
Processing: dblpAcmIdDuplicates
Processing: dblpProfiles
Processing: dblpProfiles2
Processing: dblpScholarIdDuplicates
Processing: gpProfiles
Processing: imdbProfiles
Processing: imdbProfilesNEW
Processing: imdbTmdbIdDuplicates
Processing: imdbTvdbIdDuplicates
Processing: moviesIdDuplicates
Processing: restaurant1Profiles
Processing: restaurant2Profiles
Processing: restaurantsIdDuplicates
Processing: scholarProfiles
Processing: tmdbProfiles
Processing: tmdbTvdbIdDuplicates
Processing: tvdbProfiles
Processing: walmartProfiles


<__main__.ProcessAllJSO at 0x25b7a0deac0>

## Example of using JSOFileReader

In [10]:
# Load a single file from the dirty-ER folder and display the resulting frame.
example_path = FOLDER_DATA_dirty + 'abtBuyIdDuplicates'
reader = JSOFileReader(example_path)
df = reader.to_pandas_df()
df

Unnamed: 0,entityId1,entityId2
0,477,1501
1,1067,1687
2,889,2124
3,135,1219
4,62,1135
...,...,...
1071,973,2110
1072,950,1974
1073,783,1499
1074,416,1239


In [11]:
# Export the frame loaded above as '<file name>.csv' under FOLDER_DATA_csv.
reader.to_csv()