In [2]:
# !pip install PyQt5
# ! git clone --recursive https://github.com/dmlc/xgboost
# !pip install pandastable
# !pip install -U numpy scipy py_entitymatching
# !pip install javaobj-py3

Collecting javaobj-py3
  Downloading javaobj_py3-0.4.3-py2.py3-none-any.whl (57 kB)
     ---------------------------------------- 0.0/57.3 kB ? eta -:--:--
     --------------------------- ---------- 41.0/57.3 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 57.3/57.3 kB 747.3 kB/s eta 0:00:00
Installing collected packages: javaobj-py3
Successfully installed javaobj-py3-0.4.3


In [34]:
# ------------- PARAMETERS -------------
# Folder layout used by this notebook. Every path keeps its trailing slash
# because other cells build file paths by plain string concatenation.

FOLDER_DATA = 'data/'
FOLDER_DATA_cleaner = f'{FOLDER_DATA}cleanCleanErDatasets/'  # input folder converted below
FOLDER_DATA_dirty = f'{FOLDER_DATA}dirtyErDatasets/'
FOLDER_DATA_csv = f'{FOLDER_DATA}csv/'  # destination of the generated CSV files

In [154]:
import javaobj
import csv
import pandas as pd
class JSOFileReader:
    """Load a Java-serialized-object (JSO) file and expose it as a DataFrame / CSV.

    Typical use::

        reader = JSOFileReader(path)
        df = reader.to_pandas_df()   # parse the file, one row per object
        reader.to_csv()              # write <output folder>/<file name>.csv

    Attributes:
        filename: Path of the JSO file given to the constructor.
        df: Result of the last ``to_pandas_df()`` call; an empty DataFrame
            until that method has been called.
    """

    def __init__(self, filename):
        self.filename = filename
        self.df = pd.DataFrame()

    @staticmethod
    def _is_scalar(value):
        """Return True for values that are stored directly as one column."""
        # `javaobj` is resolved at call time from the notebook's import cell.
        return isinstance(value, (int, javaobj.v2.beans.JavaString))

    @staticmethod
    def _set_to_record(attr):
        """Collapse a set of name/value objects into a ``{name: value}`` dict.

        Each element is expected to expose ``classdesc.fields``; only the
        fields called ``name`` and ``value`` are used. The lookup no longer
        depends on the order in which the two fields are declared. An element
        without a ``name`` field is stored under the key ``''``.
        """
        record = {}
        for ins in attr:
            declared = {field_desc.name for field_desc in ins.classdesc.fields}
            if 'value' not in declared:
                continue
            key = getattr(ins, 'name') if 'name' in declared else ''
            record[key] = getattr(ins, 'value')
        return record

    def __read_data_set__(self, attr):
        """Convert a set of name/value objects into a one-row DataFrame.

        Args:
            attr: A ``set`` whose elements expose ``classdesc.fields`` plus
                ``name`` / ``value`` attributes.

        Returns:
            A DataFrame with a single row (index 0) and one column per name,
            or ``None`` (after printing a message) when ``attr`` is not a set
            or a ``TypeError`` occurs while building the frame.
        """
        try:
            if not isinstance(attr, set):
                raise TypeError("Not a dictionary data")
            return pd.DataFrame(self._set_to_record(attr), index=[0])
        except TypeError as msg:
            print(msg)

    def __read_data_scala__(self, attr_value, attr_name):
        """Convert a single scalar field into a one-row, one-column DataFrame.

        Args:
            attr_value: An ``int`` or ``javaobj.v2.beans.JavaString``.
            attr_name: Column label to use for the value.

        Returns:
            The one-cell DataFrame, or ``None`` (after printing a message)
            when ``attr_value`` has any other type.
        """
        try:
            if not self._is_scalar(attr_value):
                raise TypeError("Not a scala data")
            return pd.DataFrame({attr_name: attr_value}, index=[0])
        except TypeError as msg:
            print(msg)

    def read_file(self):
        """Deserialize ``self.filename`` with ``javaobj.v2.load`` and return the result."""
        with open(self.filename, 'rb') as f:
            data = javaobj.v2.load(f)
        return data

    def to_pandas_df(self):
        """Parse the file into a DataFrame with one row per deserialized object.

        For every object that carries a ``classdesc``, each declared field is
        read with ``getattr``: set-valued fields are expanded into one column
        per name/value pair, ``int`` / ``JavaString`` fields become a column
        named after the field, and every other field type is ignored. Objects
        that yield no columns do not produce a row.

        The rows are collected as plain dicts and turned into a DataFrame
        once, instead of concatenating a growing DataFrame inside the loop.
        If two fields of one object produce the same column label, the later
        value wins.

        Returns:
            The resulting DataFrame, which is also stored on ``self.df``.
        """
        jso_raw = self.read_file()

        records = []
        # Guard against a loader result of None so an empty stream gives an empty frame.
        for obj in (jso_raw if jso_raw is not None else ()):
            class_desc = getattr(obj, '__dict__', {}).get('classdesc')
            if class_desc is None:
                continue
            record = {}
            for field in class_desc.fields:
                attr_value = getattr(obj, field.name)
                if isinstance(attr_value, set):
                    record.update(self._set_to_record(attr_value))
                elif self._is_scalar(attr_value):
                    record[field.name] = attr_value
            if record:
                records.append(record)

        self.df = pd.DataFrame(records)
        return self.df

    def to_csv(self, output_dir=None):
        """Write ``self.df`` to ``<output_dir>/<base name of self.filename>.csv``.

        Args:
            output_dir: Destination folder. Defaults to the notebook-level
                ``FOLDER_DATA_csv`` constant. The folder is created if needed.

        Returns:
            None. When ``self.df`` is empty a message is printed and nothing
            is written (call ``to_pandas_df()`` first).
        """
        import os

        if self.df.empty:
            print('Error with file ' + self.filename
                  + ': Please call to_pandas_df() before to_csv function!')
            return

        if output_dir is None:
            output_dir = FOLDER_DATA_csv
        # Create the real destination folder (previously the literal string
        # 'FOLDER_DATA_csv' was passed here, so the target was never created).
        os.makedirs(output_dir, exist_ok=True)

        target = os.path.join(output_dir, os.path.basename(self.filename) + '.csv')
        self.df.to_csv(target, index=False)

In [None]:
class ProcessAllJSO:
    """Convert every file of a folder to CSV with JSOFileReader.

    The conversion runs immediately when the object is constructed.

    Attributes:
        folder_path: Folder whose files are converted.
    """

    def __init__(self, folder_path):
        self.folder_path = folder_path
        self.__process__()

    def __process__(self):
        """Read each regular file in ``self.folder_path`` and write it as CSV.

        Entries are handled in sorted order so the progress log is
        reproducible; sub-directories are skipped because they cannot be
        opened as JSO files.
        """
        import os

        print('Process folder ' + self.folder_path)
        for file_name in sorted(os.listdir(self.folder_path)):
            # os.path.join works whether or not folder_path ends with a slash.
            file_path = os.path.join(self.folder_path, file_name)
            if not os.path.isfile(file_path):
                continue
            print('Processing: ' + file_name)
            reader = JSOFileReader(file_path)
            reader.to_pandas_df()
            reader.to_csv()

# Convert the JSO files in FOLDER_DATA_cleaner to CSV

In [None]:
# Constructing ProcessAllJSO runs the conversion right away (its __init__ calls __process__).
ProcessAllJSO(FOLDER_DATA_cleaner)

Process folderdata/cleanCleanErDatasets/
Processing: abtBuyIdDuplicates
Processing: abtProfiles
Processing: acmProfiles
Processing: amazonGpIdDuplicates
Processing: amazonProfiles
Processing: amazonProfiles2
Processing: amazonWalmartIdDuplicates
Processing: buyProfiles
Processing: dblpAcmIdDuplicates
Processing: dblpProfiles
Processing: dblpProfiles2
Processing: dblpScholarIdDuplicates
Processing: gpProfiles
Processing: imdbProfiles
Processing: imdbProfilesNEW
Processing: imdbTmdbIdDuplicates
Processing: imdbTvdbIdDuplicates
Processing: moviesIdDuplicates
Processing: restaurant1Profiles
Processing: restaurant2Profiles
Processing: restaurantsIdDuplicates
Processing: scholarProfiles


## Example of using JSOFileReader

In [157]:
# Read a single file from the clean-clean dataset folder and display it as a DataFrame.
reader = JSOFileReader(FOLDER_DATA_cleaner + 'abtProfiles')
df = reader.to_pandas_df()  # the same frame is also kept on reader.df
df

Unnamed: 0,name,description,entityUrl,price
0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,552,
1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,580,399
2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,4696,49
3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,5644,
4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,6284,158
...,...,...,...,...
1071,Logitech Cordless Desktop Wave Keyboard And Mo...,Logitech Cordless Desktop Wave Keyboard And Mo...,39088,79
1072,Mitsubishi DLP Black TV Stand - MBS73V,Mitsubishi DLP Black TV Stand - MBS73V/ Matchi...,39090,549
1073,Logitech Digital Precision PC Gaming Headset -...,Logitech Digital Precision PC Gaming Headset -...,39175,49
1074,Logitech 2.1 Multimedia Silver Speaker System ...,Logitech 2.1 Multimedia Silver Speaker System ...,39176,
