Script de generación de datos documentales TFM Daniel Herranz Segundo

In [130]:
import pandas as pd
import numpy as np
import os
import json
import random
import uuid

In [131]:
#Variables de los ficheros de datos salida
CurrentAccountCollection_file_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection.json'
PositionKeepingCollection_file_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection.json'
CustomerProfileCollection_file_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection.json'

CurrentAccountCollection_sample_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection_sample.json'
PositionKeepingCollection_sample_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection_sample.json'
CustomerProfileCollection_sample_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json'

In [132]:
#Función para resetear ficheros de salida
def reset_files(file):
    if os.path.exists(file):
        os.remove(file)
        print("The file", file ,"have been removed")
    else:
        print("The file", file ,"does not exist")

In [133]:
#Limpiamos los ficheros de salida
#reset_files(CurrentAccountCollection_file_out)
#reset_files(PositionKeepingCollection_file_out)
#reset_files(CustomerProfileCollection_file_out)
reset_files(CurrentAccountCollection_sample_out)
reset_files(PositionKeepingCollection_sample_out)
reset_files(CustomerProfileCollection_sample_out)

The file ../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection_sample.json have been removed
The file ../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection_sample.json have been removed
The file ../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json have been removed


# Generación coleccion CustomerProfile para MongoDB

In [134]:
#Carga de la información de dataframes por entidades

Address_df = pd.read_csv('../MockData/Address_1M.csv')
CustomerProfile_df = pd.read_csv('../MockData/CustomerProfile_1M.csv')
Country_df = pd.read_csv('../MockData/base/Country.csv')

In [135]:
print(Address_df.columns)
print(CustomerProfile_df.columns)
print(Country_df.columns)
print(len(Address_df))
print(len(CustomerProfile_df))
print(len(Country_df))

Index(['PartyId', 'AddressType', 'AddressLine', 'StreetName', 'BuildingNumber',
       'PostCode', 'TownName', 'CountrySubDivision', 'CountryId'],
      dtype='object')
Index(['PartyId', 'PartyNumber', 'PartyType', 'Name', 'FullLegalName',
       'LegalStructure', 'BeneficialOwnership', 'AccountRole', 'EmailAddress',
       'Phone'],
      dtype='object')
Index(['CountryId', 'ShortName', 'Description', 'Code', 'Unnamed: 4'], dtype='object')
1090853
1297441
3


In [136]:
#Indexado PartyId para mejor legibilidad
del(Country_df['CountryId'])
del(Country_df['Unnamed: 4'])

del(Address_df['PartyId'])
del(Address_df['CountryId'])

In [137]:
CustomerProfile_df = CustomerProfile_df.sample(10000)
Address_df = Address_df.sample(10000)
Country_df.sample(1)

Unnamed: 0,ShortName,Description,Code
0,Spain,Omnis dolore id dicta consequatur sit omnis. E...,IND


In [138]:
#Agregamos una columna de pais a las direcciones
CountryColumn = []

for i in range (0,len(Address_df)): 
    #CountryArrayAsJSON = json.loads(Country_df.sample(1).to_json(orient ='records'))
    country = {
        'ShortName': Country_df.sample(1)['ShortName'].item(),
        'Description': Country_df.sample(1)['Description'].item(),
        'Code': Country_df.sample(1)['Code'].item()
    }
    #print(country)
    CountryColumn.append(country)    

Address_df['Country'] = CountryColumn

In [139]:
Address_df.sample(1)

Unnamed: 0,AddressType,AddressLine,StreetName,BuildingNumber,PostCode,TownName,CountrySubDivision,Country
307316,East,44255 Alexandrea Run Suite 654,Waylon Turnpike,43430,97785-6489,West Malikaburgh,3.0,"{'ShortName': 'Greatbrit', 'Description': 'Aut..."


In [140]:
#Agreación de direcciones (Address) a la colección Customer Profile
#Tratamiento de las direcciones. Creamos de forma aleatoria entre 1 y 10 como agregado del CustomerProfile
AddressColumn = []

for i in range (0,len(CustomerProfile_df)): 
    AddressArrayAsJSON = json.loads(Address_df.sample(random.choice(range(1,3))).to_json(orient ='records'))
    address = {
        'AddressType': Address_df.sample(1)['AddressType'].item(),
        'AddressLine': Address_df.sample(1)['AddressLine'].item(),
        'StreetName': Address_df.sample(1)['StreetName'].item(),
        'BuildingNumber': Address_df.sample(1)['BuildingNumber'].item(),
        'PostCode': Address_df.sample(1)['PostCode'].item(),
        'TownName': Address_df.sample(1)['TownName'].item(),
        'CountrySubDivision': Address_df.sample(1)['CountrySubDivision'].item()        
    }
    AddressColumn.append(address)    
    
CustomerProfile_df['Address'] = AddressColumn

In [142]:
#Generamos 5M de registros basados en datos previos con PartyId unico
for x in range(14): # Generamos registros
    CustomerProfile_df = CustomerProfile_df.append(CustomerProfile_df.sample(frac=0.65,replace = True))

len(CustomerProfile_df)

11086020

In [143]:
#Volcamos 5M de registros para su uso posterior
CustomerProfile_df = CustomerProfile_df.sample(1000000)
len(CustomerProfile_df)

1000000

In [144]:
#Modificamos la clave unida PartyId con uuid
CustomerProfile_df['PartyId'] = CustomerProfile_df['PartyId'].map(lambda x: str(uuid.uuid1()))

In [145]:
CustomerProfile_df.sample(1)

Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone,Address
738832,f5b87e22-bb34-11ec-bad7-8faeae2e42b2,4284,sol,Claire,Keagan Thiel,Ms.,7,US.OBIE.Principal,scarlett61@example.org,210.264.3092,"{'AddressType': 'Lake', 'AddressLine': '5074 E..."


In [146]:
#Volcado fichero JSON
#CustomerProfile_df.to_json(CustomerProfileCollection_file_out,orient ='records') #Final file
#print("Guardado fichero en", CustomerProfileCollection_file_out)

#Volcado fichero JSON reducido para pruebas (Sample)
CustomerProfile_df.to_json(CustomerProfileCollection_sample_out,orient ='records') #Sample for testing
print("Guardado fichero en", CustomerProfileCollection_sample_out)

Guardado fichero en ../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json


# Generación colección CurrentAccount para MongoDB

In [147]:
#Carga de datasets necesarias para la colección
AccountInfo_df = pd.read_csv('../MockData/AccountInfo_1M.csv')
CurrentAccount_df = pd.read_csv('../MockData/CurrentAccount_1M.csv')

In [148]:
AccountInfo_df.columns

Index(['AccountInfoId', 'AccountId', 'SchemeName', 'Identification', 'Name'], dtype='object')

In [149]:
#Limpieza de columnas residuales y no necesarias en el documento
del(AccountInfo_df['AccountInfoId'])
del(AccountInfo_df['AccountId'])

In [150]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType
331626,3692,3571,Disabled,2016-08-31 02:33:32.00,Business,extend extensible relationships,2008-04-29 20:17:26.00,EDP


In [151]:
AccountInfo_df.sample(1)

Unnamed: 0,SchemeName,Identification,Name
124985,UK.business,1.0,Considine-Renner


In [152]:
#Creamos un sample. Testing (comentar en generación real)
CurrentAccount_df = CurrentAccount_df.sample(10000)
CurrentAccount_df['AccountId'] = CurrentAccount_df['AccountId'].map(lambda x: str(uuid.uuid1()))
AccountInfo_df = AccountInfo_df.sample(10000)

In [153]:
AccountInfo_df.sample(1)

Unnamed: 0,SchemeName,Identification,Name
922959,,5.0,"Bernier, Crooks and Skiles"


In [154]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType
259331,ffa031be-bb34-11ec-bad7-8faeae2e42b2,7998,Disabled,1996-03-26 06:12:20.00,Particular,embrace cross-media e-markets,1973-08-10 23:58:46.00,SAI


In [165]:
#identify duplicate rows across 'team' and 'points' columns
duplicateRows = CurrentAccount_df[CurrentAccount_df.duplicated(['AccountId'])]

#view duplicate rows
print(duplicateRows)

Empty DataFrame
Columns: [AccountId, PartyId, Status, StatusUpdateDateTime, AccountType, NickName, OpeningDate, AccountSubType]
Index: []


In [166]:
#Generamos la colección agregando AccountInfo a las cuentas

AccountInfoColumn = []
CurrentAccountCollection = pd.DataFrame()

len(CurrentAccount_df)

for i in range (0,len(CurrentAccount_df)):
    account_info_ls = []
    for j in range(1,3):
        accountInfo = {
            'SchemeName' : AccountInfo_df.sample(1)['SchemeName'].item(),                    
            'Identification': AccountInfo_df.sample(1)['Identification'].item(),   
            'Name': AccountInfo_df.sample(1)['Name'].item()
        }
        account_info_ls.append(accountInfo)
    AccountInfoColumn.append(account_info_ls)    
    
CurrentAccount_df['AccountInfo'] = AccountInfoColumn


In [167]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
797320,ff9c3f1e-bb34-11ec-bad7-8faeae2e42b2,5757,Enabled,1992-02-26 06:09:16.00,Business,visualize back-end content,2019-02-10 21:13:52.00,POI,"[{'SchemeName': 'UK.business', 'Identification..."


In [169]:
#Generamos 5M de registros basados en datos previos con PartyId unico
for x in range(14): # Generamos registros
    CurrentAccount_df = CurrentAccount_df.append(CurrentAccount_df.sample(frac=0.65,replace = True))

len(CurrentAccount_df)

11086020

In [171]:
#Volcamos 5M de registros para su uso posterior
CurrentAccount_df = CurrentAccount_df.sample(1000000)
CurrentAccount_df['AccountId'] = CurrentAccount_df['AccountId'].map(lambda x: str(uuid.uuid1()))
len(CurrentAccount_df)

1000000

In [172]:
print(CurrentAccount_df.index.is_unique)
print(CurrentAccount_df.index.duplicated())
CurrentAccount_df.drop_duplicates(subset=['AccountId'], keep='last')
CurrentAccount_df.reset_index()
print(CurrentAccount_df.index.is_unique)
print(CurrentAccount_df.index.duplicated())
len(CurrentAccount_df)

False
[False False False ...  True  True  True]
False
[False False False ...  True  True  True]


1000000

In [173]:
CustomerProfile_df
print(CustomerProfile_df.index.is_unique)
print(CustomerProfile_df.index.duplicated())
CustomerProfile_df.drop_duplicates(subset=['PartyId'], keep='last')
CustomerProfile_df.reset_index()
print(CustomerProfile_df.index.is_unique)
print(CustomerProfile_df.index.duplicated())
len(CustomerProfile_df)

False
[False False False ...  True  True  True]
False
[False False False ...  True  True  True]


1000000

In [174]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
1131818,a17108b0-bb35-11ec-bad7-8faeae2e42b2,271,Enabled,1984-12-31 21:19:13.00,Business,architect real-time metrics,1977-07-16 01:36:15.00,POI,"[{'SchemeName': nan, 'Identification': 4.0, 'N..."


In [175]:
CurrentAccount_df['PartyId'] = CustomerProfile_df['PartyId'].values
CurrentAccount_df.sample(1)


Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
454540,a1eeb5b2-bb35-11ec-bad7-8faeae2e42b2,f6acf02e-bb34-11ec-bad7-8faeae2e42b2,Disabled,1976-08-15 18:51:53.00,Business,streamline back-end e-business,2017-01-08 12:58:49.00,POI,"[{'SchemeName': nan, 'Identification': nan, 'N..."


In [176]:
#Volcado fichero JSON
CurrentAccount_df.to_json(CurrentAccountCollection_sample_out,orient ='records') #Sample for testing

#CurrentAccount_df.to_json(CurrentAccountCollection_file_out,orient ='records')

# Generación colección PositionKeeping para MongoDB

In [177]:
#Carga de datasets necesarias para la colección
Amount_df = pd.read_csv('../MockData/Amount_1M.csv')
CreditLine_df = pd.read_csv('../MockData/CreditLine_1M.csv')
Currency_df = pd.read_csv('../MockData/base/Currency.csv')
PositionKeeping_df = pd.read_csv('../MockData/Position_Keeping_1M.csv')

In [178]:
#Verificación de columnas
print(Amount_df.columns)
print(Country_df.columns)
print(CreditLine_df.columns)
print(Currency_df.columns)
print(PositionKeeping_df.columns)

Index(['AmountId', 'CurrencyId', 'Amount'], dtype='object')
Index(['ShortName', 'Description', 'Code'], dtype='object')
Index(['CreditLineId', 'CurrencyId', 'Amount', 'Type', 'Included'], dtype='object')
Index(['CurrencyId', 'Code', 'Description', 'Unnamed: 3'], dtype='object')
Index(['AccountId', 'DateTime', 'CreditDebitIndicator', 'Type', 'AmountId',
       'CreditLineId'],
      dtype='object')


In [179]:
#Limpieza de columnas residuales y no necesarias en el documento
del(Amount_df['CurrencyId'])
del(Amount_df['AmountId'])

del(CreditLine_df['CreditLineId'])
del(CreditLine_df['CurrencyId'])

del(Currency_df['CurrencyId'])

del(PositionKeeping_df['CreditLineId'])
del(PositionKeeping_df['AmountId'])
del(PositionKeeping_df['AccountId'])

In [180]:
#Creamos samples. Testing (comentar en generación real)
Amount_df = Amount_df.sample(10000)
CreditLine_df = CreditLine_df.sample(10000)
PositionKeeping_df = PositionKeeping_df.sample(10000)

In [181]:
PositionKeeping_df.sample(3)

Unnamed: 0,DateTime,CreditDebitIndicator,Type
385500,1985-12-03 18:53:30.00,Debit,Business
983262,1982-04-01 05:42:11.00,Debit,Business
405199,1973-05-16 21:38:09.00,Debit,Particular


In [182]:
CreditLine_df.sample(1)['Amount']

93202    2419582.734
Name: Amount, dtype: float64

In [183]:
Currency_df.sample(1)

Unnamed: 0,Code,Description,Unnamed: 3
2,EUR,Dolores sint vero vero maiores nisi. Et quas i...,


In [184]:
#Generamos la colección PositionKeeping

AmountColumn = []
CreditLineColumn = []

for i in range (0,len(PositionKeeping_df)):    
    valueAmount = { 'Currency': {
                        'Code': Currency_df.sample(1)['Code'].item(),
                        'Description': Currency_df.sample(1)['Description'].item(),
                },
               'Amount' : round(random.uniform(-3000, 99999999),2)
              }
    AmountColumn.append(valueAmount)  

    valueCreditLine = {
        'Included' : CreditLine_df.sample(1)['Included'].item(),                    
        'Type': CreditLine_df.sample(1)['Type'].item(),
        'Amount': { 'Currency': {
                        "Code": Currency_df.sample(1)['Code'].item(),
                        "Description": Currency_df.sample(1)['Description'].item(),
                },
               'Amount' : round(random.uniform(-3000, 99999999),2)
              }
    }
    CreditLineColumn.append(valueCreditLine)


PositionKeeping_df['Amount'] = AmountColumn
PositionKeeping_df['CreditLine'] = CreditLineColumn

In [185]:
len(PositionKeeping_df)

10000

In [186]:
PositionKeeping_df.sample(1)

Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine
872810,2019-02-17 05:41:22.00,Debit,Particular,"{'Currency': {'Code': 'USA', 'Description': 'U...","{'Included': 0, 'Type': nan, 'Amount': {'Curre..."


In [187]:
#Generamos 5M de registros basados en datos previos con PartyId unico
for x in range(14): # Generamos registros
    PositionKeeping_df = PositionKeeping_df.append(PositionKeeping_df.sample(frac=0.65,replace = True))

#Volcamos 5M de registros para su uso posterior
PositionKeeping_df = PositionKeeping_df.sample(1000000)
len(PositionKeeping_df)

1000000

In [188]:
PositionKeeping_df.sample(1)

Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine
256387,1978-03-01 11:51:46.00,Credit,Business,"{'Currency': {'Code': 'EUR', 'Description': 'U...","{'Included': 0, 'Type': 'Business', 'Amount': ..."


In [189]:
PositionKeeping_df['AccountId'] = CurrentAccount_df['AccountId'].values
PositionKeeping_df.sample(1)

Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine,AccountId
181648,2007-10-25 09:51:20.00,Debit,Business,"{'Currency': {'Code': 'EUR', 'Description': 'D...","{'Included': 0, 'Type': nan, 'Amount': {'Curre...",a1f8ccbe-bb35-11ec-bad7-8faeae2e42b2


In [190]:
#Volcado fichero JSON
PositionKeeping_df.to_json(PositionKeepingCollection_sample_out,orient ='records') #Final file
#PositionKeeping_df.to_json(PositionKeepingCollection_file_out,orient ='records') #Final file
