Script de generación de datos documentales TFM Daniel Herranz Segundo

In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
import uuid

In [2]:
#Variables de los ficheros de datos salida
CurrentAccountCollection_file_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection.json'
PositionKeepingCollection_file_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection.json'
CustomerProfileCollection_file_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection.json'

CurrentAccountCollection_sample_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection_sample.json'
PositionKeepingCollection_sample_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection_sample.json'
CustomerProfileCollection_sample_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json'

In [3]:
#Función para resetear ficheros de salida
def reset_files(file):
    if os.path.exists(file):
        os.remove(file)
        print("The file", file ,"have been removed")
    else:
        print("The file", file ,"does not exist")

In [4]:
#Limpiamos los ficheros de salida
#reset_files(CurrentAccountCollection_file_out)
#reset_files(PositionKeepingCollection_file_out)
#reset_files(CustomerProfileCollection_file_out)
reset_files(CurrentAccountCollection_sample_out)
reset_files(PositionKeepingCollection_sample_out)
reset_files(CustomerProfileCollection_sample_out)

The file ../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection_sample.json does not exist
The file ../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection_sample.json does not exist
The file ../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json have been removed


# Generación coleccion CustomerProfile para MongoDB

In [5]:
#Carga de la información de dataframes por entidades

Address_df = pd.read_csv('../MockData/Address_1M.csv')
CustomerProfile_df = pd.read_csv('../MockData/CustomerProfile_1M.csv')
Country_df = pd.read_csv('../MockData/base/Country.csv')

In [6]:
print(Address_df.columns)
print(CustomerProfile_df.columns)
print(Country_df.columns)
print(len(Address_df))
print(len(CustomerProfile_df))
print(len(Country_df))

Index(['PartyId', 'AddressType', 'AddressLine', 'StreetName', 'BuildingNumber',
       'PostCode', 'TownName', 'CountrySubDivision', 'CountryId'],
      dtype='object')
Index(['PartyId', 'PartyNumber', 'PartyType', 'Name', 'FullLegalName',
       'LegalStructure', 'BeneficialOwnership', 'AccountRole', 'EmailAddress',
       'Phone'],
      dtype='object')
Index(['CountryId', 'ShortName', 'Description', 'Code', 'Unnamed: 4'], dtype='object')
1090853
1297441
3


In [7]:
#Indexado PartyId para mejor legibilidad
del(Country_df['CountryId'])
del(Country_df['Unnamed: 4'])

del(Address_df['PartyId'])
del(Address_df['CountryId'])

In [8]:
CustomerProfile_df = CustomerProfile_df.sample(10000)
Address_df = Address_df.sample(10000)

In [9]:
#Agregamos una columna de pais a las direcciones
CountryColumn = []

for i in range (0,len(Address_df)): 
    CountryArrayAsJSON = json.loads(Country_df.sample(1).to_json(orient ='records'))
    CountryColumn.append(CountryArrayAsJSON)    

Address_df['Country'] = CountryColumn

#Agreación de direcciones (Address) a la colección Customer Profile
#Tratamiento de las direcciones. Creamos de forma aleatoria entre 1 y 10 como agregado del CustomerProfile
AddressColumn = []

for i in range (0,len(CustomerProfile_df)): 
    AddressArrayAsJSON = json.loads(Address_df.sample(random.choice(range(1,3))).to_json(orient ='records'))
    AddressColumn.append(AddressArrayAsJSON)    
    
CustomerProfile_df['Address'] = AddressColumn
CustomerProfile_df.sample(2)

Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone,Address
1248977,7285,7285,sol,Simone,Mr. Ricardo Reichert II,Mr.,3,UK.OBIE.Secundary,pbechtelar@example.com,07482100005,"[{'AddressType': 'Port', 'AddressLine': '218 L..."
396786,6019,6019,pri,Wilfred,Katharina Kub,Prof.,5,DE.OBIE.Principal,vena.gutkowski@example.com,+58(2)526845,"[{'AddressType': 'Sout', 'AddressLine': '1346 ..."


In [10]:
#Generamos 5M de registros basados en datos previos con PartyId unico
for x in range(14): # Generamos registros
    CustomerProfile_df = CustomerProfile_df.append(CustomerProfile_df.sample(frac=0.65,replace = True))

len(CustomerProfile_df)

11086020

In [11]:
#Volcamos 5M de registros para su uso posterior
CustomerProfile_df = CustomerProfile_df.sample(2500000)
len(CustomerProfile_df)

2500000

In [12]:
#Modificamos la clave unida PartyId con uuid
CustomerProfile_df['PartyId'] = CustomerProfile_df['PartyId'].map(lambda x: str(uuid.uuid1()))

In [13]:
CustomerProfile_df.sample(1)

Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone,Address
973223,574f8832-ba4b-11ec-9dcd-892344612080,3563,sol,Jammie,Eulalia Jenkins IV,Dr.,4,US.OBIE.Principal,hector60@example.org,+43(8)039850,"[{'AddressType': 'East', 'AddressLine': '3153 ..."


In [14]:
#Volcado fichero JSON
#CustomerProfile_df.to_json(CustomerProfileCollection_file_out,orient ='records') #Final file
#print("Guardado fichero en", CustomerProfileCollection_file_out)

#Volcado fichero JSON reducido para pruebas (Sample)
CustomerProfile_df.to_json(CustomerProfileCollection_sample_out,orient ='records') #Sample for testing
print("Guardado fichero en", CustomerProfileCollection_sample_out)

Guardado fichero en ../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json


# Generación colección CurrentAccount para MongoDB

In [15]:
#Carga de datasets necesarias para la colección
AccountInfo_df = pd.read_csv('../MockData/AccountInfo_1M.csv')
CurrentAccount_df = pd.read_csv('../MockData/CurrentAccount_1M.csv')

In [16]:
AccountInfo_df.columns

Index(['AccountInfoId', 'AccountId', 'SchemeName', 'Identification', 'Name'], dtype='object')

In [17]:
#Limpieza de columnas residuales y no necesarias en el documento
del(AccountInfo_df['AccountInfoId'])
del(AccountInfo_df['AccountId'])

In [18]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType
688161,681,8278,Disabled,1981-12-27 07:39:19.00,Business,transform killer functionalities,1996-02-29 02:21:29.00,SAI


In [19]:
AccountInfo_df.sample(1)

Unnamed: 0,SchemeName,Identification,Name
1094552,UK.business,3.0,Hilll-Kiehn


In [20]:
#Creamos un sample. Testing (comentar en generación real)
CurrentAccount_df = CurrentAccount_df.sample(10000)
CurrentAccount_df['AccountId'] = CurrentAccount_df['AccountId'].map(lambda x: str(uuid.uuid1()))
AccountInfo_df = AccountInfo_df.sample(10000)

In [21]:
AccountInfo_df.sample(1)

Unnamed: 0,SchemeName,Identification,Name
412904,,4.0,Feeney-Erdman


In [22]:
#Generamos la colección agregando AccountInfo a las cuentas

AccountInfoColumn = []
CurrentAccountCollection = pd.DataFrame()

len(CurrentAccount_df)

for i in range (0,len(CurrentAccount_df)):
    account_info_ls = []
    for j in range(1,3):
        accountInfo = {
            'SchemeName' : random.choice(['ES.OBIE.SortCodeAccountNumber', 'UK.OBIE.SortCodeAccountNumber', 'UK.business']),                    
            'Identification': random.choice([0,1,2,3,4,5,6,7,8,9]),   
            'Name': AccountInfo_df.sample(1)['Name']
        }
        account_info_ls.append(accountInfo)
    AccountInfoColumn.append(account_info_ls)    
    
CurrentAccount_df['AccountInfo'] = AccountInfoColumn


In [23]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
1202903,690ff480-ba4b-11ec-9dcd-892344612080,3914,Disabled,1979-08-08 18:37:55.00,Particular,morph frictionless networks,1990-06-21 20:54:15.00,EDP,"[{'SchemeName': 'UK.business', 'Identification..."


In [24]:
#Generamos 5M de registros basados en datos previos con PartyId unico
for x in range(14): # Generamos registros
    CurrentAccount_df = CurrentAccount_df.append(CurrentAccount_df.sample(frac=0.65,replace = True))

len(CurrentAccount_df)

11086020

In [25]:
#Volcamos 5M de registros para su uso posterior
CurrentAccount_df = CurrentAccount_df.sample(2500000)
len(CurrentAccount_df)

2500000

In [26]:
print(CurrentAccount_df.index.is_unique)
print(CurrentAccount_df.index.duplicated())
CurrentAccount_df.drop_duplicates(subset=['AccountId'], keep='last')
CurrentAccount_df.reset_index()
print(CurrentAccount_df.index.is_unique)
print(CurrentAccount_df.index.duplicated())
len(CurrentAccount_df)

False
[False False False ...  True  True  True]
False
[False False False ...  True  True  True]


2500000

In [27]:
CustomerProfile_df
print(CustomerProfile_df.index.is_unique)
print(CustomerProfile_df.index.duplicated())
CustomerProfile_df.drop_duplicates(subset=['PartyId'], keep='last')
CustomerProfile_df.reset_index()
print(CustomerProfile_df.index.is_unique)
print(CustomerProfile_df.index.duplicated())
len(CustomerProfile_df)

False
[False False False ...  True  True  True]
False
[False False False ...  True  True  True]


2500000

In [28]:
#Modificamos la clave unida PartyId con uuid
CurrentAccount_df['AccountId'] = CurrentAccount_df['AccountId'].map(lambda x: str(uuid.uuid1()))

In [29]:
CurrentAccount_df.sample(1)

Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
150970,782fad20-ba4b-11ec-9dcd-892344612080,9462,Enabled,1979-03-06 21:36:05.00,Particular,optimize global interfaces,2017-05-19 21:46:16.00,EDP,[{'SchemeName': 'UK.OBIE.SortCodeAccountNumber...


In [30]:
CustomerProfile_df.reindex()

Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone,Address
296844,5259f1f0-ba4b-11ec-9dcd-892344612080,7162,sol,Sharon,Conor Metz,Miss,1,UK.OBIE.Secundary,alba26@example.com,1-094-428-63,"[{'AddressType': 'Port', 'AddressLine': '6564 ..."
694839,5259f362-ba4b-11ec-9dcd-892344612080,1671,pri,Manley,Julius Dibbert,Dr.,5,US.OBIE.Principal,amohr@example.org,382.682.3551,"[{'AddressType': 'West', 'AddressLine': '51238..."
1277301,5259f3b2-ba4b-11ec-9dcd-892344612080,3602,sol,Roel,Remington Considine,Prof.,5,DE.OBIE.Principal,trey77@example.org,1-634-355-73,"[{'AddressType': 'Lake', 'AddressLine': '76173..."
625853,5259f3da-ba4b-11ec-9dcd-892344612080,290,sol,Anna,Jed Weber,Prof.,6,DE.OBIE.Principal,alba04@example.net,(139)290-134,"[{'AddressType': 'West', 'AddressLine': '51238..."
360619,5259f402-ba4b-11ec-9dcd-892344612080,2710,pri,Sigrid,Claire Haag,Dr.,9,DE.OBIE.Principal,lucas.nicolas@example.org,885-688-8917,"[{'AddressType': 'Nort', 'AddressLine': '60642..."
...,...,...,...,...,...,...,...,...,...,...,...
1010309,575677e6-ba4b-11ec-9dcd-892344612080,7956,sol,Monica,Dr. Marina Gerhold DVM,Ms.,3,UK.OBIE.Secundary,hunter34@example.net,1-616-535-00,"[{'AddressType': 'West', 'AddressLine': '88932..."
443128,57567804-ba4b-11ec-9dcd-892344612080,4641,pri,Horacio,Sienna Schmidt,Dr.,1,US.OBIE.Principal,koss.torrey@example.org,1-412-679-65,"[{'AddressType': 'Lake', 'AddressLine': '507 H..."
457625,57567822-ba4b-11ec-9dcd-892344612080,97,pri,Krystel,Jarrell Beahan,Mr.,8,UK.OBIE.Secundary,cassin.arianna@example.com,(108)316-413,"[{'AddressType': 'New', 'AddressLine': '760 Ad..."
767840,5756784a-ba4b-11ec-9dcd-892344612080,1909,pri,Jaida,Miss Martina Mann,Dr.,3,US.OBIE.Principal,rice.camren@example.net,+63(4)198512,"[{'AddressType': 'New', 'AddressLine': '802 Ka..."


In [31]:
CurrentAccount_df.set_index('AccountId')
CurrentAccount_df.reindex()
CurrentAccount_df['PartyId'] = CustomerProfile_df['PartyId'].values
CurrentAccount_df.sample(1)


Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
601371,7a162a88-ba4b-11ec-9dcd-892344612080,55ae3cb2-ba4b-11ec-9dcd-892344612080,Enabled,1976-02-23 23:37:06.00,Business,benchmark dynamic ROI,1997-01-19 15:27:02.00,SAI,"[{'SchemeName': 'UK.business', 'Identification..."


In [32]:
#Volcado fichero JSON
CurrentAccount_df.to_json(CurrentAccountCollection_sample_out,orient ='records') #Sample for testing

#CurrentAccount_df.to_json(CurrentAccountCollection_file_out,orient ='records')

# Generación colección PositionKeeping para MongoDB

In [33]:
#Carga de datasets necesarias para la colección
Amount_df = pd.read_csv('../MockData/Amount_1M.csv')
CreditLine_df = pd.read_csv('../MockData/CreditLine_1M.csv')
Currency_df = pd.read_csv('../MockData/base/Currency.csv')
PositionKeeping_df = pd.read_csv('../MockData/Position_Keeping_1M.csv')

In [34]:
#Verificación de columnas
print(Amount_df.columns)
print(Country_df.columns)
print(CreditLine_df.columns)
print(Currency_df.columns)
print(PositionKeeping_df.columns)

Index(['AmountId', 'CurrencyId', 'Amount'], dtype='object')
Index(['ShortName', 'Description', 'Code'], dtype='object')
Index(['CreditLineId', 'CurrencyId', 'Amount', 'Type', 'Included'], dtype='object')
Index(['CurrencyId', 'Code', 'Description', 'Unnamed: 3'], dtype='object')
Index(['AccountId', 'DateTime', 'CreditDebitIndicator', 'Type', 'AmountId',
       'CreditLineId'],
      dtype='object')


In [35]:
#Limpieza de columnas residuales y no necesarias en el documento
del(Amount_df['CurrencyId'])
del(Amount_df['AmountId'])

del(CreditLine_df['CreditLineId'])
del(CreditLine_df['CurrencyId'])

del(Currency_df['CurrencyId'])

del(PositionKeeping_df['CreditLineId'])
del(PositionKeeping_df['AmountId'])
del(PositionKeeping_df['AccountId'])

In [36]:
#Creamos samples. Testing (comentar en generación real)
Amount_df = Amount_df.sample(10000)
CreditLine_df = CreditLine_df.sample(10000)
PositionKeeping_df = PositionKeeping_df.sample(10000)

In [37]:
PositionKeeping_df.sample(3)

Unnamed: 0,DateTime,CreditDebitIndicator,Type
291864,2003-10-08 07:28:12.00,Credit,Business
759399,1995-06-03 22:09:59.00,Credit,Particular
773171,1991-10-01 02:38:28.00,Debit,Business


In [38]:
CreditLine_df.sample(1)['Amount']

838580    80.99
Name: Amount, dtype: float64

In [39]:
Currency_df['Code']

0    IND
1    GBP
2    EUR
3    USA
Name: Code, dtype: object

In [40]:
round(random.uniform(-3000, 99999999),2)

42666587.27

In [41]:
#Generamos la colección PositionKeeping

AmountColumn = []
CreditLineColumn = []

for i in range (0,len(PositionKeeping_df)):    
    currency_value = Currency_df.sample(1) 
    valueAmount = { 'Currency': {
                        "Code": random.choice(Currency_df['Code']),
                        "Description": random.choice(Currency_df['Description']),
                },
               'Amount' : round(random.uniform(-3000, 99999999),2)
              }
    AmountColumn.append(valueAmount)  
    creditline_value = CreditLine_df.sample(1)
    valueCreditLine = {
    'Included' : random.choice([0,1]),                    
    'Type': random.choice(['Credit', 'Debit']),
    'Amount': { 'Currency': {
                        "Code": random.choice(Currency_df['Code']),
                        "Description": random.choice(Currency_df['Description']),
                },
               'Amount' : round(random.uniform(-3000, 99999999),2)
              }
    }
    CreditLineColumn.append(valueCreditLine)


PositionKeeping_df['Amount'] = AmountColumn
PositionKeeping_df['CreditLine'] = CreditLineColumn
print(PositionKeeping_df.sample(1))

                      DateTime CreditDebitIndicator      Type  \
184560  1994-08-31 08:02:29.00                Debit  Business   

                                                   Amount  \
184560  {'Currency': {'Code': 'EUR', 'Description': 'D...   

                                               CreditLine  
184560  {'Included': 1, 'Type': 'Credit', 'Amount': {'...  


In [42]:
len(PositionKeeping_df)

10000

In [43]:
PositionKeeping_df.sample(1)

Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine
979751,1991-01-13 17:51:32.00,Debit,Business,"{'Currency': {'Code': 'IND', 'Description': 'U...","{'Included': 0, 'Type': 'Debit', 'Amount': {'C..."


In [44]:
#Generamos 5M de registros basados en datos previos con PartyId unico
for x in range(14): # Generamos registros
    PositionKeeping_df = PositionKeeping_df.append(PositionKeeping_df.sample(frac=0.65,replace = True))

#Volcamos 5M de registros para su uso posterior
PositionKeeping_df = PositionKeeping_df.sample(2500000)
len(PositionKeeping_df)

2500000

In [45]:
PositionKeeping_df.sample(1)

Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine
88062,2016-06-14 16:08:22.00,Debit,Particular,"{'Currency': {'Code': 'USA', 'Description': 'U...","{'Included': 1, 'Type': 'Debit', 'Amount': {'C..."


In [46]:
PositionKeeping_df['AccountId'] = CurrentAccount_df['AccountId'].values
PositionKeeping_df.set_index('AccountId')
PositionKeeping_df.reindex()
PositionKeeping_df.sample(1)

Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine,AccountId
826211,1994-07-15 21:05:02.00,Credit,Particular,"{'Currency': {'Code': 'EUR', 'Description': 'U...","{'Included': 1, 'Type': 'Credit', 'Amount': {'...",7b205aa2-ba4b-11ec-9dcd-892344612080


In [47]:
#Volcado fichero JSON
PositionKeeping_df.to_json(PositionKeepingCollection_sample_out,orient ='records') #Final file
#PositionKeeping_df.to_json(PositionKeepingCollection_file_out,orient ='records') #Final file
