Script de medición y carga de las colecciones en cluster Cassandra
TFM Daniel Herranz Segundo

In [1]:
!pip install cassandra-driver
!pip install install scales
!pip install numpy



In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import time
from pprint import pprint
import psutil
import uuid
from cassandra.query import tuple_factory
from cassandra.query import dict_factory
from cassandra.query import BatchStatement, SimpleStatement
from cassandra.policies import RetryPolicy

In [2]:
def save_results_to_csv(results,file,consistencyLevel):
    #Guardamos los resultados en csv
    from datetime import datetime
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S") 
    data = results
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    file = file.format(consistencyLevel, str(dia))
    print(file)
    results_df.to_csv(file)

In [3]:
resultados_etl_CutomerProfileKeySpace = '../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_{}_CustomerProfile_{}.csv'
resultados_etl_PositionKeepingKeySpace = '../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_{}_PositionKeeping_{}.csv'
resultados_etl_CurrentAccountKeySpace = '../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_{}_CurrentAccount_{}.csv'

In [4]:
#repeticiones
repeats = 1000
test_wc = False

In [6]:
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra.policies import WhiteListRoundRobinPolicy, DowngradingConsistencyRetryPolicy
from cassandra.query import tuple_factory
from cassandra import ConsistencyLevel

profile = ExecutionProfile(
    load_balancing_policy=WhiteListRoundRobinPolicy(['127.0.0.1','172.17.0.2','172.17.0.3','172.17.0.4' ]),
    retry_policy=DowngradingConsistencyRetryPolicy(),
    consistency_level=ConsistencyLevel.ALL,
    serial_consistency_level=ConsistencyLevel.LOCAL_SERIAL,
    request_timeout=3600,
    row_factory=tuple_factory
)
cluster = Cluster(execution_profiles={EXEC_PROFILE_DEFAULT: profile})
session = cluster.connect()
print(session.execute("SELECT release_version FROM system.local").one())

session.execute('USE currentaccountkeyspace')

('4.0.3',)


<cassandra.cluster.ResultSet at 0x7f86a58c29d0>

## Carga de los documentos por cada dominio

In [7]:
CurrentAccountKeyspace_file_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountKeyspace.csv'
CurrentAccountKeyspaceAccountInfo_file_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountAccountInfoKeyspace.csv'
PositionKeepingKeyspace_file_out = '../MockData/Cassandra/PositionKeepingKeyspace/PositionKeepingKeyspace.csv'
CustomerProfileKeyspace_file_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileKeyspace.csv'
CustomerProfileAddressKeyspace_file_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileAddressKeyspace.csv'

CurrentAccountKeyspace_sample_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountKeyspace_sample.csv'
CurrentAccountKeyspaceAccountInfo_sample_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountAccountInfoKeyspace_sample.csv'
PositionKeepingKeyspace_sample_out = '../MockData/Cassandra/PositionKeepingKeyspace/PositionKeepingKeyspace_sample.csv'
CustomerProfileKeyspaceCustomer_sample_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileKeyspace_sample.csv'
CustomerProfileKeyspaceAddress_sample_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileAddressKeyspace_sample.csv'

### Test de carga CustomerProfileCollection

In [27]:
#Carga de la información de dataframes por entidades
CustomerProfileCollection_df = pd.read_csv(CustomerProfileKeyspaceCustomer_sample_out) #Carga Sample
Address_df = pd.read_csv(CustomerProfileKeyspaceAddress_sample_out) #Carga Sample

#CustomerProfileCollection_df = pd.read_json(CustomerProfileCollection_file_out) 
#CurrentAccountCollection_df = pd.read_json(CurrentAccountCollection_file_out) 
#PositionKeepingCollection_df = pd.read_json(PositionKeepingCollection_file_out) 

In [28]:
print("CustomerProfileCollection_df",len(CustomerProfileCollection_df))
print("Address_df",len(Address_df))

CustomerProfileCollection_df 1000
Address_df 1000


In [29]:
for x in range(17): # Generamos un millon de registros aprox
    CustomerProfileCollection_df = CustomerProfileCollection_df.append(CustomerProfileCollection_df.sample(frac=0.60,replace = True))

for x in range(17):
    Address_df = Address_df.append(Address_df.sample(frac=0.60,replace = True))


In [30]:
print(len(CustomerProfileCollection_df))
print(len(Address_df))

2951634
2951634


In [31]:
num_documentos = 1000000

In [32]:
CustomerProfileCollection_df = CustomerProfileCollection_df.sample(num_documentos)
print(len(CustomerProfileCollection_df))
CustomerProfileCollection_df.sample(1)

1000000


Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone
984,4928,4928,sol,Crystel,Ms. Euna Kihn,Dr.,4,US.OBIE.Principal,alivia.wilkinson@example.net,1-615-618-45


In [33]:
Address_df = Address_df.sample(num_documentos)
print(len(Address_df))
Address_df.sample(1)

1000000


Unnamed: 0,PartyId,AddressType,AddressLine,StreetName,BuildingNumber,PostCode,TownName,CountrySubDivision,CountryId,Country_Code,Country_ShortName,Country_Description
482,775441,West,3229 Haley Mountains Suite 241,Missouri Dale,543,45955-0753,Thompsonshire,32787241.0,3,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...


In [34]:
sample_mode = True #En cargas masivas reales, poner a false

if(sample_mode):
    list_partyId = CustomerProfileCollection_df['PartyId'].map(lambda x: str(uuid.uuid1()))
    CustomerProfileCollection_df['PartyId'] = list_partyId

In [35]:
CustomerProfileCollection_df.sample(5)

Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone
60,888ad4f2-c08f-11ec-b859-8d4f375c14a8,3494,pri,Lonie,Ariane Satterfield,Dr.,6,US.OBIE.Principal,dave43@example.com,064.154.6629
480,882de1ac-c08f-11ec-b859-8d4f375c14a8,7278,sol,Josephine,Abdullah Swift,Prof.,2,US.OBIE.Principal,mollie03@example.net,07868407036
97,891e4c6e-c08f-11ec-b859-8d4f375c14a8,3922,pri,Jean,Whitney Gulgowski,Mrs.,2,DE.OBIE.Principal,koss.kaci@example.net,105.061.0322
605,895e61c8-c08f-11ec-b859-8d4f375c14a8,9494,pri,Heath,Ivah Stehr,Mrs.,2,US.OBIE.Principal,hyatt.arden@example.com,+10(8)465193
203,8833d530-c08f-11ec-b859-8d4f375c14a8,3766,pri,Ulices,Ms. Daisha Toy III,Dr.,7,UK.OBIE.Principal,gorczany.lucio@example.org,230-726-0311


In [36]:
del(Address_df['PartyId'])
Address_df.sample(5)

Unnamed: 0,AddressType,AddressLine,StreetName,BuildingNumber,PostCode,TownName,CountrySubDivision,CountryId,Country_Code,Country_ShortName,Country_Description
3,West,3246 Veum Coves Suite 975,Orie Circle,465,89545-0393,Hoegerside,9.0,2,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...
92,Nort,7504 Goldner Ridge,Walsh Street,36257,77295-4427,Isabellemouth,24379.0,2,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...
720,Nort,916 Stehr Forge,Marjory Coves,17984,19151-8138,East Elsafurt,539928.0,1,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...
980,Sout,076 Kub Mall,Prosacco Throughway,65264,32060-8205,Kaleighfurt,7086.0,1,GBP,USA,Id dolorem a soluta harum iusto qui repellat. ...
344,Sout,6481 Terry Parkway Suite 551,Simonis Key,15174,02484,Lenoreview,26250.0,2,GBP,USA,Id dolorem a soluta harum iusto qui repellat. ...


In [162]:
import random

def getRandomInsertAddress():    
    time_inicial = time.time()
    AddressSample = Address_df.sample(1);
    #CustomerSample = CustomerProfileCollection_df.sample(1)
    
    #('c59ef4f2-bfb5-11ec-b0d9-ff5473b2f02a', 'IND', '2724 Charity Station', 'West', '784', 'Omnis dolore id dicta consequatur sit omnis. Eaque impedit debitis consequatur impedit aut quo et blanditiis.', 'Spain', '36623111.0', '26429', 'Antone Cliff', 'West Marieport')
    
    values = (
        random.choice(list_partyId),
        str(AddressSample['Country_Code'].iloc[0]).replace("'",""),
        str(AddressSample['AddressLine'].iloc[0]).replace("'",""),
        str(AddressSample['AddressType'].iloc[0]).replace("'",""),
        str(AddressSample['BuildingNumber'].iloc[0]).replace("'",""),
        str(AddressSample['Country_Description'].iloc[0]).replace("'",""),
        str(AddressSample['Country_ShortName'].iloc[0]).replace("'",""),
        str(AddressSample['CountrySubDivision'].iloc[0]).replace("'",""),
        str(AddressSample['PostCode'].iloc[0]).replace("'",""),
        str(AddressSample['StreetName'].iloc[0]).replace("'",""),
        str(AddressSample['TownName'].iloc[0]).replace("'","")
    )
    print(time.time() - time_inicial)
    return values

In [185]:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
from cassandra import concurrent

registers = []
batch = BatchStatement(consistency_level=ConsistencyLevel.ALL)
INSERT_STMT = """INSERT INTO CustomerProfile (partyid, partynumber, partytype, name, fulllegalname, legalstructure, beneficialownership, accountrole, emailaddress, phone) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
#INSERT_ADDRESS_STMT = """INSERT INTO CustomerProfileAddress (partyid,country_code,addressline, addresstype, buildingnumber,  country_description, country_shortname, countrysubdivision, postcode, streetname, townname) VALUES ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');"""
INSERT_ADDRESS_STMT = """INSERT INTO CustomerProfileAddress (partyid,country_code,addressline, addresstype, buildingnumber,  country_description, country_shortname, countrysubdivision, postcode, streetname, townname) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
batch.clear()
#Cargas Masiva con Many
def loadCollectionDataFrameCustomerProfile(df,keyspace):
    session = cluster.connect(keyspace)
    BATCH_SIZE = 50
    TRX_SIZE = 1000
    iter = 0;
    i = 1
    time_inicial = time.time()
    
    for h,item in df.iterrows(): 
        
        values = (
            str(item[0].replace("'","")),
            str(item[1]),
            str(item[2].replace("'","")),
            str(item[3].replace("'","")),
            str(item[4].replace("'","")),
            str(item[5].replace("'","")),
            str(item[6]),
            str(item[7].replace("'","")),
            str(item[8].replace("'","")),
            str(item[9].replace("'",""))
        )
            
        batch.add(INSERT_STMT, values) 
        
        values_add = (
            str(item[0].replace("'","")),
            random.choice(['SPA','GBP', 'IND', 'USA']), 
            'addressline', 
            'Type', 
            'BuildingNumber',
            'Country_Description',
            'Country_ShortName',
            'CountrySubDivision',
            'PostCode',
            'StreetName',
            'TownName'
        )
        
        batch.add(INSERT_ADDRESS_STMT, values_add)
                                  
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH
            session = cluster.connect(keyspace)
            session.execute(batch)
            batch.clear()
        if(i % TRX_SIZE == 0 and i > 0):
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))
            if(i % 10000 == 0):
                print((iter,data_time_collection,used_cpu,mem_used))
            iter += TRX_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [186]:
session.execute('TRUNCATE CustomerProfileKeySpace.CustomerProfile;')
session.execute('TRUNCATE CustomerProfileKeySpace.CustomerProfileAddress;')

<cassandra.cluster.ResultSet at 0x7f8685163990>

In [187]:
registers = loadCollectionDataFrameCustomerProfile(CustomerProfileCollection_df, 'customerprofilekeyspace')

(9000, 0.377, 32.5, 93.9)
(19000, 0.375, 33.0, 92.6)
(29000, 0.367, 32.5, 92.6)
(39000, 0.379, 35.1, 92.6)
(49000, 0.374, 38.8, 92.6)
(59000, 0.381, 35.1, 92.6)
(69000, 0.378, 31.7, 92.6)
(79000, 0.382, 35.1, 92.6)
(89000, 0.383, 33.5, 92.6)
(99000, 0.383, 29.2, 92.6)
(109000, 0.38, 33.3, 92.6)
(119000, 0.381, 34.1, 92.6)
(129000, 0.386, 32.0, 92.6)
(139000, 0.374, 32.5, 92.7)
(149000, 0.411, 38.8, 92.7)
(159000, 0.376, 35.3, 92.6)
(169000, 0.409, 38.1, 92.6)
(179000, 0.382, 35.2, 92.6)
(189000, 0.373, 34.3, 92.6)
(199000, 0.381, 33.2, 92.6)
(209000, 0.389, 35.9, 92.6)
(219000, 0.422, 38.0, 92.6)
(229000, 0.411, 52.7, 92.6)
(239000, 0.426, 40.4, 92.7)
(249000, 0.377, 35.6, 92.6)
(259000, 0.386, 34.9, 92.6)
(269000, 0.441, 51.4, 92.6)
(279000, 0.378, 34.9, 92.6)
(289000, 0.373, 34.3, 92.5)
(299000, 0.412, 39.4, 92.6)
(309000, 0.378, 33.5, 92.6)
(319000, 0.377, 33.5, 92.6)
(329000, 0.383, 36.2, 92.6)
(339000, 0.388, 33.0, 92.5)
(349000, 0.374, 35.0, 92.6)
(359000, 0.378, 34.3, 92.6)
(369

In [188]:
#Guardamos el caso one-to-one cassandra
save_results_to_csv(registers,resultados_etl_CutomerProfileKeySpace,'ALL')

../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_ALL_CustomerProfile_20042022_13_54_29.csv


## CurrentAccount

In [144]:
session = cluster.connect('currentaccountkeyspace')
session.execute('TRUNCATE TABLE currentaccount;')
session.execute('TRUNCATE TABLE currentaccountbyschemename;')

<cassandra.cluster.ResultSet at 0x7f31e16470d0>

In [145]:
CurrentAccount_df = pd.read_csv(CurrentAccountKeyspace_sample_out) #Carga Sample
AccountInfo_df = pd.read_csv(CurrentAccountKeyspaceAccountInfo_sample_out) #Carga Sample

In [146]:
del(AccountInfo_df['AccountId'])
AccountInfo_df.sample(1)

Unnamed: 0,SchemeName,Identification,Name
58,,6.0,Hagenes-Stracke


In [147]:
for x in range(17): # Generamos un millon de registros aprox
    CurrentAccount_df = CurrentAccount_df.append(CurrentAccount_df.sample(frac=0.60,replace = True))

In [148]:
print(len(CurrentAccount_df))
print(len(AccountInfo_df))

2951634
250


In [149]:
del(CurrentAccount_df['PartyId'])
CurrentAccount_df = CurrentAccount_df.sample(num_documentos)
len(CurrentAccount_df)

1000000

In [150]:
sample_mode = True #En cargas masivas reales, poner a false

if(sample_mode):     
    list_ca = CurrentAccount_df['AccountId'].map(lambda x: str(uuid.uuid1()))
    CurrentAccount_df['AccountId'] = list_ca
    CurrentAccount_df.set_index('AccountId')


In [151]:
CurrentAccount_df['PartyId'] = list_partyId.values
shiftPos = CurrentAccount_df.pop('PartyId')
CurrentAccount_df.insert(0, 'PartyId', shiftPos)

In [152]:
CurrentAccount_df.sample(3)

Unnamed: 0,PartyId,AccountId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType
518,bd29e742-bfc8-11ec-b0d9-ff5473b2f02a,484c4a6c-bfcf-11ec-b0d9-ff5473b2f02a,Enabled,2007-10-03 19:47:09.00,Business,utilize integrated schemas,2006-06-04 22:31:36.00,SAI
821,bd83607e-bfc8-11ec-b0d9-ff5473b2f02a,48a6b43e-bfcf-11ec-b0d9-ff5473b2f02a,Enabled,1990-06-20 10:48:10.00,Business,deliver interactive interfaces,1991-11-15 20:22:02.00,SAI
147,be0f6466-bfc8-11ec-b0d9-ff5473b2f02a,49342c92-bfcf-11ec-b0d9-ff5473b2f02a,Disabled,1976-03-17 20:27:44.00,Business,cultivate viral applications,2011-07-13 17:02:21.00,POI


In [153]:
import random

def getRandomAccountInfoList():
    ai_list = []
    num_add = random.randint(2, 4)
    #print("num_add:", num_add)
    for i in range(0,num_add):        
        ai_sample = AccountInfo_df.sample(1);       
        tuple_add = (            
            str(ai_sample['Name'].iloc[0]).replace("'",""),
            str(ai_sample['Identification'].iloc[0]).replace("'",""),
            str(ai_sample['SchemeName'].iloc[0]).replace("'","")
        ) 
        ai_list.append(tuple_add)  
    return ai_list

In [154]:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
from cassandra import concurrent

registers = []
batch = BatchStatement(consistency_level=ConsistencyLevel.ALL)

#Cargas Masiva con Many
def loadCollectionDataFrameCurrentAccount(df,keyspace):
    session = cluster.connect(keyspace)
    BATCH_SIZE = 90
    TRX_SIZE = 1000
    iter = 0;
    i = 1
        
    time_inicial = time.time()
    for h,item in df.iterrows(): 

        values = (
            str(item[0].replace("'","")),
            str(item[1].replace("'","")),
            str(item[2].replace("'","")),
            str(item[3].replace("'","")),
            str(item[4].replace("'","")),
            str(item[5].replace("'","")),
            str(item[6].replace("'","")))
        
        INSERT_STMT = """INSERT INTO CurrentAccount (partyid, accountid, status, statusupdatedatetime, accountType, nickName, openingDate ) VALUES (%s, %s, %s, %s, %s, %s, %s);"""            
        #session.execute(INSERT_STMT, values)
        batch.add(INSERT_STMT, values)
        
        INSERT_STMT_INFO = """UPDATE CurrentAccount SET accountinfo = {} WHERE accountid = '{}';""".format( getRandomAccountInfoList(), str(item[1].replace("'","")))                    
        #session.execute(INSERT_STMT_INFO)
        batch.add(INSERT_STMT_INFO)
        
        INSERT_STMT_SCHEMENAME = """INSERT INTO currentaccountbyschemename (accountid, schemename) values (%s, %s);"""                    
        #session.execute(INSERT_STMT_SCHEMENAME,( str(item[1].replace("'","")), str(AccountInfo_df.sample(1)['SchemeName'].iloc[0]).replace("'","") ))
        batch.add(INSERT_STMT_SCHEMENAME,( str(item[1].replace("'","")), str(AccountInfo_df.sample(1)['SchemeName'].iloc[0]).replace("'","") ))
                                      
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH
            session = cluster.connect(keyspace)
            session.execute(batch)
            batch.clear()
        if(i % TRX_SIZE == 0 and i > 0):
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))
            if(i % 10000 == 0 and i > 0):
                print((iter,data_time_collection,used_cpu,mem_used))
            iter += TRX_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [155]:
registers = loadCollectionDataFrameCurrentAccount(CurrentAccount_df, 'currentaccountkeyspace')

(9000, 2.205, 16.3, 93.7)
(19000, 2.343, 16.6, 93.7)
(29000, 2.255, 17.3, 93.6)
(39000, 2.267, 19.1, 93.7)
(49000, 2.193, 17.5, 93.7)
(59000, 2.193, 16.3, 93.7)
(69000, 2.247, 16.6, 93.6)
(79000, 2.191, 18.6, 93.7)
(89000, 2.488, 27.4, 93.8)
(99000, 2.376, 21.8, 93.8)
(109000, 2.328, 21.7, 93.8)
(119000, 2.44, 24.5, 93.9)
(129000, 2.323, 29.7, 94.0)
(139000, 2.404, 68.4, 97.2)
(149000, 2.223, 34.6, 97.2)
(159000, 2.324, 37.7, 93.8)
(169000, 2.231, 24.9, 94.1)
(179000, 2.214, 46.5, 94.0)
(189000, 2.416, 21.9, 93.1)
(199000, 2.129, 34.3, 94.5)
(209000, 2.198, 23.3, 93.5)
(219000, 2.227, 27.3, 93.5)
(229000, 2.292, 19.2, 92.9)
(239000, 2.32, 18.6, 92.7)
(249000, 2.354, 19.7, 92.8)
(259000, 2.281, 18.2, 92.7)
(269000, 2.378, 18.8, 92.7)
(279000, 2.274, 36.3, 93.2)
(289000, 2.221, 37.1, 93.3)
(299000, 2.355, 32.0, 93.5)
(309000, 2.238, 36.6, 93.5)
(319000, 2.365, 23.4, 93.4)
(329000, 2.372, 26.6, 93.4)
(339000, 2.247, 24.3, 93.3)
(349000, 2.205, 35.5, 93.5)
(359000, 2.219, 33.1, 93.2)
(3690

In [156]:
#Guardamos el caso one-to-one cassandra
save_results_to_csv(registers,resultados_etl_CurrentAccountKeySpace,'ALL')

../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_ALL_CurrentAccount_19042022_13_34_21.csv


In [18]:
PositionKeeping_df = pd.read_csv(PositionKeepingKeyspace_sample_out) #Carga Sample
PositionKeeping_df.sample(1)

Unnamed: 0,AccountId,DateTime,CreditDebitIndicator,Type,Amount_Amount,Amount_Currency_Description,Amount_Currency_Code,CreditLine_Amount,CreditLine_Currency_Description,CreditLine_Currency_Code,CreditLine_Type,CreditLine_Included
64,7603,2017-01-19 20:12:14.00,Credit,Particular,73259.854326,Est quidem repellendus dolor qui. Eos temporib...,USA,0.0,Ut cumque sint laudantium quis impedit. Qui do...,GBP,,0


In [19]:
for x in range(17): # Generamos un millon de registros aprox
    PositionKeeping_df = PositionKeeping_df.append(PositionKeeping_df.sample(frac=0.60,replace = True))

In [20]:
PositionKeeping_df = PositionKeeping_df.sample(num_documentos)

In [21]:
session.execute('USE currentaccountkeyspace')
print("""SELECT AccountId FROM currentaccountkeyspace.currentaccount;""".format(num_documentos))
#result = session.execute("""SELECT AccountId FROM currentaccountkeyspace.currentaccount limit {};""".format(num_documentos))
result = session.execute("""SELECT AccountId FROM currentaccountkeyspace.currentaccount;""")
list_ca = []
for account in result:
    list_ca.append(account[0])
print(len(list_ca))

SELECT AccountId FROM currentaccountkeyspace.currentaccount;
999879


In [22]:
PositionKeeping_df['AccountId'] = PositionKeeping_df['AccountId'].map(lambda x: random.choice(list_ca))
PositionKeeping_df.sample(1)

Unnamed: 0,AccountId,DateTime,CreditDebitIndicator,Type,Amount_Amount,Amount_Currency_Description,Amount_Currency_Code,CreditLine_Amount,CreditLine_Currency_Description,CreditLine_Currency_Code,CreditLine_Type,CreditLine_Included
128,48b4e8ce-bfcf-11ec-b0d9-ff5473b2f02a,2007-04-15 12:35:54.00,Credit,Particular,0.0,Ut necessitatibus in iure cum et sint. Minima ...,USA,1548.881492,Est quidem repellendus dolor qui. Eos temporib...,IND,,0


In [24]:
batch = BatchStatement(consistency_level=ConsistencyLevel.ALL)

def loadCollectionDataFramePositionKeepingBatch(df,keyspace):
    registers = []
    session = cluster.connect(keyspace)
    BATCH_SIZE = 50
    TRX_SIZE = 1000
    iter = 0;
    i = 1
    time_inicial = time.time()
    for h,item in df.iterrows(): 
        #print(item)
        values = ( 
            str(item[0].replace("'","")),
            str(item[1].replace("'","")),
            str(item[2].replace("'","")),
            str(item[3].replace("'","")),
            item[4],
            str(item[5].replace("'","")),
            str(item[6].replace("'","")),
            item[7],
            str(item[9].replace("'","")),
            'typed',
            bool(item[11]))
        INSERT_STMT = """INSERT INTO PositionKeeping (
        accountid,
        datetime,
        creditdebtindicator,
        type,
        amount_amount,
        amount_currency_description,
        amount_currency_code, 
        credit_line_amount, 
        credit_line_currency_code, 
        creditline_type, 
        creditline_included) 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s , %s);"""
        
        #print(INSERT_STMT)
        batch.add(INSERT_STMT, values)
                        
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH
            session = cluster.connect(keyspace)
            session.execute_async(batch) 
            batch.clear()
        if(i % TRX_SIZE == 0 and i > 0):
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))  
            if(i % 10000 == 0 and i > 0):
                print((iter,data_time_collection,used_cpu,mem_used))
            iter += TRX_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [25]:
def f(x):
    try:
        return np.float(x)
    except:
        return np.nan
    
#PositionKeeping_df.fillna(0, inplace=True)

PositionKeeping_df["CreditLine_Amount"] = PositionKeeping_df["CreditLine_Amount"].apply(f)
PositionKeeping_df["Amount_Amount"] = PositionKeeping_df["Amount_Amount"].apply(f)


session = cluster.connect('positionkeepingkeyspace')
session.execute('TRUNCATE TABLE positionkeeping;')
registers = loadCollectionDataFramePositionKeepingBatch(PositionKeeping_df, 'positionkeepingkeyspace')

(9000, 0.327, 87.6, 92.4)
(19000, 0.312, 72.2, 92.4)
(29000, 0.274, 72.2, 92.4)
(39000, 0.269, 51.7, 92.4)
(49000, 0.269, 51.3, 92.4)
(59000, 0.264, 50.0, 92.3)
(69000, 0.256, 48.2, 92.3)
(79000, 0.256, 40.6, 92.4)
(89000, 0.267, 51.6, 92.4)
(99000, 0.258, 40.9, 92.4)
(109000, 0.25, 36.5, 92.4)
(119000, 0.258, 38.8, 92.3)
(129000, 0.249, 44.7, 92.3)
(139000, 0.285, 37.7, 92.2)
(149000, 0.267, 40.3, 92.2)
(159000, 0.276, 52.7, 92.2)
(169000, 0.265, 42.3, 92.3)
(179000, 0.282, 54.0, 92.3)
(189000, 0.256, 36.2, 92.3)
(199000, 0.256, 38.2, 92.3)
(209000, 0.253, 36.2, 92.3)
(219000, 0.254, 38.3, 92.1)
(229000, 0.252, 37.7, 92.1)
(239000, 0.254, 34.6, 92.2)
(249000, 0.252, 43.8, 92.1)
(259000, 0.251, 41.3, 92.1)
(269000, 0.242, 40.5, 92.2)
(279000, 0.253, 36.6, 92.2)
(289000, 0.245, 38.0, 92.2)
(299000, 0.25, 38.4, 92.2)
(309000, 0.29, 36.9, 92.2)
(319000, 0.296, 38.3, 92.2)
(329000, 0.266, 37.3, 92.2)
(339000, 0.26, 39.2, 92.2)
(349000, 0.261, 37.3, 92.2)
(359000, 0.258, 36.6, 92.2)
(369000

In [26]:
#Guardamos el caso one-to-one cassandra
save_results_to_csv(registers,resultados_etl_PositionKeepingKeySpace,'ALL')

../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_ALL_PositionKeeping_20042022_11_44_56.csv
