Script de medición y carga de las colecciones en cluster Cassandra
TFM Daniel Herranz Segundo

In [1]:
!pip install cassandra-driver
!pip install install scales
!pip install numpy



In [2]:
import pandas as pd
import numpy as np
import os
import json
import random
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import time
from pprint import pprint
import psutil
import uuid

In [3]:
def save_results_to_csv(results,file,consistencyLevel):
    #Guardamos los resultados en csv
    from datetime import datetime
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S") 
    data = results
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    file = file.format(consistencyLevel, str(dia))
    print(file)
    results_df.to_csv(file)

In [4]:
resultados_etl_CutomerProfileKeySpace = '../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_{}_CustomerProfile_{}.csv'
resultados_etl_PositionKeepingKeySpace = '../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_{}_PositionKeeping_{}.csv'
resultados_etl_CurrentAccountKeySpace = '../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_{}_CurrentAccount_{}.csv'

In [5]:
#repeticiones
repeats = 1000
test_wc = False

In [6]:
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra.policies import WhiteListRoundRobinPolicy, DowngradingConsistencyRetryPolicy
from cassandra.query import tuple_factory
from cassandra import ConsistencyLevel

profile = ExecutionProfile(
    load_balancing_policy=WhiteListRoundRobinPolicy(['127.0.0.1']),
    retry_policy=DowngradingConsistencyRetryPolicy(),
    consistency_level=ConsistencyLevel.ALL,
    serial_consistency_level=ConsistencyLevel.LOCAL_SERIAL,
    request_timeout=3600,
    row_factory=tuple_factory
)
cluster = Cluster(execution_profiles={EXEC_PROFILE_DEFAULT: profile})
session = cluster.connect()
print(session.execute("SELECT release_version FROM system.local").one())

session.execute('USE currentaccountkeyspace')

rows = session.execute('SELECT count(*) FROM currentaccount')

for row in rows:
    print(row)

('4.0.3',)
(100000,)


## Carga de los documentos por cada dominio

In [7]:
CurrentAccountKeyspace_file_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountKeyspace.csv'
CurrentAccountKeyspaceAccountInfo_file_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountAccountInfoKeyspace.csv'
PositionKeepingKeyspace_file_out = '../MockData/Cassandra/PositionKeepingKeyspace/PositionKeepingKeyspace.csv'
CustomerProfileKeyspace_file_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileKeyspace.csv'
CustomerProfileAddressKeyspace_file_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileAddressKeyspace.csv'

CurrentAccountKeyspace_sample_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountKeyspace_sample.csv'
CurrentAccountKeyspaceAccountInfo_sample_out = '../MockData/Cassandra/CurrentAccountKeyspace/CurrentAccountAccountInfoKeyspace_sample.csv'
PositionKeepingKeyspace_sample_out = '../MockData/Cassandra/PositionKeepingKeyspace/PositionKeepingKeyspace_sample.csv'
CustomerProfileKeyspaceCustomer_sample_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileKeyspace_sample.csv'
CustomerProfileKeyspaceAddress_sample_out = '../MockData/Cassandra/CustomerProfileKeyspace/CustomerProfileAddressKeyspace_sample.csv'

### Test de carga CustomerProfileCollection

In [8]:
#Carga de la información de dataframes por entidades
CustomerProfileCollection_df = pd.read_csv(CustomerProfileKeyspaceCustomer_sample_out) #Carga Sample
Address_df = pd.read_csv(CustomerProfileKeyspaceAddress_sample_out) #Carga Sample

#CustomerProfileCollection_df = pd.read_json(CustomerProfileCollection_file_out) 
#CurrentAccountCollection_df = pd.read_json(CurrentAccountCollection_file_out) 
#PositionKeepingCollection_df = pd.read_json(PositionKeepingCollection_file_out) 

In [9]:
print("CustomerProfileCollection_df",len(CustomerProfileCollection_df))
print("Address_df",len(Address_df))

CustomerProfileCollection_df 1000
Address_df 1000


In [10]:
for x in range(17): # Generamos un millon de registros aprox
    CustomerProfileCollection_df = CustomerProfileCollection_df.append(CustomerProfileCollection_df.sample(frac=0.60,replace = True))

for x in range(17):
    Address_df = Address_df.append(Address_df.sample(frac=0.60,replace = True))


In [11]:
print(len(CustomerProfileCollection_df))
print(len(Address_df))

2951634
2951634


In [12]:
num_documentos = 100000

In [13]:
CustomerProfileCollection_df = CustomerProfileCollection_df.sample(num_documentos)
print(len(CustomerProfileCollection_df))
CustomerProfileCollection_df.sample(1)

100000


Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone
480,7278,7278,sol,Josephine,Abdullah Swift,Prof.,2,US.OBIE.Principal,mollie03@example.net,7868407036


In [14]:
Address_df = Address_df.sample(num_documentos)
print(len(Address_df))
Address_df.sample(1)

100000


Unnamed: 0,PartyId,AddressType,AddressLine,StreetName,BuildingNumber,PostCode,TownName,CountrySubDivision,CountryId,Country_Code,Country_ShortName,Country_Description
3,611071,West,3246 Veum Coves Suite 975,Orie Circle,465,89545-0393,Hoegerside,9.0,2,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...


In [15]:
sample_mode = True #En cargas masivas reales, poner a false

if(sample_mode):
    list_partyId = CustomerProfileCollection_df['PartyId'].map(lambda x: str(uuid.uuid1()))
    CustomerProfileCollection_df['PartyId'] = list_partyId

In [16]:
CustomerProfileCollection_df.sample(5)

Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone
175,24d573ba-bf20-11ec-ba50-f9047b14a097,4955,pri,Kaycee,Miss Marjorie McDermott MD,Dr.,6,US.OBIE.Principal,vjohns@example.net,09626729974
128,25024d2c-bf20-11ec-ba50-f9047b14a097,9901,pri,Derek,Asia Dickens,Prof.,0,DE.OBIE.Principal,qziemann@example.com,317.384.5984
344,24f9b04a-bf20-11ec-ba50-f9047b14a097,8617,sol,Evelyn,Jaleel Waters,Prof.,4,UK.OBIE.Principal,boyer.eriberto@example.com,505.687.7901
690,24fcc9f6-bf20-11ec-ba50-f9047b14a097,3940,pri,Celestine,Sasha Leuschke,Mr.,9,DE.OBIE.Principal,metz.elwyn@example.com,(387)317-808
435,24ebb2ba-bf20-11ec-ba50-f9047b14a097,8091,sol,Charity,Jermaine Hoppe Jr.,Prof.,2,US.OBIE.Principal,jenkins.trisha@example.org,(364)681-107


In [17]:
del(Address_df['PartyId'])
Address_df.sample(5)

Unnamed: 0,AddressType,AddressLine,StreetName,BuildingNumber,PostCode,TownName,CountrySubDivision,CountryId,Country_Code,Country_ShortName,Country_Description
15,Lake,76173 Bartell Mill Suite 539,Bogisich Light,7709,03999,Johnstonton,4393.0,1,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...
330,Port,51148 Hammes Course Suite 950,Osvaldo Mount,80646,13281-2733,Gaylordside,84563.0,2,GBP,USA,Id dolorem a soluta harum iusto qui repellat. ...
604,Nort,60371 Gorczany Keys,Corwin Wall,604,13494-0031,New Eduardostad,46098.0,1,GBP,USA,Id dolorem a soluta harum iusto qui repellat. ...
835,Sout,83420 Rico Fields,Maryjane Vista,71577,12918,East Gailbury,3260878.0,3,IND,Spain,Omnis dolore id dicta consequatur sit omnis. E...
695,Port,864 Swaniawski Light,Cordie Stream,4625,55382,Gusikowskiberg,26521.0,2,GBP,USA,Id dolorem a soluta harum iusto qui repellat. ...


In [18]:
import random

def getRandomAddresList():
    address_list = []
    num_add = random.randint(1, 3)
    for i in range(1,num_add):        
        AddressSample = Address_df.sample(1);
        tuple_add = (
            str(AddressSample['AddressType'].iloc[0]).replace("'",""),
            str(AddressSample['AddressLine'].iloc[0]).replace("'",""),
            str(AddressSample['StreetName'].iloc[0]).replace("'",""),
            str(AddressSample['TownName'].iloc[0]).replace("'",""),
            str(AddressSample['BuildingNumber'].iloc[0]).replace("'",""),
            str(AddressSample['PostCode'].iloc[0]).replace("'",""),
            str(AddressSample['CountrySubDivision'].iloc[0]).replace("'",""),
            str(AddressSample['Country_Code'].iloc[0]).replace("'",""),
            str(AddressSample['Country_ShortName'].iloc[0]).replace("'",""),
            str(AddressSample['Country_Description'].iloc[0]).replace("'","")
        )        
        address_list.append(tuple_add)    
        
    return address_list


In [19]:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
from cassandra import concurrent

registers = []

#Cargas Masiva con Many
def loadCollectionDataFrameCustomerProfile(df,keyspace):
    session = cluster.connect(keyspace)
    BATCH_SIZE = 1000
    iter = 0;
    i = 1
    time_inicial = time.time()
    for h,item in df.iterrows(): 
        INSERT_STMT = """INSERT INTO CustomerProfile (partyid, partynumber, partytype, name, fulllegalname, legalstructure, beneficialownership, accountrole, emailaddress, phone, address) VALUES ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', {});""".format(
            item[0].replace("'",""),
            item[1],
            item[2].replace("'",""),
            item[3].replace("'",""),
            item[4].replace("'",""),
            item[5].replace("'",""),
            item[6],
            item[7].replace("'",""),
            item[8].replace("'",""),
            item[9].replace("'",""),
            getRandomAddresList()
        )
        session.execute_async(INSERT_STMT, trace = False)
                
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH   
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))            
            iter += BATCH_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [20]:
session.execute('TRUNCATE CustomerProfileKeySpace.CustomerProfile;')

<cassandra.cluster.ResultSet at 0x7f57cd35a8d0>

In [21]:
registers = loadCollectionDataFrameCustomerProfile(CustomerProfileCollection_df, 'customerprofilekeyspace')

In [22]:
registers

[(0, 2.145, 32.9, 89.8),
 (1000, 2.026, 31.0, 89.9),
 (2000, 2.088, 30.3, 89.9),
 (3000, 2.081, 27.7, 89.8),
 (4000, 2.086, 30.0, 89.8),
 (5000, 2.06, 27.9, 89.6),
 (6000, 1.976, 29.1, 89.6),
 (7000, 2.06, 30.2, 89.6),
 (8000, 2.023, 27.3, 89.6),
 (9000, 2.021, 26.5, 89.6),
 (10000, 1.959, 24.9, 89.6),
 (11000, 2.106, 28.4, 89.6),
 (12000, 2.06, 27.2, 89.6),
 (13000, 2.1, 26.7, 89.6),
 (14000, 2.087, 29.4, 89.6),
 (15000, 2.052, 24.0, 89.6),
 (16000, 1.98, 24.8, 89.6),
 (17000, 2.055, 25.8, 89.6),
 (18000, 2.075, 25.5, 89.6),
 (19000, 2.028, 25.6, 89.6),
 (20000, 2.064, 27.5, 89.6),
 (21000, 2.055, 26.6, 89.6),
 (22000, 2.023, 26.9, 89.6),
 (23000, 2.092, 35.1, 89.8),
 (24000, 2.061, 28.8, 89.7),
 (25000, 2.055, 27.0, 89.7),
 (26000, 2.049, 24.6, 89.7),
 (27000, 2.032, 25.8, 89.7),
 (28000, 2.11, 25.2, 89.7),
 (29000, 1.995, 24.7, 89.6),
 (30000, 2.06, 26.1, 89.6),
 (31000, 2.053, 29.3, 89.6),
 (32000, 2.088, 27.1, 89.6),
 (33000, 2.027, 26.4, 89.6),
 (34000, 2.002, 29.3, 89.6),
 (3500

In [23]:
save_results_to_csv(registers, resultados_etl_CutomerProfileKeySpace, 'ALL')

../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_ALL_CustomerProfile_18042022_16_05_39.csv


## CurrentAccount

In [24]:
session = cluster.connect('currentaccountkeyspace')
session.execute('TRUNCATE TABLE currentaccount;')

<cassandra.cluster.ResultSet at 0x7f57cd64d210>

In [25]:
CurrentAccount_df = pd.read_csv(CurrentAccountKeyspace_sample_out) #Carga Sample
AccountInfo_df = pd.read_csv(CurrentAccountKeyspaceAccountInfo_sample_out) #Carga Sample

In [26]:
del(AccountInfo_df['AccountId'])
AccountInfo_df.sample(1)

Unnamed: 0,SchemeName,Identification,Name
128,UK.OBIE.SortCodeAccountNumber,3.0,"Macejkovic, Ondricka and Brekke"


In [27]:
for x in range(17): # Generamos un millon de registros aprox
    CurrentAccount_df = CurrentAccount_df.append(CurrentAccount_df.sample(frac=0.60,replace = True))

In [28]:
print(len(CurrentAccount_df))
print(len(AccountInfo_df))

2951634
250


In [29]:
del(CurrentAccount_df['PartyId'])
CurrentAccount_df = CurrentAccount_df.sample(num_documentos)
len(CurrentAccount_df)

100000

In [30]:
sample_mode = True #En cargas masivas reales, poner a false

if(sample_mode):     
    list_ca = CurrentAccount_df['AccountId'].map(lambda x: str(uuid.uuid1()))
    CurrentAccount_df['AccountId'] = list_ca
    CurrentAccount_df.set_index('AccountId')


In [31]:
CurrentAccount_df['PartyId'] = list_partyId.values
shiftPos = CurrentAccount_df.pop('PartyId')
CurrentAccount_df.insert(0, 'PartyId', shiftPos)

In [32]:
CurrentAccount_df.sample(3)

Unnamed: 0,PartyId,AccountId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType
83,24ff86f0-bf20-11ec-ba50-f9047b14a097,a203c76a-bf20-11ec-ba50-f9047b14a097,Disabled,2015-04-20 15:42:56.00,Business,expedite customized mindshare,1998-09-10 09:30:03.00,POI
926,24de05e8-bf20-11ec-ba50-f9047b14a097,a1dac27a-bf20-11ec-ba50-f9047b14a097,Enabled,2019-02-23 07:57:15.00,Particular,cultivate customized e-markets,2017-08-04 00:10:05.00,POI
727,24fbca38-bf20-11ec-ba50-f9047b14a097,a1fff7d4-bf20-11ec-ba50-f9047b14a097,Disabled,2002-02-17 04:19:04.00,Business,exploit e-business convergence,1975-12-01 18:19:00.00,EDP


In [33]:
import random

def getRandomAccountInfoList():
    ai_list = []
    num_add = random.randint(1, 3)
    for i in range(1,num_add):        
        ai_sample = AccountInfo_df.sample(1);       
        tuple_add = (            
            str(ai_sample['Name'].iloc[0]).replace("'",""),
            str(ai_sample['Identification'].iloc[0]).replace("'",""),
            str(ai_sample['SchemeName'].iloc[0]).replace("'","")
        ) 
        ai_list.append(tuple_add)    
    return ai_list

In [34]:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
from cassandra import concurrent

registers = []

#Cargas Masiva con Many
def loadCollectionDataFrameCurrentAccount(df,keyspace):
    session = cluster.connect(keyspace)
    BATCH_SIZE = 1000
    iter = 0;
    i = 1
    time_inicial = time.time()
    for h,item in df.iterrows(): 
        INSERT_STMT = """INSERT INTO CurrentAccount (partyid, accountid, status, statusupdatedatetime, accountType, nickName, openingDate, accountinfo ) VALUES ('{}', '{}', '{}', '{}', '{}', '{}', '{}', {} );""".format(            
            item[0].replace("'",""),
            item[1].replace("'",""),
            item[2].replace("'",""),
            item[3].replace("'",""),
            item[4].replace("'",""),
            item[5].replace("'",""),
            item[6].replace("'",""),
            getRandomAccountInfoList()
        )
        #print(INSERT_STMT)
        session.execute(INSERT_STMT, trace = False)
        
                
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH   
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))            
            iter += BATCH_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [35]:
registers = loadCollectionDataFrameCurrentAccount(CurrentAccount_df, 'currentaccountkeyspace')

In [36]:
registers

[(0, 1.79, 27.7, 91.7),
 (1000, 2.101, 25.2, 91.7),
 (2000, 1.848, 25.6, 91.7),
 (3000, 2.382, 26.2, 91.6),
 (4000, 2.582, 26.3, 91.6),
 (5000, 2.458, 28.6, 91.6),
 (6000, 1.547, 23.4, 91.6),
 (7000, 2.01, 26.3, 91.6),
 (8000, 1.741, 25.5, 91.6),
 (9000, 1.916, 25.8, 91.6),
 (10000, 2.042, 24.9, 91.6),
 (11000, 2.208, 25.7, 91.6),
 (12000, 2.124, 25.7, 91.6),
 (13000, 2.289, 26.2, 91.6),
 (14000, 1.963, 27.4, 91.6),
 (15000, 2.049, 25.4, 91.6),
 (16000, 1.904, 27.3, 91.6),
 (17000, 2.265, 27.1, 91.6),
 (18000, 1.651, 29.5, 91.6),
 (19000, 1.801, 27.8, 91.6),
 (20000, 1.738, 27.7, 91.6),
 (21000, 1.646, 30.3, 91.6),
 (22000, 1.704, 27.3, 91.6),
 (23000, 1.503, 26.4, 91.6),
 (24000, 1.634, 27.7, 91.6),
 (25000, 1.393, 27.6, 91.6),
 (26000, 2.156, 26.6, 91.6),
 (27000, 1.936, 28.1, 91.6),
 (28000, 2.177, 25.4, 91.6),
 (29000, 1.679, 24.1, 91.6),
 (30000, 1.815, 24.8, 91.6),
 (31000, 1.859, 26.1, 91.6),
 (32000, 1.916, 24.3, 91.6),
 (33000, 1.882, 23.7, 91.6),
 (34000, 1.944, 24.5, 91.6),


In [37]:
session = cluster.connect()
session.execute('USE currentaccountkeyspace')
rows = session.execute('SELECT count(*) FROM CurrentAccount')
for row in rows:
    print(row)

(100000,)


In [38]:
CustomerProfileCollection_df = pd.read_csv(CustomerProfileKeyspaceCustomer_sample_out) #Carga Sample
Address_df = pd.read_csv(CustomerProfileKeyspaceAddress_sample_out)

In [39]:
PositionKeeping_df = pd.read_csv(PositionKeepingKeyspace_sample_out) #Carga Sample
PositionKeeping_df.sample(1)

Unnamed: 0,AccountId,DateTime,CreditDebitIndicator,Type,Amount_Amount,Amount_Currency_Description,Amount_Currency_Code,CreditLine_Amount,CreditLine_Currency_Description,CreditLine_Currency_Code,CreditLine_Type,CreditLine_Included
190,4555,1993-09-23 00:41:03.00,Debit,Particular,773327.9,Ut cumque sint laudantium quis impedit. Qui do...,IND,1266.0,Ut cumque sint laudantium quis impedit. Qui do...,EUR,,0


In [40]:
for x in range(17): # Generamos un millon de registros aprox
    PositionKeeping_df = PositionKeeping_df.append(PositionKeeping_df.sample(frac=0.60,replace = True))

In [41]:
PositionKeeping_df = PositionKeeping_df.sample(num_documentos)

In [42]:
PositionKeeping_df['AccountId'] = list_ca.values
PositionKeeping_df.sample(1)

Unnamed: 0,AccountId,DateTime,CreditDebitIndicator,Type,Amount_Amount,Amount_Currency_Description,Amount_Currency_Code,CreditLine_Amount,CreditLine_Currency_Description,CreditLine_Currency_Code,CreditLine_Type,CreditLine_Included
777,a1bfcb3c-bf20-11ec-ba50-f9047b14a097,1989-04-21 23:13:26.00,Credit,Business,88483150.0,Dolores sint vero vero maiores nisi. Et quas i...,EUR,21.642,Ut necessitatibus in iure cum et sint. Minima ...,EUR,,0


In [None]:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
from cassandra import concurrent

registers = []

#Cargas Masiva con Many
def loadCollectionDataFramePositionKeeping(df,keyspace):
    session = cluster.connect(keyspace)
    BATCH_SIZE = 1000
    iter = 0;
    i = 1
    time_inicial = time.time()
    for h,item in df.iterrows(): 
        #print(item)
        INSERT_STMT = """INSERT INTO PositionKeeping (
        accountid,
        datetime,
        creditdebtindicator,
        type,
        amount_amount,
        amount_currency_description,
        amount_currency_code, 
        credit_line_amount, 
        credit_line_currency_code, 
        creditline_type, 
        creditline_included) 
        VALUES ({}, '{}', '{}', '{}', {}, '{}', '{}', {}, '{}', '{}' , {});""".format(            
            item[0].replace("'",""),
            item[1].replace("'",""),
            item[2].replace("'",""),
            item[3].replace("'",""),
            item[4],
            item[5].replace("'",""),
            item[6].replace("'",""),
            item[7],
            item[9].replace("'",""),
            item[10],
            bool(item[11]))
        #print(INSERT_STMT)
        session.execute(INSERT_STMT, trace = False)
        
                
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH   
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))            
            iter += BATCH_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [65]:
registers = loadCollectionDataFramePositionKeeping(PositionKeeping_df, 'positionkeepingkeyspace')

In [66]:
registers

[(0, 1.706, 7.1, 93.2),
 (1000, 1.744, 30.2, 93.2),
 (2000, 0.876, 28.5, 93.2),
 (3000, 0.891, 28.7, 93.2),
 (4000, 1.327, 32.3, 93.3),
 (5000, 0.776, 22.7, 93.3),
 (6000, 0.766, 19.4, 93.3),
 (7000, 1.396, 24.7, 93.2),
 (8000, 1.998, 32.8, 93.2),
 (9000, 1.869, 25.9, 93.2),
 (10000, 1.688, 25.2, 93.2),
 (11000, 1.73, 25.0, 93.2),
 (12000, 1.619, 24.9, 93.2),
 (13000, 0.761, 23.2, 93.2),
 (14000, 0.736, 19.4, 93.2),
 (15000, 0.847, 21.8, 93.2),
 (16000, 1.761, 25.0, 93.2),
 (17000, 1.903, 27.9, 93.2),
 (18000, 1.494, 27.5, 93.2),
 (19000, 2.053, 26.2, 93.2),
 (20000, 2.044, 26.1, 93.2),
 (21000, 2.046, 25.4, 93.1),
 (22000, 2.049, 25.2, 93.2),
 (23000, 2.003, 26.1, 93.1),
 (24000, 2.34, 25.7, 93.1),
 (25000, 2.354, 26.2, 93.1),
 (26000, 2.357, 26.3, 93.1),
 (27000, 2.036, 27.2, 93.2),
 (28000, 3.243, 28.6, 93.2),
 (29000, 1.781, 36.9, 93.2),
 (30000, 1.766, 31.1, 93.2),
 (31000, 1.309, 29.2, 93.1),
 (32000, 1.597, 32.0, 93.2),
 (33000, 2.078, 26.7, 93.1),
 (34000, 1.906, 32.8, 93.1),
 

In [126]:
from cassandra.query import tuple_factory
from cassandra.query import dict_factory
from cassandra.query import BatchStatement, SimpleStatement
from cassandra.policies import RetryPolicy

batch = BatchStatement(consistency_level=ConsistencyLevel.ALL)

def loadCollectionDataFramePositionKeepingBatch(df,keyspace):
    registers = []
    session = cluster.connect(keyspace)
    BATCH_SIZE = 50
    TRX_SIZE = 1000
    iter = 0;
    i = 1
    time_inicial = time.time()
    for h,item in df.iterrows(): 
        #print(item)
        values = ( 
            str(item[0].replace("'","")),
            str(item[1].replace("'","")),
            str(item[2].replace("'","")),
            str(item[3].replace("'","")),
            item[4],
            str(item[5].replace("'","")),
            str(item[6].replace("'","")),
            item[7],
            str(item[9].replace("'","")),
            'typed',
            bool(item[11]))
        INSERT_STMT = """INSERT INTO PositionKeeping (
        accountid,
        datetime,
        creditdebtindicator,
        type,
        amount_amount,
        amount_currency_description,
        amount_currency_code, 
        credit_line_amount, 
        credit_line_currency_code, 
        creditline_type, 
        creditline_included) 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s , %s);"""
        
        #print(INSERT_STMT)
        batch.add(INSERT_STMT, values)
                        
        if(i % BATCH_SIZE == 0 and i > 0):
            ### FIN BATCH
            session = cluster.connect(keyspace)
            session.execute_async(batch) 
            batch.clear()
        if(i % TRX_SIZE == 0 and i > 0):
            time_final = time.time()           
            data_time_collection = round(time_final - time_inicial,3)
            used_cpu = psutil.cpu_percent()
            mem_used = psutil.virtual_memory().percent
            registers.append((iter,data_time_collection,used_cpu,mem_used))            
            iter += TRX_SIZE;
            time_inicial = time.time()
        i = i + 1
    return registers

In [127]:
def f(x):
    try:
        return np.float(x)
    except:
        return np.nan
    
PositionKeeping_df.fillna(0, inplace=True)

PositionKeeping_df["CreditLine_Amount"] = PositionKeeping_df["CreditLine_Amount"].apply(f)
PositionKeeping_df["Amount_Amount"] = PositionKeeping_df["Amount_Amount"].apply(f)


session = cluster.connect('positionkeepingkeyspace')
session.execute('TRUNCATE TABLE positionkeeping;')
registers = loadCollectionDataFramePositionKeepingBatch(PositionKeeping_df, 'positionkeepingkeyspace')

In [128]:
registers

[(0, 0.251, 7.8, 94.3),
 (1000, 0.244, 33.8, 94.3),
 (2000, 0.221, 36.9, 94.3),
 (3000, 0.223, 36.7, 94.3),
 (4000, 0.239, 39.7, 94.3),
 (5000, 0.25, 30.1, 94.3),
 (6000, 0.226, 35.2, 94.3),
 (7000, 0.226, 35.2, 94.3),
 (8000, 0.225, 36.1, 94.3),
 (9000, 0.224, 35.8, 94.3),
 (10000, 0.248, 34.8, 94.3),
 (11000, 0.23, 35.7, 94.3),
 (12000, 0.225, 36.0, 94.3),
 (13000, 0.243, 42.5, 94.3),
 (14000, 0.231, 36.5, 94.3),
 (15000, 0.228, 36.5, 94.3),
 (16000, 0.228, 38.6, 94.3),
 (17000, 0.226, 35.5, 94.3),
 (18000, 0.241, 43.4, 94.3),
 (19000, 0.324, 33.9, 94.3),
 (20000, 0.226, 37.2, 94.3),
 (21000, 0.229, 37.4, 94.3),
 (22000, 0.229, 40.0, 94.3),
 (23000, 0.233, 39.2, 94.4),
 (24000, 0.235, 37.7, 94.4),
 (25000, 0.229, 39.5, 94.4),
 (26000, 0.224, 36.9, 94.4),
 (27000, 0.232, 35.9, 94.4),
 (28000, 0.228, 39.1, 94.4),
 (29000, 0.231, 37.6, 94.4),
 (30000, 0.23, 37.0, 94.4),
 (31000, 0.254, 40.7, 94.4),
 (32000, 0.232, 36.2, 94.4),
 (33000, 0.229, 38.5, 94.4),
 (34000, 0.231, 38.8, 94.4),
 (

In [129]:
#Guardamos el caso one-to-one cassandra
save_results_to_csv(registers,resultados_etl_PositionKeepingKeySpace,'ALL')

../Results/Cassandra/Cassandra_Insert_ConsistencyLevel_ALL_PositionKeeping_18042022_22_24_49.csv
