In [1]:
import pandas as pd
import numpy as np
import hashlib


# Load the raw element table and discard the columns this analysis does not use.
unused_columns = ['elementHeight', 'elementWidth', 'element_id', 'rap_size']
df = pd.read_csv('dataBlock.csv', sep=';').drop(columns=unused_columns)
df.head(3)

Unnamed: 0,theme,block,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type
0,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,73.3125,15.0,74.1875,15.75,vector
1,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,70.1875,14.9375,70.875,15.8125,vector
2,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,66.8125,14.9375,67.6875,15.8125,vector


In [3]:
# For convenience, merge 'theme' and 'block' into a single 'ways' column
# ("<theme>/<block>") and drop the two source columns.
df = (
    df
    .assign(ways=df['theme'] + '/' + df['block'])
    .drop(columns=['theme', 'block'])
)
df.head(3)

Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways
0,316.0,10,73.3125,15.0,74.1875,15.75,vector,zephyr-blocks/text_on_the_top_and_social_media...
1,316.0,10,70.1875,14.9375,70.875,15.8125,vector,zephyr-blocks/text_on_the_top_and_social_media...
2,316.0,10,66.8125,14.9375,67.6875,15.8125,vector,zephyr-blocks/text_on_the_top_and_social_media...


In [18]:
# Width and height of one sub-block; the columns below are divided by these.
w, h = 27, 27

# Divisor applied to each column. A column mapped to None is left undivided.
col_div = {
    'globalBlockHeight': h,
    'x_orig': w,
    'y_orig': h,
    'x_end': w,
    'y_end': h,
}

df_new = df.copy()

# Split each block into smaller sub-blocks of size w*h: every listed column is
# divided by its divisor and rounded down.
for col_key, divisor in col_div.items():
    if divisor is None:
        continue
    df_new[col_key] = np.floor(df_new[col_key] / divisor)

df_new.head(3)

Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways
0,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...
1,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...
2,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...


In [26]:
# Create a hash for each block: a footprint of all its elements that lets us
# compare whole blocks with each other.

ways_uniq = df_new['ways'].unique()
params_uniq = ['x_orig', 'y_orig', 'x_end', 'y_end', 'type']

# Maps each block path ('ways') to its SHA-256 footprint. It must be created
# here: the loop below assigns into it, and on a fresh kernel no earlier cell
# defines it (the original raised NameError).
infoBlocks = {}
count = 0  # total number of element rows seen across all blocks

for way in ways_uniq:
    # Rows of this block, restricted to the columns that make up the footprint.
    block_item = df_new.loc[df_new['ways'] == way, params_uniq]
    count += len(block_item)

    # Flatten row by row and join the string form of every value (no
    # separator), exactly as the footprint was built before.
    flat_values = block_item.values.ravel()
    fingerprint = ''.join(str(el) for el in flat_values)
    infoBlocks[way] = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()

# Every row receives the hash of the block it belongs to.
df_new['block_hash'] = df_new['ways'].map(infoBlocks)

print(f'the number of blocks: {len(ways_uniq)}\nthe number of items: {count}')
df_new[:3]

the number of blocks: 589
the number of items: 6801


Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways,block_hash,elemt_hash
0,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...
1,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...
2,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...


In [27]:
# Create a hash for each element (row); elements are compared through these.
element_cols = ['x_orig', 'y_orig', 'x_end', 'y_end', 'type']

# One SHA-256 digest per row, built from the concatenated string form of the
# row's values, in row order.
df_new['elemt_hash'] = [
    hashlib.sha256(
        ''.join(str(value) for value in row_values).encode('utf-8')
    ).hexdigest()
    for row_values in df_new[element_cols].values
]

df_new.head(3)


Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways,block_hash,elemt_hash
0,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...
1,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...
2,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...


In [6]:
# Pairwise block comparison.
# For every unordered pair of blocks, count the distinct element hashes they
# share. Pairs with at least one common element are written to
# data_aproxim_<h>.csv and collected in the result lists below.

result_path = f'data_aproxim_{h}.csv'
log_path = f'log_da_{h}.csv'   # f-string so that {h} is actually substituted
cols = ['block_1','block_2', 'nummberOfElements_1','nummberOfElements_2','common_elements','percent_1','percent_2']

bls_ways = df_new['ways'] # all ways blocks (not used below; kept in case other cells reference it)

# Result columns; the cell that builds the summary DataFrame reads these, so
# they must always stay the same length.
ways1 = []
ways2 = []

total_elements1 = []
total_elements2 = []

com_elem = []

percent_1 = []
percent_2 = []

# Pre-compute, once per block, the set of its element hashes and its declared
# number of elements. The hashes live in the 'elemt_hash' column created by the
# element-hash cell. sort=False keeps the blocks in order of first appearance.
block_info = {}
for way, rows in df_new.groupby('ways', sort=False):
    block_info[way] = (
        set(rows['elemt_hash']),
        int(rows['total_elements'].iloc[0]),
    )
block_ways = list(block_info)

count = 0
# 'w' mode: each run starts from a clean file with exactly one header line,
# instead of appending a new header and more rows to earlier output.
with open(result_path, 'w') as f:
    f.write(';'.join(cols) + '\n')

    for pos, bl_w in enumerate(block_ways):
        count += 1

        if count % 10 == 0:
            print(f'sau procesat {count} blocuri \n')

        # Data of the block that is compared against the remaining blocks.
        els_hash, bl_te = block_info[bl_w]

        # Only blocks after the current one: every pair is visited once and
        # a block is never compared with itself.
        for o_bl_w in block_ways[pos + 1:]:
            o_els_hash, o_bl_te = block_info[o_bl_w]

            com_els = len(els_hash & o_els_hash)  # distinct shared hashes
            if com_els == 0:
                continue

            try:
                aprox_1 = round(com_els / bl_te * 100, 1)
                aprox_2 = round(com_els / o_bl_te * 100, 1)

                row = [bl_w, o_bl_w, bl_te, o_bl_te, com_els, aprox_1, aprox_2]
                f.write(';'.join(str(el) for el in row) + '\n')
            except (ZeroDivisionError, OSError):
                # A block declaring zero elements, or a failed write: record
                # the pair in the log and carry on with the next pair.
                with open(log_path, 'a') as log:
                    log.write(f'{bl_w},{o_bl_w}\n')
                continue

            # Append only after the row was computed and written, so the
            # lists cannot get out of step with each other.
            ways1.append(bl_w)
            ways2.append(o_bl_w)
            total_elements1.append(bl_te)
            total_elements2.append(o_bl_te)
            com_elem.append(com_els)
            percent_1.append(aprox_1)
            percent_2.append(aprox_2)


Current Time = 15:10:02
sau procesat 10 blocuri 

sau procesat 20 blocuri 

sau procesat 30 blocuri 

sau procesat 40 blocuri 

sau procesat 50 blocuri 

sau procesat 60 blocuri 

sau procesat 70 blocuri 

sau procesat 80 blocuri 

sau procesat 90 blocuri 

sau procesat 100 blocuri 

sau procesat 110 blocuri 

sau procesat 120 blocuri 

sau procesat 130 blocuri 

sau procesat 140 blocuri 

sau procesat 150 blocuri 

sau procesat 160 blocuri 

sau procesat 170 blocuri 

sau procesat 180 blocuri 

sau procesat 190 blocuri 

sau procesat 200 blocuri 

sau procesat 210 blocuri 

sau procesat 220 blocuri 

sau procesat 230 blocuri 

sau procesat 240 blocuri 

sau procesat 250 blocuri 

sau procesat 260 blocuri 

sau procesat 270 blocuri 

sau procesat 280 blocuri 

sau procesat 290 blocuri 

sau procesat 300 blocuri 

sau procesat 310 blocuri 

sau procesat 320 blocuri 

sau procesat 330 blocuri 

sau procesat 340 blocuri 

sau procesat 350 blocuri 

sau procesat 360 blocuri 

sau procesat 

In [10]:
# Assemble the pairwise results into one table, order it from the most to the
# least similar pair, and save it as a ';'-separated CSV.
aprox_columns = ['block_1', 'block_2', 'num_elem_1', 'num_elem_2',
                 'com_elem', 'percent_1', 'percent_2']
aprox_values = [ways1, ways2, total_elements1, total_elements2,
                com_elem, percent_1, percent_2]
data_aprox = dict(zip(aprox_columns, aprox_values))

sort_keys = ['percent_1', 'percent_2', 'com_elem']
df_aprox_csv = pd.DataFrame(data_aprox).sort_values(by=sort_keys, ascending=False)
df_aprox_csv.to_csv(f'final_aprox_{h}.csv', sep=';', index=False)


In [11]:
# Write the same sorted table to an Excel workbook, without the index column.
excel_path = f'aproxim_{h}.xlsx'
df_aprox_csv.to_excel(excel_path, index=False, engine='xlsxwriter')
