In [1]:
import pandas as pd
import numpy as np
import hashlib

# Lift pandas' display limits so frames are rendered without truncating
# columns or rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)  # or 1000

# Read the semicolon-separated element table and discard the columns this
# notebook does not work with.
unused_columns = ['elementHeight', 'elementWidth', 'element_id', 'rap_size']
df = pd.read_csv('dataBlock.csv', sep=';').drop(columns=unused_columns)
df.head(15)

Unnamed: 0,theme,block,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type
0,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,73.3125,15.0,74.1875,15.75,vector
1,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,70.1875,14.9375,70.875,15.8125,vector
2,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,66.8125,14.9375,67.6875,15.8125,vector
3,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,63.8125,14.9375,64.1875,15.8125,vector
4,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,5.625,15.0,24.0625,16.0,text
5,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,5.625,13.1875,74.375,13.25,shape
6,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,35.3125,5.625,58.6875,9.375,text
7,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,35.3125,3.75,47.1875,4.75,text
8,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,5.625,5.625,24.625,9.375,text
9,zephyr-blocks,text_on_the_top_and_social_media_links_on_the_...,316.0,10,5.625,3.75,17.5,4.75,text


In [2]:
""" for convenience concatenate 'theme' columns with 'block' """
# Compute the combined 'theme/block' identifier first, then remove its two
# source columns in place and append the identifier as the last column.
block_path = df['theme'] + '/' + df['block']
df.drop(columns=['theme', 'block'], inplace=True)
df['ways'] = block_path
df.head(3)

Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways
0,316.0,10,73.3125,15.0,74.1875,15.75,vector,zephyr-blocks/text_on_the_top_and_social_media...
1,316.0,10,70.1875,14.9375,70.875,15.8125,vector,zephyr-blocks/text_on_the_top_and_social_media...
2,316.0,10,66.8125,14.9375,67.6875,15.8125,vector,zephyr-blocks/text_on_the_top_and_social_media...


In [3]:
# Divisors used for bucketing: w for the x_* columns, h for the y_* columns
# and for globalBlockHeight.
w, h = 27, 27

# Column -> divisor. A column mapped to None is left undivided.
col_div = dict(
    globalBlockHeight=h,
    x_orig=w,
    y_orig=h,
    x_end=w,
    y_end=h,
)

# Work on a copy so the unscaled frame `df` stays available.
df_new = df.copy()

# Split the block into smaller sub-blocks of size w*h: divide each listed
# column by its divisor and round the result down.
for column, divisor in col_div.items():
    if divisor is None:
        continue
    df_new[column] = np.floor(df_new[column] / divisor)

df_new.head(3)

Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways
0,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...
1,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...
2,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...


In [4]:
# Give every block a fingerprint: the SHA-256 of the bucketed coordinates and
# type of all its elements, so blocks can be compared through a single value.

ways_uniq = df['ways'].unique()
params_uniq = ['x_orig','y_orig', 'x_end', 'y_end', 'type']
df_new['block_hash'] = ''
count = 0

for way in ways_uniq:
    # Rows belonging to the current block; reused for the read and the write.
    in_block = df_new['ways'] == way
    block = df_new.loc[in_block, params_uniq]
    count += len(block)

    # Flatten the values row by row and concatenate their string forms
    # (no separator) to obtain the text that is hashed.
    fingerprint = ''.join(map(str, block.values.ravel()))
    block_hash = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()
    df_new.loc[in_block, 'block_hash'] = block_hash


print(f'the number of blocks: {len(ways_uniq)}\nthe number of items: {count}')
df_new.head(3)

the number of blocks: 589
the number of items: 6801


Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways,block_hash
0,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...
1,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...
2,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...


In [5]:
# Give every element a fingerprint: the SHA-256 of its bucketed coordinates
# and type. Elements with equal hashes are treated as the same element when
# blocks are compared.
element_params = ['x_orig', 'y_orig', 'x_end', 'y_end', 'type']

# A single column assignment replaces iterrows() with one scalar .loc write
# per row. The hashed text is unchanged (string forms of the row values,
# concatenated without a separator), but:
#   * no Series is built per row, so the cell is much faster;
#   * hashes are matched to rows by position, so repeated index labels cannot
#     make one row's hash overwrite another's;
#   * the column is created even when the frame has no rows.
df_new['element_hash'] = [
    hashlib.sha256(''.join(str(el) for el in row_values).encode('utf-8')).hexdigest()
    for row_values in df_new[element_params].values
]

df_new.head(3)

Unnamed: 0,globalBlockHeight,total_elements,x_orig,y_orig,x_end,y_end,type,ways,block_hash,element_hash
0,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...
1,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...
2,11.0,10,2.0,0.0,2.0,0.0,vector,zephyr-blocks/text_on_the_top_and_social_media...,d472689379fc1fb23ff4ce636f00829f6f7de464eb92b5...,032735e38b3e7b4b1ae6a52f915c7238899dff0cd93b06...


In [6]:
# Pairwise block comparison.
#
# For every unordered pair of blocks, count the distinct element hashes the
# two blocks share. Pairs with at least one hash in common are appended to the
# in-memory result lists and written as one ';'-separated row to `name_file`:
#   block_1; block_2; element count of block 1; element count of block 2;
#   number of distinct shared element hashes.

# Fixed: the message announces the number of blocks, but the original
# interpolated the whole array of block names.
print(f'time of comparison has begun. total number of blocks {len(ways_uniq)}')

name_file = f'data_compare_{h}.csv'
cols = ['block_1','block_2', 'nummberOfElements_1','nummberOfElements_2','common_elements']

# Defined by the original cell but never filled or read in it; kept only so
# the notebook namespace stays the same.
bls_ways = df_new['ways']
percent_1 = []
percent_2 = []

# In-memory copy of the rows written to the file.
ways1 = []
ways2 = []

total_elements1 = []
total_elements2 = []

common_elements = []

# Collect each block's element hashes once instead of re-filtering the whole
# frame for every pair. sort=False keeps blocks in order of first appearance,
# which is the order df_new['ways'].unique() would give.
block_sizes = {}    # block way -> number of elements (duplicates included)
block_hashes = {}   # block way -> set of distinct element hashes
for way, hashes in df_new.groupby('ways', sort=False)['element_hash']:
    block_sizes[way] = len(hashes)
    block_hashes[way] = frozenset(hashes)
block_order = list(block_sizes)

# Mode 'w' (not 'a'): re-running the cell must not append a second header and
# duplicate rows, which would break the integer conversion when the file is
# read back. UTF-8 matches the default encoding pd.read_csv uses to read it.
with open(name_file, 'w', encoding='utf-8') as f:
    f.write(';'.join(cols) + '\n')

    for count, block_way in enumerate(block_order, start=1):
        if count % 100 == 0:
            print(f'{count} blocks were processed\n')

        # Info about the block that will be compared with the other blocks.
        block_size = block_sizes[block_way]
        element_hash = block_hashes[block_way]

        # Compare only with blocks later in the order, so each unordered pair
        # is visited exactly once.
        for o_block_way in block_order[count:]:
            com_els = len(element_hash & block_hashes[o_block_way])
            if com_els == 0:
                continue

            o_block_size = block_sizes[o_block_way]

            ways1.append(block_way)
            ways2.append(o_block_way)
            total_elements1.append(block_size)
            total_elements2.append(o_block_size)
            common_elements.append(com_els)

            row = ';'.join(
                str(el) for el in (block_way, o_block_way, block_size, o_block_size, com_els)
            )

            try:
                f.write(row + '\n')
            except (OSError, UnicodeError):
                # Best effort, as before: note the pair that could not be
                # written and keep going. Unlike a bare `except:`, this no
                # longer swallows KeyboardInterrupt or programming errors.
                with open(f'logs_{h}.csv', 'a', encoding='utf-8') as log:
                    log.write(f'{block_way}, {o_block_way}\n')

print('time of comparison is over')

time of comparison has begun
100 blocks were processed

200 blocks were processed

300 blocks were processed

400 blocks were processed

500 blocks were processed

time of comparison is over


In [7]:
# Read back the pairwise comparison table and store its three count columns
# as 32-bit integers.
count_columns = ['common_elements', 'nummberOfElements_1', 'nummberOfElements_2']
df = pd.read_csv(name_file, sep=';')
df = df.astype({column: 'int32' for column in count_columns})

# percent_N = 100 * common_elements / nummberOfElements_N, rounded to one decimal.
shared_x100 = 100 * df['common_elements']
df['percent_1'] = (shared_x100 / df['nummberOfElements_1']).round(1)
df['percent_2'] = (shared_x100 / df['nummberOfElements_2']).round(1)

df.head(3)


Unnamed: 0,block_1,block_2,nummberOfElements_1,nummberOfElements_2,common_elements,percent_1,percent_2
0,zephyr-blocks/text_on_the_top_and_social_media...,zephyr-blocks/text_with_link_left_image_with_t...,10,11,1,10.0,9.1
1,zephyr-blocks/text_on_the_top_and_social_media...,zephyr-blocks/3_rows_with_image_and_text_and_c...,10,25,1,10.0,4.0
2,zephyr-blocks/text_on_the_top_and_social_media...,zephyr-blocks/text_with_call_to_action_left_an...,10,13,1,10.0,7.7


In [8]:
# Order the pairs from highest to lowest by percent_1, then percent_2, then
# common_elements, and export the table to Excel without the index column.
sort_keys = ['percent_1', 'percent_2', 'common_elements']
df = df.sort_values(by=sort_keys, ascending=False)

excel_path = f'data_compare_{h}.xlsx'
df.to_excel(excel_path, engine='xlsxwriter', index=False)