In [1]:


import numpy as np
import pandas as pd 
from pathlib import Path
from pulp import *
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time 
import scipy.stats as stats

from io import BytesIO  # apparently needed for reading the files from the bucket
from google.cloud import storage

In [2]:
# Names of the Cloud Storage bucket and of the example CSV object inside it.
# The bucket-name spelling is kept exactly as written; presumably it matches the real bucket (verify).
bucket_name = 'memory_represetnations_data'
# An object inside a sub-folder also works, e.g.
# 'Object and scene categories in brain and behavior/set1.csv'
file_name = 'images_info_multicat.csv'

# Create the storage client and resolve the bucket handle used by every helper below.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

def read_csv_from_bucket(bucket,filename):
    """Download a CSV object from a bucket and parse it into a DataFrame.

    Parameters
    ----------
    bucket
        Bucket handle exposing ``blob(name)`` (e.g. the object returned by
        ``storage_client.get_bucket``).
    filename : str
        Name (path) of the CSV object inside ``bucket``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object, read with ``pd.read_csv`` defaults.
    """
    blob = bucket.blob(filename)

    # download_as_bytes() is the supported replacement for the deprecated
    # download_as_string(); both return the raw object payload as bytes.
    raw_bytes = blob.download_as_bytes()

    # pd.read_csv needs a file-like object, so wrap the payload in BytesIO.
    return pd.read_csv(BytesIO(raw_bytes))

def save_df_csv_into_bucket(bucket,df,filename):
    """Upload ``df`` to ``bucket`` as a CSV object named ``filename``.

    The frame is serialised with ``df.to_csv(index=False)`` (no index column)
    and uploaded with content type ``text/csv``. Returns ``None``.
    """
    target_blob = bucket.blob(filename)
    csv_text = df.to_csv(index=False)
    target_blob.upload_from_string(csv_text, content_type='text/csv')
    
    

In [None]:
# Example usage of the two bucket helpers.
# Read a CSV object from the bucket into a DataFrame:
test_df=read_csv_from_bucket(bucket=bucket,filename=file_name)
# Save it back to the bucket under a new object name.
# (The frame just loaded is `test_df`; the previous call passed `df`, which is
# not defined at this point on a fresh kernel.)
test_filename='dekels_bucket_save_test.csv'
save_df_csv_into_bucket(bucket,test_df,test_filename)


In [3]:
# Example: walk over every object stored in the bucket and print its name.
for stored_blob in bucket.list_blobs():
    print(stored_blob.name)

.ipynb_checkpoints/
Object and scene categories in brain and behavior/set1.csv
Object and scene categories in brain and behavior/set1/0001.jpg
Object and scene categories in brain and behavior/set1/0002.jpg
Object and scene categories in brain and behavior/set1/0003.jpg
Object and scene categories in brain and behavior/set1/0004.jpg
Object and scene categories in brain and behavior/set1/0005.jpg
Object and scene categories in brain and behavior/set1/0006.jpg
Object and scene categories in brain and behavior/set1/0007.jpg
Object and scene categories in brain and behavior/set1/0008.jpg
Object and scene categories in brain and behavior/set1/0009.jpg
Object and scene categories in brain and behavior/set1/0010.jpg
Object and scene categories in brain and behavior/set1/0011.jpg
Object and scene categories in brain and behavior/set1/0012.jpg
Object and scene categories in brain and behavior/set1/0013.jpg
Object and scene categories in brain and behavior/set1/0014.jpg
Object and scene categori

In [4]:
# Bucket paths of the pairwise-similarity CSVs, one per scene category.
similairy_dfs_paths = [
    f'similarity_df_regular/similarity_between_pairs_VGG{scene}.csv'
    for scene in ('badlands', 'bridge', 'golf_course', 'highway', 'mountain', 'playground')
]

In [5]:
# Images flagged as duplicates; every similarity pair that involves one of them
# is dropped before the selection step. Entries keep their original order.
duplicate_images = [
    'playground_3.pkl',
    'bridge_219.pkl',
    'highway_106.pkl', 'highway_109.pkl', 'highway_119.pkl',
    'golf_course_24.pkl',
    'highway_121.pkl', 'highway_130.pkl', 'highway_140.pkl', 'highway_157.pkl',
    'highway_170.pkl', 'highway_180.pkl', 'highway_191.pkl', 'highway_248.pkl',
    'highway_377.pkl', 'highway_411.pkl', 'highway_70.pkl', 'highway_274.pkl',
    'highway_198.pkl',
    'mountain_119.pkl', 'mountain_172.pkl', 'mountain_121.pkl', 'mountain_45.pkl',
    'mountain_74.pkl', 'mountain_91.pkl', 'mountain_194.pkl', 'mountain_87.pkl',
    'mountain_305.pkl',
]

In [16]:
# NOTE: this rebinds similairy_dfs_paths, so when this cell is run the
# selection loop processes the golf_course category only.
similairy_dfs_paths = [
    'similarity_df_regular/similarity_between_pairs_VGGgolf_course.csv',
]

In [None]:
def _build_distance_matrix(pairs_df, sim_column, n_items):
    """Return a symmetric ``n_items x n_items`` matrix of ``1 - similarity``.

    ``pairs_df`` must carry the integer columns ``numeric_image1`` /
    ``numeric_image2`` and the similarity column ``sim_column``. Cells with no
    pair in ``pairs_df`` stay 0, and the diagonal is forced to 0.
    """
    d_matrix = np.zeros([n_items, n_items])
    first_idx = pairs_df['numeric_image1'].to_numpy()
    second_idx = pairs_df['numeric_image2'].to_numpy()
    similarity = pairs_df[sim_column].to_numpy()
    # Iterate plain arrays instead of repeated .iloc lookups; rows are written
    # in the same order as before, so a later duplicate pair still wins.
    for row, col, val in zip(first_idx, second_idx, similarity):
        d_matrix[row, col] = 1 - val
        d_matrix[col, row] = 1 - val
    np.fill_diagonal(d_matrix, 0)
    return d_matrix


def _is_on(lp_var):
    """True when a solved binary variable is set (tolerant of 0.999999-style values)."""
    solved_value = lp_var.value()
    return solved_value is not None and solved_value > 0.5


# Run configuration.
epsilon_size = [0.05]   # tolerances tried for the mean-similarity balance constraint
images_to_select = 10   # number of target images selected per category

results = pd.DataFrame([])
result_rows = []                 # one summary record per (category, epsilon) run
all_selected_images_dict = {}    # '<category>_<epsilon>' -> DataFrame of the selected pairs

for cat in similairy_dfs_paths:

    # e.g. '.../similarity_between_pairs_VGGbridge.csv' -> 'bridge'
    category_name = cat.split('VGG')[-1].split('.')[0]
    print(category_name)

    cur_similarity_df = read_csv_from_bucket(bucket=bucket,filename=cat)

    # Remove every pair that involves a duplicate image of this category.
    bad_images_cat = [img for img in duplicate_images if category_name in img]
    is_bad_pair = (cur_similarity_df['image1'].isin(bad_images_cat)
                   | cur_similarity_df['image2'].isin(bad_images_cat))
    cur_similarity_df = cur_similarity_df.loc[~is_bad_pair].copy()

    # Literal (non-regex) replacement: with regex matching the '.' would match any character.
    for image_col in ('image1', 'image2'):
        cur_similarity_df[image_col] = cur_similarity_df[image_col].str.replace('.pkl', '.jpg', regex=False)

    level_columns = [colname for colname in cur_similarity_df.columns if 'level' in colname]
    cur_similarity_df['mean_network_sim'] = cur_similarity_df[level_columns].mean(axis=1)

    # Sort the image names so the name <-> index mapping (and therefore the
    # optimisation model) is identical from run to run; plain set iteration
    # order for strings is not stable across kernel sessions.
    all_unique_images = sorted(set(cur_similarity_df['image1'].values) | set(cur_similarity_df['image2'].values))
    number_of_images = len(all_unique_images)

    names_to_numbers_dict = {name: i for i, name in enumerate(all_unique_images)}
    numbers_to_names_dict = {i: name for name, i in names_to_numbers_dict.items()}
    cur_similarity_df['numeric_image1'] = cur_similarity_df['image1'].map(names_to_numbers_dict).astype(int)
    cur_similarity_df['numeric_image2'] = cur_similarity_df['image2'].map(names_to_numbers_dict).astype(int)

    # Distance (1 - similarity) matrices for the levels used by the model.
    d_matrix_level_dict = {
        sim_level: _build_distance_matrix(cur_similarity_df, sim_level, number_of_images)
        for sim_level in ['level_0','level_6','mean_network_sim']
    }

    SIM_M = d_matrix_level_dict['level_6'] - d_matrix_level_dict['level_0']
    SIM_Mean = d_matrix_level_dict['mean_network_sim']

    for epsilon in epsilon_size:
        print(f'Epsilon size {epsilon}')
        start_time = time.time()

        # Set the size of the matrix
        n_images = SIM_M.shape[0]

        # Define the problem (no spaces in the name: PuLP warns and converts them to '_' anyway)
        prob = LpProblem("Image_Selection", LpMaximize)
        # Decision variables:
        #   x[i]    = 1 when image i is selected as a target
        #   y[j][i] = 1 when image j is the "y" distractor of target i
        #   z[j][i] = 1 when image j is the "z" distractor of target i
        x = LpVariable.dicts("x", range(n_images), 0, 1, LpBinary)
        y = LpVariable.dicts("y", (range(n_images), range(n_images)), 0, 1, LpBinary)
        z = LpVariable.dicts("z", (range(n_images), range(n_images)), 0, 1, LpBinary)

        # Objective: maximise the summed SIM_M over z pairs minus the summed SIM_M over y pairs.
        prob += lpSum([SIM_M[i][j]*z[i][j]-SIM_M[i][j]*y[i][j] for i in range(n_images) for j in range(n_images)])
        # Make sure exactly "images_to_select" targets are selected:
        prob += lpSum([x[i] for i in range(n_images)]) == images_to_select
        # Keep the y and z distractor sets balanced on mean-network distance: the
        # difference of the two summed distances must lie within +/- epsilon.
        prob += lpSum([SIM_Mean[i][j]*y[i][j] for i in range(n_images) for j in range(n_images)]) - lpSum([SIM_Mean[i][j]*z[i][j] for i in range(n_images) for j in range(n_images)])<=epsilon
        prob += lpSum([SIM_Mean[i][j]*y[i][j] for i in range(n_images) for j in range(n_images)]) - lpSum([SIM_Mean[i][j]*z[i][j] for i in range(n_images) for j in range(n_images)])>=-epsilon

        for i in range(n_images):
            # A selected target gets exactly one y and one z distractor; an unselected image gets none.
            prob += lpSum([y[j][i] for j in range(n_images)]) == x[i]
            prob += lpSum([z[j][i] for j in range(n_images)]) == x[i]

            # Each image plays at most one role: target, y distractor or z distractor.
            prob += (x[i] + lpSum([y[i][j] for j in range(n_images)]) + lpSum([z[i][j] for j in range(n_images)]) )<= 1

            for j in range(n_images):
                if i == j:
                    # An image can never be its own distractor.
                    prob += z[i][j] == 0
                    prob += y[i][j] == 0
                # Implied by the equalities above for binary variables; kept so
                # the model handed to the solver is unchanged.
                prob += y[j][i] <= x[i]
                prob += z[j][i] <= x[i]

        # Solve the problem
        prob.solve()
        # Print the status of the problem
        status_name = LpStatus[prob.status]
        print("Status:", status_name)
        if status_name != "Optimal":
            # Without an optimal solution the variable values are not meaningful,
            # so skip this run rather than record garbage or crash further down.
            print(f'No optimal selection found for {category_name} with epsilon {epsilon}; skipping.')
            continue

        # Collect, for every selected target, its z and y distractor indices.
        selected_target_and_distractors_dict ={}
        for i in range(n_images):
            if _is_on(x[i]):
                # high level distractor
                y1 = [j for j in range(n_images) if _is_on(y[j][i])]
                # low level distractor
                z2 = [j for j in range(n_images) if _is_on(z[j][i])]
                selected_target_and_distractors_dict[i] = (z2,y1)

        y_vals=np.array([y[i][j].value() for i in range(n_images) for j in range(n_images)]).reshape([n_images,n_images])
        z_vals=np.array([z[i][j].value() for i in range(n_images) for j in range(n_images)]).reshape([n_images,n_images])
        end_time = time.time()

        # Translate indices back to image names: target -> (z distractor, y distractor).
        # The names already end in '.jpg' because the columns were converted before
        # the index mapping was built.
        selected_target_and_distractors_names_dict = {
            numbers_to_names_dict[target_idx]: (numbers_to_names_dict[z_idx[0]], numbers_to_names_dict[y_idx[0]])
            for target_idx, (z_idx, y_idx) in selected_target_and_distractors_dict.items()
        }

        # Pull the similarity rows of each (target, distractor) pair and tag them:
        # distractor_level 0 = z distractor, 1 = y distractor.
        selected_frames = []
        for key,values in selected_target_and_distractors_names_dict.items():
            target_sim_df = cur_similarity_df[(cur_similarity_df['image1']== key) | (cur_similarity_df['image2'] == key)]
            for distractor_level, distractor_name in enumerate(values):
                pair_rows = target_sim_df[(target_sim_df['image1']== distractor_name) | (target_sim_df['image2'] == distractor_name)]
                # .assign returns a new frame, avoiding writes into a filtered slice.
                selected_frames.append(pair_rows.assign(distractor_level=distractor_level))
        sim_df_selected = pd.concat(selected_frames)

        elapsed_time = (end_time - start_time)/60   # minutes
        mean_sim_per_layer = sim_df_selected.groupby('distractor_level')[['level_0','level_1','level_2','level_3','level_4','level_5','level_6']].mean()
        mean_similarity = mean_sim_per_layer.mean(axis=1)

        d1 = sim_df_selected.loc[sim_df_selected['distractor_level'] == 0, 'mean_network_sim']
        d2 = sim_df_selected.loc[sim_df_selected['distractor_level'] == 1, 'mean_network_sim']

        # Paired test: d1 and d2 are ordered by target, one value per target each.
        t_stat, p_value = stats.ttest_rel(d1, d2)
        result_rows.append({'category name':category_name, 'epsilon':epsilon,'d1 mean': mean_similarity.loc[0],'d2 mean': mean_similarity.loc[1],'elapse time':elapsed_time,'p val':p_value,'t stat':t_stat})
        # Rebuild the summary after every run so partial results survive an interrupt.
        results = pd.DataFrame(result_rows)
        key_name = category_name +'_'+ str(epsilon)
        all_selected_images_dict[key_name] = sim_df_selected


golf_course




Epsilon size 0.05


In [None]:
# Inspect the pairs of the most recently processed category whose mean network similarity exceeds 0.89.
cur_similarity_df.loc[cur_similarity_df['mean_network_sim'].gt(0.89)]

In [None]:
# Display the summary table (one row per category / epsilon run) built by the selection loop.
results

In [None]:
import seaborn as sns

# One panel per selection run: each target's mean network similarity to its two
# distractors, drawn as a line per target (hue='ID').
N_COLS = 3
run_keys = list(all_selected_images_dict.keys())
# Keep the original 2x3 layout for up to six runs and add rows beyond that,
# so a seventh run no longer indexes past the end of `axes`.
n_rows = max(2, -(-len(run_keys) // N_COLS))
fig,axes = plt.subplots(n_rows,N_COLS,figsize = (12,5*n_rows))
axes = axes.ravel()

for i,cat in enumerate(run_keys):
    df = all_selected_images_dict[cat]
    # Rows are stored as consecutive (distractor 0, distractor 1) pairs per target,
    # assuming one similarity row per pair -- verify. The IDs 1,1,2,2,... are derived
    # from the frame length instead of a hard-coded 20-element list.
    # NOTE: this adds the 'ID' column to the frame stored in all_selected_images_dict.
    df['ID'] = np.arange(len(df)) // 2 + 1
    sns.pointplot(data=df,x='distractor_level', y='mean_network_sim', hue='ID',ax=axes[i])
    axes[i].set_title(cat)
    # Drop the per-target legend explicitly rather than overwriting it with an empty one.
    panel_legend = axes[i].get_legend()
    if panel_legend is not None:
        panel_legend.remove()
   

In [None]:
# Mean (+/- one standard deviation) similarity per level for each selection run,
# with one curve per distractor level.
# 'red' appeared twice in the original palette, which made two runs indistinguishable.
colors = ['red','blue','green', 'brown','orange','purple']
level_names = ['level_0','level_1','level_2','level_3','level_4','level_5','level_6']
level_positions = np.arange(len(level_names))
distractor_linestyles = {0: '-', 1: '--'}

# NOTE(review): `epsilon_2_cat` is not defined anywhere in this notebook. It is
# used when it exists in the kernel; otherwise every stored run is plotted.
# Confirm which subset of runs was intended.
runs_to_plot = globals().get('epsilon_2_cat', list(all_selected_images_dict))

fig,ax = plt.subplots()
for i,cat in enumerate(runs_to_plot):
    df = all_selected_images_dict[cat]
    per_level = df.groupby('distractor_level')[level_names]
    level_means = per_level.mean()
    level_stds = per_level.std()
    run_color = colors[i % len(colors)]   # cycle the palette instead of overflowing it

    # The grouped frames have one row per distractor level, so draw each row as
    # its own curve; passing the whole 2-D frame to ax.plot raises a shape error.
    for distractor_level in level_means.index:
        mean_curve = level_means.loc[distractor_level].to_numpy()
        std_curve = level_stds.loc[distractor_level].to_numpy()
        ax.plot(level_positions, mean_curve, label=f'{cat} (distractor {distractor_level})',
                color=run_color, linestyle=distractor_linestyles.get(distractor_level, ':'))
        ax.fill_between(level_positions, mean_curve - std_curve, mean_curve + std_curve,
                        color=run_color, alpha=0.1)

ax.set_xticks(level_positions)
ax.set_xticklabels(level_names)
ax.legend()

In [None]:
# Persist the run summary table to the bucket.
save_df_csv_into_bucket(bucket=bucket, df=results, filename='results_epsilon_0.05.csv')

In [None]:
# all_selected_images_dict maps run keys to DataFrames; the save helper expects a
# single DataFrame (it calls df.to_csv), so passing the dict raised an AttributeError.
# Stack the per-run frames and record which run each row came from.
all_selected_images_df = pd.concat(
    [run_df.assign(run_key=run_key) for run_key, run_df in all_selected_images_dict.items()]
)
# NOTE(review): the object name says epsilon 0.1 while the loop above is configured
# with epsilon 0.05 -- confirm the intended file name.
save_df_csv_into_bucket(bucket,all_selected_images_df,'dict_epsilon_0.1.csv')

In [9]:
# Display every stored selection: a dict mapping '<category>_<epsilon>' keys to the
# DataFrame of selected target/distractor pairs for that run.
all_selected_images_dict

{'bridge_0.05':                image1          image2   level_0   level_1   level_2   level_3  \
 8441   bridge_128.jpg   bridge_61.jpg  0.413892  0.302557  0.214883  0.166027   
 8378   bridge_128.jpg  bridge_244.jpg  0.199434  0.201220  0.145456  0.135630   
 20812  bridge_184.jpg  bridge_191.jpg  0.397140  0.350096  0.275850  0.211039   
 22281  bridge_191.jpg   bridge_75.jpg  0.276783  0.247402  0.172541  0.128064   
 24261  bridge_202.jpg   bridge_31.jpg  0.443028  0.366328  0.244194  0.163300   
 24268  bridge_202.jpg   bridge_38.jpg  0.240852  0.220152  0.154202  0.109590   
 20422  bridge_181.jpg   bridge_65.jpg  0.527863  0.387840  0.304280  0.266354   
 34612   bridge_60.jpg   bridge_65.jpg  0.194153  0.205021  0.178971  0.117825   
 31131  bridge_253.jpg   bridge_91.jpg  0.476786  0.389937  0.269220  0.164992   
 35349   bridge_82.jpg   bridge_91.jpg  0.273947  0.238672  0.196548  0.125026   
 2879   bridge_108.jpg  bridge_116.jpg  0.447301  0.349201  0.247630  0.197384   
 

In [15]:
# Save the highway selection of the epsilon 0.1 run.
# NOTE: the key 'highway_0.1' only exists if the selection loop was run for the
# highway category with epsilon 0.1.
save_df_csv_into_bucket(
    bucket=bucket,
    df=all_selected_images_dict['highway_0.1'],
    filename='highway_epsilon_0.1.csv',
)

In [None]:
# Display the selected pairs stored under the key 'mountain_0.05'
# (present only if the selection loop produced a run with that key).
all_selected_images_dict['mountain_0.05']