### Import libraries and file

In [1]:
import pandas as pd
import csv
import numpy as np
from scipy.spatial.distance import squareform, pdist

In [2]:
filename = 'anpn.csv'

### Function do find the max size of columns

In [3]:
#preprocessing
def get_max_len():
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        num = []
        for i, row in enumerate(reader):
            num.append(len(row))
        m = max(num)
        #print m
        return m
    
print(get_max_len())

11


### Read and show dataset

In [4]:
df = pd.read_csv(filename, header=None, na_values=0, names=range(get_max_len()), low_memory=False)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,a1,p1,p2,,,,,,,,
1,a2,p5,p7,p9,,,,,,,
2,a3,p2,p3,p4,,,,,,,
3,a4,p7,p5,p8,p3,,,,,,
4,a5,p4,p1,p6,p9,p7,p3,,,,


### Filter Dataset

In [6]:
permissions = set()
permissions_list = set()
apps_list = set()


for index, row in df.iterrows():
    app = row[0]
    
    for perm in row[1:]:
        if str(perm) == 'nan':
            break
        else:
            #perm_list = perm.split('.')
            perm_list = perm
            perm_t = perm_list.lower()
            permissions.add(perm_t)
            permissions_list.add((app, perm_t))
            apps_list.add(app)

### Create new dataframe from filter permissions and apps

In [7]:
df_apps = pd.DataFrame(list(permissions_list))
df_apps.head()
df_apps

Unnamed: 0,0,1
0,a10,p8
1,a3,p2
2,a3,p3
3,a12,p7
4,a6,p7
5,a9,p7
6,a12,p4
7,a9,p3
8,a10,p4
9,a12,p3


### Create a matrix from dataframe

In [8]:
matrix = pd.crosstab(index=df_apps[0], columns = df_apps[1])
matrix.to_csv('permissions_matrix_fd1.csv', index=False)

### Calculate similarity matrix (>= 16gb to run)

In [9]:
similarity_matrix = np.float32(squareform(pdist(matrix, 'cosine')))

In [10]:
similarity_matrix_pearson = np.float32(squareform(pdist(matrix, 'correlation')))

### Write similarity matrix in file (.npy)

In [11]:
np.save('sim_matrix_fd1.npy', similarity_matrix)

In [12]:
np.savetxt("foo_fd1.csv", similarity_matrix, delimiter=",", fmt='%1.8f')

In [13]:
matrix

1,p1,p10,p2,p3,p4,p5,p6,p7,p8,p9
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a1,1,0,1,0,0,0,0,0,0,0
a10,1,1,1,1,1,1,1,1,1,1
a11,0,1,0,0,0,1,0,0,1,0
a12,1,0,0,1,1,0,0,1,0,1
a2,0,0,0,0,0,1,0,1,0,1
a3,0,0,1,1,1,0,0,0,0,0
a4,0,0,0,1,0,1,0,1,1,0
a5,1,0,0,1,1,0,1,1,0,1
a6,0,0,1,1,0,1,0,1,0,0
a7,1,0,0,0,0,1,0,0,0,1


In [14]:
similarity_matrix

array([[0.        , 0.5527864 , 1.        , 0.6837722 , 1.        ,
        0.5917517 , 1.        , 0.7113249 , 0.6464466 , 0.5917517 ,
        1.        , 0.73273873],
       [0.5527864 , 0.        , 0.45227745, 0.29289323, 0.45227745,
        0.45227745, 0.36754447, 0.22540332, 0.36754447, 0.45227745,
        0.5527864 , 0.16333997],
       [1.        , 0.45227745, 0.        , 1.        , 0.6666667 ,
        1.        , 0.42264974, 1.        , 0.7113249 , 0.6666667 ,
        1.        , 0.56356424],
       [0.6837722 , 0.29289323, 1.        , 0.        , 0.48360223,
        0.48360223, 0.5527864 , 0.08712907, 0.5527864 , 0.48360223,
        0.6837722 , 0.3238766 ],
       [1.        , 0.45227745, 0.6666667 , 0.48360223, 0.        ,
        1.        , 0.42264974, 0.5285955 , 0.42264974, 0.33333334,
        1.        , 0.34534633],
       [0.5917517 , 0.45227745, 1.        , 0.48360223, 1.        ,
        0.        , 0.7113249 , 0.5285955 , 0.42264974, 1.        ,
        0.5917517 ,

In [15]:
similarity_matrix_pearson

array([[0.        ,        nan, 1.3273269 , 1.        , 1.3273269 ,
        0.7817821 , 1.4082483 , 1.1020621 , 0.89793795, 0.7817821 ,
        1.25      , 1.2182178 ],
       [       nan, 0.        ,        nan,        nan,        nan,
               nan,        nan,        nan,        nan,        nan,
               nan,        nan],
       [1.3273269 ,        nan, 0.        , 1.6546537 , 0.95238096,
        1.4285715 , 0.64365166, 1.8017837 , 1.0890871 , 0.95238096,
        1.3273269 , 1.0476191 ],
       [1.        ,        nan, 1.6546537 , 0.        , 0.7817821 ,
        0.7817821 , 1.        , 0.18350342, 1.        , 0.7817821 ,
        1.        , 0.7817821 ],
       [1.3273269 ,        nan, 0.95238096, 0.7817821 , 0.        ,
        1.4285715 , 0.64365166, 0.91091293, 0.64365166, 0.47619048,
        1.3273269 , 0.5714286 ],
       [0.7817821 ,        nan, 1.4285715 , 0.7817821 , 1.4285715 ,
        0.        , 1.0890871 , 0.91091293, 0.64365166, 1.4285715 ,
        0.7817821 ,