### Import libraries and file

In [1]:
import pandas as pd
import csv
import numpy as np
from scipy.spatial.distance import squareform, pdist

In [2]:
filename = 'anpn2.csv'

### Function do find the max size of columns

In [3]:
#preprocessing
def get_max_len():
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        num = []
        for i, row in enumerate(reader):
            num.append(len(row))
        m = max(num)
        #print m
        return m
    
print(get_max_len())

11


### Read and show dataset

In [4]:
df = pd.read_csv(filename, header=None, na_values=0, names=range(get_max_len()), low_memory=False)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,a1,p1,p2,,,,,,,,
1,a2,p5,p7,p9,,,,,,,
2,a3,p2,p3,p4,,,,,,,
3,a4,p7,p5,p8,p3,,,,,,
4,a5,p4,p1,p6,p9,p7,p3,,,,


In [7]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,a1,p1,p2,,,,,,,,
1,a2,p5,p7,p9,,,,,,,
2,a3,p2,p3,p4,,,,,,,
3,a4,p7,p5,p8,p3,,,,,,
4,a5,p4,p1,p6,p9,p7,p3,,,,
5,a6,p7,p3,p5,p2,,,,,,
6,a7,p1,p5,p9,,,,,,,
7,a8,p6,p3,,,,,,,,
8,a9,p1,p3,p5,p6,p7,p8,p9,,,
9,a10,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10


### Filter Dataset

In [8]:
permissions = set()
permissions_list = set()
apps_list = set()


for index, row in df.iterrows():
    app = row[0]
    
    for perm in row[1:]:
        if str(perm) == 'nan':
            break
        else:
            #perm_list = perm.split('.')
            perm_list = perm
            perm_t = perm_list.lower()
            permissions.add(perm_t)
            permissions_list.add((app, perm_t))
            apps_list.add(app)

### Create new dataframe from filter permissions and apps

In [9]:
df_apps = pd.DataFrame(list(permissions_list))
df_apps.head()
df_apps

Unnamed: 0,0,1
0,a9,p8
1,a4,p7
2,a5,p9
3,a6,p7
4,a3,p4
5,a7,p1
6,a9,p3
7,a9,p5
8,a17,p4
9,a18,p2


### Create a matrix from dataframe

In [10]:
matrix = pd.crosstab(index=df_apps[0], columns = df_apps[1])
matrix.to_csv('permissions_matrix_anpn2.csv', index=False)

### Calculate similarity matrix (>= 16gb to run)

In [11]:
similarity_matrix = np.float32(squareform(pdist(matrix, 'cosine')))

In [12]:
similarity_matrix_pearson = np.float32(squareform(pdist(matrix, 'correlation')))

### Write similarity matrix in file (.npy)

In [33]:
np.save('sim_matrix_anpn2.npy', similarity_matrix)

In [13]:
np.savetxt("sim_matriz_anpn2.csv", similarity_matrix, delimiter=",")

In [14]:
matrix

1,p1,p10,p2,p3,p4,p5,p6,p7,p8,p9
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a1,1,0,1,0,0,0,0,0,0,0
a10,1,1,1,1,1,1,1,1,1,1
a11,0,1,0,0,0,1,0,0,1,0
a12,1,0,0,1,1,0,0,1,0,1
a13,1,0,1,1,0,0,0,0,0,0
a14,0,0,0,0,1,1,1,0,0,0
a15,0,1,0,0,0,0,0,1,1,1
a16,1,0,0,1,0,1,0,1,0,0
a17,0,0,1,0,1,0,1,0,1,0
a18,1,1,1,0,0,0,1,0,0,1


In [15]:
similarity_matrix

array([[0.        , 0.5527864 , 1.        , 0.6837722 , 0.18350342,
        1.        , 1.        , 0.6464466 , 0.6464466 , 0.36754447,
        0.46547753, 1.        , 1.        , 0.5917517 , 1.        ,
        0.7113249 , 0.6464466 , 0.5917517 , 1.        , 0.73273873],
       [0.5527864 , 0.        , 0.45227745, 0.29289323, 0.45227745,
        0.45227745, 0.36754447, 0.36754447, 0.36754447, 0.29289323,
        0.16333997, 0.45227745, 0.6837722 , 0.45227745, 0.36754447,
        0.22540332, 0.36754447, 0.45227745, 0.5527864 , 0.16333997],
       [1.        , 0.45227745, 0.        , 1.        , 1.        ,
        0.6666667 , 0.42264974, 0.7113249 , 0.7113249 , 0.7418011 ,
        0.34534633, 0.6666667 , 1.        , 1.        , 0.42264974,
        1.        , 0.7113249 , 0.6666667 , 1.        , 0.56356424],
       [0.6837722 , 0.29289323, 1.        , 0.        , 0.48360223,
        0.7418011 , 0.5527864 , 0.3291796 , 0.7763932 , 0.6       ,
        0.6619383 , 0.48360223, 0.5527864 , 0

In [16]:
similarity_matrix_pearson

array([[0.        ,        nan, 1.3273269 , 1.        , 0.23623738,
        1.3273269 , 1.4082483 , 0.89793795, 0.89793795, 0.5       ,
        0.67267317, 1.3273269 , 1.1666666 , 0.7817821 , 1.4082483 ,
        1.1020621 , 0.89793795, 0.7817821 , 1.25      , 1.2182178 ],
       [       nan, 0.        ,        nan,        nan,        nan,
               nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan,        nan],
       [1.3273269 ,        nan, 0.        , 1.6546537 , 1.4285715 ,
        0.95238096, 0.64365166, 1.0890871 , 1.0890871 , 1.2182178 ,
        0.5714286 , 0.95238096, 1.2182178 , 1.4285715 , 0.64365166,
        1.8017837 , 1.0890871 , 0.95238096, 1.3273269 , 1.0476191 ],
       [1.        ,        nan, 1.6546537 , 0.        , 0.7817821 ,
        1.2182178 , 1.        , 0.5917517 , 1.4082483 , 1.2       ,
        1.6546537 , 0.7817821 , 0.6666667 , 0