### Import libraries and file

In [29]:
import pandas as pd
import csv
import numpy as np
import warnings
from scipy.spatial.distance import squareform, pdist
from scipy.cluster.hierarchy import complete, fcluster
from scipy.cluster.hierarchy import linkage
import seaborn as sns
from kmedoids import kmedoids

In [30]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [31]:
# Input CSV: each row holds an app id followed by that app's permissions.
filename = "anpn2.csv"

### Function to find the maximum number of columns in any row

In [32]:
#preprocessing
def get_max_len(path=None):
    """Return the largest number of fields found on any row of a CSV file.

    The input rows have different lengths, so this width is computed up
    front and used to generate enough column names when the file is loaded.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to scan. Defaults to the notebook-level ``filename``.

    Returns
    -------
    int
        Field count of the widest row, or 0 when the file contains no rows.
    """
    if path is None:
        path = filename
    # newline='' is the mode the csv module expects, so quoted fields that
    # contain line breaks are still parsed as part of a single row.
    with open(path, 'r', newline='') as csvfile:
        return max((len(row) for row in csv.reader(csvfile)), default=0)
    
# Report the field count of the widest row in the configured input file.
print(get_max_len())

11


### Read and show dataset

In [33]:
# Load the ragged CSV: there is no header row, columns get integer labels
# 0..max_len-1 so shorter rows are padded with NaN, and cells equal to 0 are
# additionally treated as missing.
df = pd.read_csv(
    filename,
    header=None,
    names=range(get_max_len()),
    na_values=0,
    low_memory=False,
)

In [34]:
# Preview the first rows: column 0 is the app id, the remaining columns are its permissions.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,a1,p1,p2,,,,,,,,
1,a2,p5,p7,p9,,,,,,,
2,a3,p2,p3,p4,,,,,,,
3,a4,p7,p5,p8,p3,,,,,,
4,a5,p4,p1,p6,p9,p7,p3,,,,


In [35]:
# Render the whole frame as the cell output.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,a1,p1,p2,,,,,,,,
1,a2,p5,p7,p9,,,,,,,
2,a3,p2,p3,p4,,,,,,,
3,a4,p7,p5,p8,p3,,,,,,
4,a5,p4,p1,p6,p9,p7,p3,,,,
5,a6,p7,p3,p5,p2,,,,,,
6,a7,p1,p5,p9,,,,,,,
7,a8,p6,p3,,,,,,,,
8,a9,p1,p3,p5,p6,p7,p8,p9,,,
9,a10,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10


### Filter Dataset

In [36]:
# Distinct permission names (lower-cased) seen anywhere in the dataset.
permissions = set()
# Distinct (app, permission) pairs. Despite the name this is a set, so a
# permission repeated on the same app is stored once.
permissions_list = set()
# Apps that have at least one permission.
apps_list = set()


for index, row in df.iterrows():
    # Column 0 holds the app id; every following column is one permission.
    app = row[0]

    for perm in row[1:]:
        # Empty cells arrive as NaN. Skip them rather than stopping at the
        # first one, so a blank field in the middle of a row does not hide
        # the permissions that come after it.
        if pd.isna(perm):
            continue
        perm_t = perm.lower()
        permissions.add(perm_t)
        permissions_list.add((app, perm_t))
        apps_list.add(app)

### Create a new dataframe from the filtered permissions and apps

In [37]:
# One row per distinct (app, permission) pair: column 0 is the app, column 1
# the lower-cased permission. Rows follow set iteration order, which is not
# guaranteed to be the same from run to run.
df_apps = pd.DataFrame(list(permissions_list))
# A notebook cell renders only its last expression, so this is the single display call.
df_apps

Unnamed: 0,0,1
0,a4,p7
1,a5,p4
2,a9,p9
3,a15,p9
4,a11,p10
5,a16,p3
6,a2,p9
7,a7,p5
8,a15,p10
9,a2,p7


### Create a matrix from dataframe

In [38]:
# App-by-permission table: one row per app, one column per permission, each
# cell counting how many df_apps rows hold that (app, permission) pair.
matrix = pd.crosstab(
    index=df_apps[0],
    columns=df_apps[1],
)
#matrix.to_csv('permissions_matrix_fd1.csv', index=False)

### Calculate similarity matrix (pairwise cosine distances; needs >= 16 GB of memory to run)

In [39]:
# Pairwise cosine distances between the app rows of `matrix`, expanded to a
# square float32 array. Despite the variable name these are distances:
# 0.0 on the diagonal, larger values for less similar apps.
# The condensed vector is cast to float32 *before* squareform so the full
# square matrix is never materialised in float64, roughly halving peak memory.
similarity_matrix = squareform(pdist(matrix, 'cosine').astype(np.float32))

### Write similarity matrix to file (.npy)

In [40]:
#np.save('sim_matrix_fd1.npy', similarity_matrix)

In [41]:
#np.savetxt("foo_fd1.csv", similarity_matrix, delimiter=",", fmt='%1.8f')

In [42]:
# Clear the name shown above the row index. Assigning None instead of using
# `del` is safe to re-run and also works on pandas versions where
# `Index.name` is a property that cannot be deleted.
matrix.index.name = None

In [43]:
# Clear the name shown above the column labels. Assigning None instead of
# using `del` is safe to re-run and also works on pandas versions where
# `Index.name` is a property that cannot be deleted.
matrix.columns.name = None

In [44]:
# Inspect the row labels. The recorded output lists them in string order
# (a1, a10, a11, ..., a2, ...), not numeric order.
matrix.index

Index(['a1', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18',
       'a19', 'a2', 'a20', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9'],
      dtype='object')

In [45]:
#matrix.reindex(['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8',
#       'a9', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18',
#       'a19', 'a20'])


In [46]:
# Show the rows for apps a1..a12 in numeric order. The result is only
# displayed; it is not assigned back to `matrix`.
matrix.reindex(['a{}'.format(n) for n in range(1, 13)])

Unnamed: 0,p1,p10,p2,p3,p4,p5,p6,p7,p8,p9
a1,1,0,1,0,0,0,0,0,0,0
a2,0,0,0,0,0,1,0,1,0,1
a3,0,0,1,1,1,0,0,0,0,0
a4,0,0,0,1,0,1,0,1,1,0
a5,1,0,0,1,1,0,1,1,0,1
a6,0,0,1,1,0,1,0,1,0,0
a7,1,0,0,0,0,1,0,0,0,1
a8,0,0,0,1,0,0,1,0,0,0
a9,1,0,0,1,0,1,1,1,1,1
a10,1,1,1,1,1,1,1,1,1,1


In [47]:
# Wrap the distance array in a DataFrame for display. No labels are passed,
# so rows and columns are numbered by position (0, 1, 2, ...) instead of
# carrying the app names.
matrix_cosine_distance = pd.DataFrame(similarity_matrix)

In [48]:
# Render the pairwise distance table as the cell output.
matrix_cosine_distance

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.552786,1.0,0.683772,0.183503,1.0,1.0,0.646447,0.646447,0.367544,0.465478,1.0,1.0,0.591752,1.0,0.711325,0.646447,0.591752,1.0,0.732739
1,0.552786,0.0,0.452277,0.292893,0.452277,0.452277,0.367544,0.367544,0.367544,0.292893,0.16334,0.452277,0.683772,0.452277,0.367544,0.225403,0.367544,0.452277,0.552786,0.16334
2,1.0,0.452277,0.0,1.0,1.0,0.666667,0.42265,0.711325,0.711325,0.741801,0.345346,0.666667,1.0,1.0,0.42265,1.0,0.711325,0.666667,1.0,0.563564
3,0.683772,0.292893,1.0,0.0,0.483602,0.741801,0.552786,0.32918,0.776393,0.6,0.661938,0.483602,0.552786,0.483602,0.552786,0.087129,0.552786,0.483602,0.683772,0.323877
4,0.183503,0.452277,1.0,0.483602,0.0,1.0,1.0,0.42265,0.711325,0.483602,0.563564,1.0,0.42265,0.333333,0.711325,0.528596,0.42265,0.666667,0.591752,0.563564
5,1.0,0.452277,0.666667,0.741801,1.0,0.0,1.0,0.711325,0.42265,0.741801,0.563564,0.666667,1.0,0.666667,0.711325,0.528596,0.711325,0.666667,0.591752,0.563564
6,1.0,0.367544,0.42265,0.552786,1.0,1.0,0.0,0.75,0.75,0.552786,0.433053,0.42265,1.0,1.0,0.5,0.591752,0.75,0.711325,1.0,0.433053
7,0.646447,0.367544,0.711325,0.32918,0.42265,0.711325,0.75,0.0,1.0,0.776393,0.622036,0.42265,0.5,0.711325,0.25,0.387628,0.25,0.42265,0.646447,0.244071
8,0.646447,0.367544,0.711325,0.776393,0.711325,0.42265,0.75,1.0,0.0,0.552786,0.433053,1.0,1.0,0.42265,0.75,0.591752,0.75,1.0,0.646447,0.622036
9,0.367544,0.292893,0.741801,0.6,0.483602,0.741801,0.552786,0.776393,0.552786,0.0,0.154846,0.741801,1.0,0.741801,1.0,0.452277,0.776393,0.483602,0.683772,0.492907


In [49]:
# Run k-medoids on the precomputed distance matrix for a range of cluster
# counts k, keeping both values returned for every k.
medoid = {}   # k -> first value returned by kmedoids
cluster = {}  # k -> second value returned by kmedoids

# The sweep used to stop at a hard-coded 20 (the sample holds 20 apps). Cap
# the bound by the actual number of rows so a smaller input is never asked
# for as many clusters as it has points; for 20 or more rows nothing changes.
max_k = min(20, len(similarity_matrix))

# RuntimeWarnings emitted while clustering are silenced for the whole sweep.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    for i in range(2, max_k):
        medoid[i], cluster[i] = kmedoids(similarity_matrix, k=i)

In [50]:
# Persist the per-k medoid results. `medoid` is a dict, not an array.
# NOTE(review): np.save presumably stores it as a pickled 0-d object array, so
# loading it back likely needs np.load(..., allow_pickle=True).item() - confirm.
np.save('sample_model_20x10_medoid.npy', medoid)
# medoid.to_csv('sample_model_20x10_medoid.csv', index=False)

In [51]:
# Persist the per-k cluster results. `cluster` is a dict, not an array.
# NOTE(review): np.save presumably stores it as a pickled 0-d object array, so
# loading it back likely needs np.load(..., allow_pickle=True).item() - confirm.
np.save('sample_model_20x10_cluster.npy', cluster)
# cluster.to_csv('sample_model_20x10_cluster.csv', index=False)