In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
from os import listdir
import json

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [3]:
# Mount Google Drive inside the Colab runtime at ./drive (a remount is requested
# via force_remount=True).
from google.colab import drive
drive.mount('./drive', force_remount=True)

# Directory prefix that the loading cells join with each CSV file name.
path_prefix = './drive/My Drive/'

Mounted at ./drive


In [4]:
# Read the Google Play Store listing CSV from the mounted Drive folder.
filename = "Google-Playstore.csv"
play_store_csv = join(path_prefix, filename)
df = pd.read_csv(play_store_csv)

In [5]:
# Keep only the 10,000 rows with the largest 'Maximum Installs' values.
play_store = df.nlargest(n=10000, columns='Maximum Installs')

In [6]:
# (rows, columns) of the top-installs subset.
play_store.shape

(10000, 24)

In [7]:
# Column dtypes of the top-installs subset.
play_store.dtypes

App Name              object
App Id                object
Category              object
Rating               float64
Rating Count         float64
Installs              object
Minimum Installs     float64
Maximum Installs       int64
Free                    bool
Price                float64
Currency              object
Size                  object
Minimum Android       object
Developer Id          object
Developer Website     object
Developer Email       object
Released              object
Last Updated          object
Content Rating        object
Privacy Policy        object
Ad Supported            bool
In App Purchases        bool
Editors Choice          bool
Scraped Time          object
dtype: object

In [None]:
# Preview the first 20 rows (nlargest returns them ordered by 'Maximum Installs', descending).
play_store.head(20)

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Scraped Time
2155096,Google Play services,com.google.android.gms,Tools,4.2,35128398.0,"10,000,000,000+",10000000000.0,12057627016,True,0.0,...,https://developers.google.com/android/google-p...,apps-help@google.com,"May 24, 2012","Jun 10, 2021",Everyone,http://www.google.com/policies/privacy,False,False,False,2021-06-16 10:41:26
881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,True,0.0,...,https://support.google.com/youtube/topic/24225...,ytandroid-support@google.com,"Oct 20, 2010","Jun 16, 2021",Teen,http://www.google.com/policies/privacy,True,False,False,2021-06-16 10:28:37
1773294,Google,com.google.android.googlequicksearchbox,Tools,4.2,19798962.0,"5,000,000,000+",5000000000.0,9154248491,True,0.0,...,https://www.google.com/search/about/,apps-help@google.com,"Aug 12, 2010","Jun 11, 2021",Everyone,http://www.google.com/policies/privacy,True,False,False,2021-06-16 05:12:00
1733042,Google Maps - Navigate & Explore,com.google.android.apps.maps,Travel & Local,3.8,13576045.0,"5,000,000,000+",5000000000.0,9141671889,True,0.0,...,http://maps.google.com/about/,apps-help@google.com,,"Jun 15, 2021",Everyone,http://www.google.com/policies/privacy,True,False,True,2021-06-16 04:38:02
1060335,Google Text-to-Speech,com.google.android.tts,Tools,4.1,254700.0,"5,000,000,000+",5000000000.0,9034404884,True,0.0,...,http://www.google.com/,tts-feedback@google.com,"Oct 10, 2013","Mar 29, 2021",Everyone,http://www.google.com/policies/privacy,False,False,False,2021-06-16 13:16:56
944254,Google Chrome: Fast & Secure,com.android.chrome,Communication,4.1,31481796.0,"5,000,000,000+",5000000000.0,8925640788,True,0.0,...,http://www.google.com/chrome/android,apps-help@google.com,"Feb 7, 2012","Jun 14, 2021",Everyone,http://www.google.com/chrome/intl/en/privacy.html,False,False,False,2021-06-16 11:26:31
2099458,Gmail,com.google.android.gm,Communication,4.2,9488421.0,"5,000,000,000+",5000000000.0,8756574289,True,0.0,...,http://support.google.com/mail/bin/topic.py?hl...,apps-help@google.com,"Sep 21, 2010","Jun 16, 2021",Everyone,http://www.google.com/policies/privacy,True,False,False,2021-06-16 09:53:00
2011395,Android Accessibility Suite,com.google.android.marvin.talkback,Tools,4.2,2666450.0,"5,000,000,000+",5000000000.0,7408134567,True,0.0,...,https://developer.android.com/guide/topics/ui/...,apps-help@google.com,,"May 19, 2021",Everyone,http://www.google.com/policies/privacy,False,False,False,2021-06-16 08:36:32
893676,Google Drive,com.google.android.apps.docs,Productivity,4.3,639307.0,"5,000,000,000+",5000000000.0,7028265259,True,0.0,...,https://support.google.com/drive/?p=android_dr...,apps-help@google.com,"Apr 27, 2011","Jun 09, 2021",Everyone,http://www.google.com/policies/privacy,False,True,False,2021-06-16 10:39:49
2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,True,0.0,...,https://www.facebook.com/facebook,facebook.android@fb.com,,"Jun 16, 2021",Teen,https://www.facebook.com/about/privacy/,True,True,False,2021-06-16 11:40:20


In [6]:
# Load the permissions dataset from the mounted Drive folder.
filename = "permissions_dataset.csv"
permissions_csv = join(path_prefix, filename)
permissions = pd.read_csv(permissions_csv)

In [9]:
# (rows, columns) of the permissions dataset.
permissions.shape

(2292693, 3)

In [10]:
# Column dtypes of the permissions dataset.
permissions.dtypes

appId             object
appName           object
allPermissions    object
dtype: object

In [7]:
# Align the column names with the Play Store frame so the two can be merged on 'App Id'.
permissions = permissions.rename(
    columns={'appId': 'App Id', 'allPermissions': 'All Permissions'}
)

In [8]:
# Drop the permissions dataset's own app-name column.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'appName' has already been removed.
permissions = permissions.drop(columns=['appName'], errors='ignore')

In [13]:
# Show 5 randomly chosen rows (no random_state is passed, so the rows differ between runs).
permissions.sample(5)

Unnamed: 0,App Id,All Permissions
1296667,com.br.latestdontfall,"[{'permission': 'view network connections', 't..."
266917,eu.hella,[{'permission': 'approximate location (network...
1946830,com.promedia.pictoeduca,[{'permission': 'read phone status and identit...
2256167,blueique.studios.KidsHeadsUp,[]
212872,org.metro.ridemetro,[{'permission': 'approximate location (network...


In [9]:
# Combine the permissions with the Play Store metadata on the shared 'App Id' key.
# This is an inner join, so an app that is missing from either frame is dropped.
data_df = permissions.merge(play_store, how='inner', on='App Id')

In [10]:
# (rows, columns) of the merged frame.
data_df.shape

(9990, 25)

In [16]:
# Preview the first rows of the merged frame.
data_df.head()

Unnamed: 0,App Id,All Permissions,App Name,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Scraped Time
0,com.google.android.apps.messaging,[{'permission': 'modify or delete the contents...,Messages,Communication,4.4,5305758.0,"1,000,000,000+",1000000000.0,1931517750,True,...,http://www.google.com/mobile/android/,apps-help@google.com,"Nov 12, 2014","Jun 10, 2021",Everyone,http://www.google.com/policies/privacy,False,False,False,2021-06-16 06:11:32
1,com.alp.traingamesfreesimulator,"[{'permission': 'view network connections', 't...",Train Games Simulator : Indian Train Driving G...,Weather,4.1,21189.0,"5,000,000+",5000000.0,9632078,True,...,https://www.bajakestudios.com,info.2bitventures@gmail.com,"Feb 16, 2017","May 16, 2021",Everyone,http://bajakestudios.com/privacy-policy/,True,True,False,2021-06-16 03:41:11
2,com.rapido.passenger,"[{'permission': 'retrieve running apps', 'type...",Rapido - Bike Taxi,Maps & Navigation,4.1,701278.0,"10,000,000+",10000000.0,23238658,True,...,https://www.rapido.bike,shoutout@rapido.bike,"Sep 30, 2015","Jun 12, 2021",Everyone,https://rapido.bike/privacy.html,False,False,False,2021-06-16 06:16:28
3,de.aktiwir.aktibmi,"[{'permission': 'find accounts on the device',...",Weight Loss Tracker & BMI - aktiBMI,Health & Fitness,4.5,4798.0,"5,000,000+",5000000.0,9767859,True,...,http://www.aktibmi.com,aktiwir@gmail.com,"May 30, 2015","Jun 09, 2021",Everyone,http://www.aktibmi.com/cms/privacy-policy-akti...,True,True,False,2021-06-16 06:03:47
4,com.gamevil.kritikamobile.android.google.globa...,"[{'permission': 'retrieve running apps', 'type...",Kritika: The White Knights,Role Playing,4.3,880658.0,"10,000,000+",10000000.0,25135365,True,...,http://www.withhive.com,help@gamevil.com,"May 29, 2014","Jun 02, 2021",Teen,http://terms.withhive.com/terms/bridge/gamevil...,True,True,True,2021-06-16 03:33:33


In [11]:
# NOTE(review): this loop reads each "All Permissions" value and, when it is a
# string, writes the very same string back into the same cell, so data_df is
# left unchanged. It looks like a leftover placeholder for a parsing step —
# confirm and remove.
for i,row in data_df.iterrows(): 
  perms=row["All Permissions"]
  if type(perms)==str:

    data_df.at[i,"All Permissions"]=perms


In [24]:
import ast
import re  # not used by this cell any more; kept so the name stays available to later cells


def _parse_permissions(raw):
  """Convert one "All Permissions" cell into a list of permission dicts.

  Parameters
  ----------
  raw : object
      The cell value. Strings are expected to be the textual form of a list of
      dicts, e.g. "[{'permission': 'retrieve running apps', 'type': '...'}]".

  Returns
  -------
  list of dict
      One ``{"permission": ..., "type": ...}`` dict per entry. A value that is
      already a list is returned unchanged (so re-running the cell is safe);
      missing values and strings that cannot be parsed give an empty list.
  """
  if isinstance(raw, list):
    return raw
  if not isinstance(raw, str):
    # NaN / None: treat as "no permissions" so later cells can always iterate.
    return []
  try:
    # literal_eval only evaluates Python literals, so it is safe on file content
    # and copes with commas, colons, braces or quotes inside the values.
    entries = ast.literal_eval(raw)
  except (ValueError, SyntaxError):
    _unparsed_rows.append(raw)
    return []
  if not isinstance(entries, (list, tuple)):
    _unparsed_rows.append(raw)
    return []
  return [
      {"permission": entry.get("permission"), "type": entry.get("type")}
      for entry in entries
      if isinstance(entry, dict)
  ]


# Raw strings that could not be parsed, collected so the problem is visible
# instead of being silently swallowed.
_unparsed_rows = []
data_df["All Permissions"] = data_df["All Permissions"].map(_parse_permissions)
if _unparsed_rows:
  print("could not parse 'All Permissions' for", len(_unparsed_rows), "rows")

In [40]:
import json
import networkx as nx

# Undirected graph with one node per app id in the merged frame; edges are added later.
permNet = nx.Graph()
permNet.add_nodes_from(data_df['App Id'].tolist())

node_total = permNet.number_of_nodes()
print(node_total)
print(permNet)

9990
Graph with 9990 nodes and 0 edges


In [41]:
from collections import Counter
from itertools import combinations

# Connect every pair of apps that share at least one permission; the edge
# weight is the number of matching (entry of app 1, entry of app 2) pairs.
#
# The permission multiset of each app is computed once up front. Filtering
# data_df twice for every pair of nodes made the pairwise loop impractically
# slow. The first row wins if an app id occurs more than once.
perm_counts = {}
for app_id, perms in zip(data_df["App Id"], data_df["All Permissions"]):
  if app_id in perm_counts:
    continue
  # Anything that is not a list (e.g. NaN) is treated as "no permissions".
  entries = perms if isinstance(perms, list) else []
  perm_counts[app_id] = Counter(entry["permission"] for entry in entries)

# The graph is undirected, so each unordered pair only needs to be visited once.
# NOTE: this is still one step per pair of apps, and widely shared permissions
# can make the resulting graph very dense.
for id1, id2 in combinations(list(permNet.nodes), 2):
  counts1 = perm_counts[id1]
  counts2 = perm_counts[id2]
  if len(counts1) > len(counts2):
    # Iterate over the smaller multiset; the sum below is symmetric.
    counts1, counts2 = counts2, counts1
  common_perms = sum(
      count * counts2[perm] for perm, count in counts1.items() if perm in counts2
  )
  if common_perms > 0:
    permNet.add_edge(id1, id2, weight=common_perms)
    

KeyboardInterrupt: ignored

In [None]:
def nodePropertiesMethod(network):
  """Collect per-node metrics of a NetworkX graph into a DataFrame.

  Parameters
  ----------
  network : networkx graph
      Directed or undirected graph.

  Returns
  -------
  pandas.DataFrame
      One row per node and one column per metric. Directed graphs get
      'outdegree' and 'indegree' columns; undirected graphs get a single
      'degree' column instead, because out_degree()/in_degree() only exist on
      directed graphs. The remaining columns are 'eigvec-centrality',
      'betweenness-centrality', 'closeness-centrality', 'clustering' and
      'avg.neighbor deg.'.
  """
  if network.is_directed():
    metrics = {
        'outdegree': dict(network.out_degree()),
        'indegree': dict(network.in_degree()),
    }
  else:
    metrics = {'degree': dict(network.degree())}

  metrics['eigvec-centrality'] = dict(nx.eigenvector_centrality(network))
  metrics['betweenness-centrality'] = dict(nx.betweenness_centrality(network))
  metrics['closeness-centrality'] = dict(nx.closeness_centrality(network))
  metrics['clustering'] = dict(nx.clustering(network))
  metrics['avg.neighbor deg.'] = dict(nx.average_neighbor_degree(network))

  # Re-key from metric -> node -> value to node -> metric -> value so that
  # from_dict(orient='index') yields one row per node.
  nodeProperties = dict()
  for label, values in metrics.items():
    for node, value in values.items():
      nodeProperties.setdefault(node, dict())[label] = value

  nodePropertiesDf = pd.DataFrame.from_dict(nodeProperties, orient='index')
  return nodePropertiesDf

# Compute the node-level metrics for the permission network and display them.
permsDf=nodePropertiesMethod(permNet)
permsDf

In [None]:
def stats_of_df(df,name):
  """Print, for each column of ``df``, the entry with the largest value.

  ``df`` is sorted in descending order by one column at a time and the first
  entry of that column is printed after a "<name> with the highest <column>:"
  label. Nothing is returned.
  """
  for column in df.columns:
    ranked = df.sort_values(by=[column], ascending=False)
    top_entry = ranked.head(1)[column]
    print(f"{name} with the highest {column}:", top_entry, "\n")

# Print the top-ranked app for every metric column of permsDf.
stats_of_df(permsDf,"App")

In [None]:
# Draw the permission network with small circular nodes.
draw_options = {
    "node_size": 15,
    "node_shape": "o",
    "node_color": "#1DA1F2",
}
nx.draw(permNet, **draw_options)

In [None]:
def graphProperties(G, name): #a function to calculate the graph properties
  """Print summary statistics of graph ``G``; ``name`` labels the output.

  Reports node and edge counts, connected components with radius and diameter
  (taken on the undirected view of ``G``), density, the number of maximal
  cliques and, for directed graphs only, strong connectivity.

  Parameters
  ----------
  G : networkx graph
      Directed or undirected graph.
  name : str
      Label used in the printed messages.

  Returns
  -------
  None
  """

  print("graph properties for",name,"\n")

  G_dir=G
  G=nx.to_undirected(G)

  nodes=nx.number_of_nodes(G)
  edges=nx.number_of_edges(G)

  print("number of nodes in {} is".format(name),nodes)
  print("number of edges in {} is".format(name),edges,"\n")

  if nodes==0:
    # Connectivity is undefined for a graph without nodes (nx.is_connected raises).
    print("{} has no nodes; remaining properties are skipped.".format(name))
    print("---------------------------------------------------------------")
    return

  if nx.is_connected(G):
    print("largest connected component is the graph itself, having the following nodes:")
    largest_cc = max(nx.connected_components(G), key=len)
    print(largest_cc,"\n")
    # Radius and diameter are the min and max eccentricity; compute it once.
    ecc=nx.eccentricity(G)
    print("radius of the {} graph is:".format(name),min(ecc.values()))
    print("diameter of the {} graph is:".format(name),max(ecc.values()),"\n")
  else:
    diams = []
    rads = []

    print("Connected subgraphs in graph:")
    # Single pass over the components: a generator can only be consumed once,
    # so radius and diameter must both be collected in the same loop.
    for nodes_in_component in nx.connected_components(G):
      c = G.subgraph(nodes_in_component).copy()
      print(c)
      ecc = nx.eccentricity(c)
      diams.append(max(ecc.values()))
      rads.append(min(ecc.values()))
    largest_cc = max(nx.connected_components(G), key=len)
    print(largest_cc)
    print("maximum diameter among the connected subgraphs of {}:".format(name),max(diams))
    print("maximum radius among the connected subgraphs of {}:".format(name),max(rads),"\n")

  # Count maximal cliques via find_cliques, which works on NetworkX releases
  # that no longer provide graph_number_of_cliques.
  cliques=sum(1 for _ in nx.find_cliques(G))

  # nx.density uses the formula matching the graph type (m/(n(n-1)) directed,
  # 2m/(n(n-1)) undirected) and returns 0 for a single-node graph.
  density = nx.density(G_dir)
  print("density of {} is:".format(name),density,"\n")
  print("number of cliques in {} is:".format(name),cliques,"\n")

  if not G_dir.is_directed():
    # Strong connectivity is only defined for directed graphs.
    print("{} is undirected, so strong connectivity does not apply.".format(name))
  elif nx.is_strongly_connected(G_dir):
    print("number of strongly connected components in {} is".format(name),nx.number_strongly_connected_components(G_dir))
  else:
    print("{} is not strongly connected.".format(name))
  print("---------------------------------------------------------------")

In [None]:
# Print the graph-level summary for the permission network.
graphProperties(permNet,"permission network")