In [59]:
import json
import math
from urllib.parse import urlparse

# NOTE(review): this binds the top-level matplotlib package to `plt`; the usual
# plotting alias is `import matplotlib.pyplot as plt`.
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled app-metadata table from the CSV next to this notebook.
data = pd.read_csv(filepath_or_buffer='1000_labeled_and_features.csv')

In [3]:
# Display the column labels of the freshly loaded frame.
data.columns

Index(['appId', 'true label', 'category', 'id', 'title', 'summary', 'icon',
       'price', 'free', 'minInstalls', 'maxInstalls', 'score', 'reviews',
       'developer', 'developerId', 'developerEmail', 'developerWebsite',
       'updated', 'version', 'genre', 'genreId', 'familyGenre',
       'familyGenreId', 'size', 'description', 'descriptionHTML', 'histogram',
       'offersIAP', 'adSupported', 'androidVersionText', 'androidVersion',
       'contentRating', 'screenshots', 'video', 'comments', 'recentChanges',
       'preregister', 'url', 'appId.1', 'similar', 'permissions', 'time',
       'lastseen', 'discontinued', 'installs', 'scoreText', 'ratings',
       'currency', 'priceText', 'developerAddress', 'privacyPolicy',
       'headerImage', 'videoImage', 'contentRatingDescription', 'released'],
      dtype='object')

In [4]:
# Keep only the first 1091 rows of the loaded table.
data = data.iloc[:1091]

In [5]:
# Discard the metadata columns that are not used anywhere below.
data = data.drop(columns=[
    'id', 'price', 'free', 'minInstalls', 'maxInstalls',
    'developer', 'developerId', 'developerEmail',
    'genreId', 'familyGenre', 'familyGenreId',
    'size', 'histogram', 'offersIAP', 'adSupported',
    'androidVersionText', 'preregister',
    'time', 'lastseen', 'currency', 'priceText',
    'developerAddress', 'contentRatingDescription', 'released',
    'updated', 'version', 'descriptionHTML', 'video',
    'installs', 'scoreText', 'ratings', 'videoImage',
    'appId.1', 'androidVersion',
])
           

In [6]:
# Preview the first rows after the unused columns were dropped.
data.head()

Unnamed: 0,appId,true label,category,title,summary,icon,score,reviews,developerWebsite,genre,...,contentRating,screenshots,comments,recentChanges,url,similar,permissions,discontinued,privacyPolicy,headerImage
0,a2ndappwhats.sdkw.com,surveillance:social-media,surveillance,2nd Account for Whatsapp,2 Whatsapps on Same Device | Same Whatsapps on...,//lh3.googleusercontent.com/nCOacYEXACIuhJWb9J...,4.0,6939.0,http://jh-jewelry.in/sureshkheni.html,Communication,...,Teen,"[""//lh3.googleusercontent.com/e32FVC6gtcvUj1B1...","[""Very good applications but not suppose to al...","[""Bug Fixes""]",https://play.google.com/store/apps/details?id=...,"[""com.lbe.parallel.intl"", ""com.app.tiki.multim...","[""read the contents of your USB storage"", ""mod...",20171111:1135,,
1,air.au.com.metro.DumbWaysToDie2,none:misc,none,Dumb Ways to Die 2: The Games,A new set of dumb characters are here to take ...,//lh3.googleusercontent.com/-ckfFIgLg_qkKbVVgv...,4.2,1584158.0,http://dumbwaystodie.com,Casual,...,Teen,"[""//lh3.googleusercontent.com/G_-nQwn58EEoZbof...","[""It's a good game, and there's variety, but t...","[""AMERICALAND"", ""We\u2019re celebrating the Fo...",https://play.google.com/store/apps/details?id=...,"[""com.popreach.dumbways"", ""com.zynga.looney"", ...","[""find accounts on the device"", ""read the cont...",,,
2,air.com.applauz.timeoutkids,control:use-limitation,control,Time Out - Behaviour Meter,Live Behaviour Meter companion app for &quot;T...,https://lh3.googleusercontent.com/uRLr8icEKEhV...,4.5,0.0,http://www.applauz-media.com/timeout,Parenting,...,Everyone,"[""https://lh3.googleusercontent.com/Fon1L3syRL...",[],,https://play.google.com/store/apps/details?id=...,"[""com.jeesmon.malayalambible"", ""com.bn.speacki...","[""full network access""]",,http://applauz-media.com/privacypolicy,https://lh3.googleusercontent.com/3_G29zRlpVOA...
3,ajx.com.calltracker,callerid:misc,callerid,Call Tracker,Call Tracker helps you identify numbers real-t...,//lh3.googleusercontent.com/RQi_G-l3CadNwi4PNc...,3.9,71.0,http://www.optcrm.com/privacypolicy.html,Tools,...,Everyone,"[""//lh3.googleusercontent.com/kIVoX7kgHvKld7LP...","[""Thanks"", ""Chutiya App and which people has c...","[""Improved search result""]",https://play.google.com/store/apps/details?id=...,"[""com.truecaller"", ""com.caller.id.location.gps...","[""read your contacts"", ""modify your contacts"",...",,,
4,allcall.location.tracker,callerid:location,callerid,All Call Location Tracker,All Calls Location Tracker shows the location ...,//lh4.ggpht.com/GJHw8FTW0PqpDnLoHDJvg1MIQHZwjV...,4.0,3406.0,,Communication,...,Everyone,"[""//lh3.googleusercontent.com/UZ80Bvz4FwbnfU4I...","[""Very Slow and looks like a spam :-("", ""Too m...",[],https://play.google.com/store/apps/details?id=...,"[""com.caller.id.location.gps.maps.phone.number...","[""read sensitive log data"", ""read your Web boo...",20171214:1228,,


In [7]:
# Turn both columns into 0/1 presence flags: 1 where a value exists, 0 where it is missing.
data['privacyPolicy'] = data['privacyPolicy'].notna().astype('int')
data['discontinued'] = data['discontinued'].notna().astype('int')

In [8]:
# Display the remaining column labels.
data.columns

Index(['appId', 'true label', 'category', 'title', 'summary', 'icon', 'score',
       'reviews', 'developerWebsite', 'genre', 'description', 'contentRating',
       'screenshots', 'comments', 'recentChanges', 'url', 'similar',
       'permissions', 'discontinued', 'privacyPolicy', 'headerImage'],
      dtype='object')

In [9]:
# One-hot encode the two 0/1 flag columns into separate indicator frames.
privacy = pd.get_dummies(data.loc[:, 'privacyPolicy'])
discontinued = pd.get_dummies(data.loc[:, 'discontinued'])

In [10]:
# Give the indicator columns readable names (first label is for flag 0, second for flag 1).
discontinued.columns = ['not_discontinued', 'discontinued']
privacy.columns = ['not_privacy', 'privacy']

In [11]:
# Preview the renamed indicator columns for the `discontinued` flag.
discontinued.head()

Unnamed: 0,not_discontinued,discontinued
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [12]:
# Remove the original flag columns; their indicator frames are kept separately.
data = data.drop(columns=['discontinued', 'privacyPolicy'])

In [13]:
# Append the indicator frames to the right-hand side of the main table.
data = pd.concat([data, privacy, discontinued], axis='columns')

In [14]:
# Display the column labels now that the indicator columns are attached.
data.columns

Index(['appId', 'true label', 'category', 'title', 'summary', 'icon', 'score',
       'reviews', 'developerWebsite', 'genre', 'description', 'contentRating',
       'screenshots', 'comments', 'recentChanges', 'url', 'similar',
       'permissions', 'headerImage', 'not_privacy', 'privacy',
       'not_discontinued', 'discontinued'],
      dtype='object')

In [15]:
# Set the two label columns aside so they can be re-attached after feature building.
labels = data.loc[:, ['true label', 'category']]

In [16]:
# Preview the extracted label columns.
labels.head()

Unnamed: 0,true label,category
0,surveillance:social-media,surveillance
1,none:misc,none
2,control:use-limitation,control
3,callerid:misc,callerid
4,callerid:location,callerid


In [17]:
# Take the label columns out of the feature table (they live in `labels`).
data = data.drop(columns=['true label', 'category'])

In [18]:
# Replace every missing value, in every column, with the integer 0.
data = data.fillna(value=0)

In [19]:
# Preview the table after missing values were filled with 0.
data.head()

Unnamed: 0,appId,title,summary,icon,score,reviews,developerWebsite,genre,description,contentRating,...,comments,recentChanges,url,similar,permissions,headerImage,not_privacy,privacy,not_discontinued,discontinued
0,a2ndappwhats.sdkw.com,2nd Account for Whatsapp,2 Whatsapps on Same Device | Same Whatsapps on...,//lh3.googleusercontent.com/nCOacYEXACIuhJWb9J...,4.0,6939.0,http://jh-jewelry.in/sureshkheni.html,Communication,use & Control another WhatsApp with the same d...,Teen,...,"[""Very good applications but not suppose to al...","[""Bug Fixes""]",https://play.google.com/store/apps/details?id=...,"[""com.lbe.parallel.intl"", ""com.app.tiki.multim...","[""read the contents of your USB storage"", ""mod...",0,1,0,0,1
1,air.au.com.metro.DumbWaysToDie2,Dumb Ways to Die 2: The Games,A new set of dumb characters are here to take ...,//lh3.googleusercontent.com/-ckfFIgLg_qkKbVVgv...,4.2,1584158.0,http://dumbwaystodie.com,Casual,There’s a whole new set of dumb characters tha...,Teen,...,"[""It's a good game, and there's variety, but t...","[""AMERICALAND"", ""We\u2019re celebrating the Fo...",https://play.google.com/store/apps/details?id=...,"[""com.popreach.dumbways"", ""com.zynga.looney"", ...","[""find accounts on the device"", ""read the cont...",0,1,0,1,0
2,air.com.applauz.timeoutkids,Time Out - Behaviour Meter,Live Behaviour Meter companion app for &quot;T...,https://lh3.googleusercontent.com/uRLr8icEKEhV...,4.5,0.0,http://www.applauz-media.com/timeout,Parenting,"You already have our app ""Time Out - Time Out ...",Everyone,...,[],0,https://play.google.com/store/apps/details?id=...,"[""com.jeesmon.malayalambible"", ""com.bn.speacki...","[""full network access""]",https://lh3.googleusercontent.com/3_G29zRlpVOA...,0,1,1,0
3,ajx.com.calltracker,Call Tracker,Call Tracker helps you identify numbers real-t...,//lh3.googleusercontent.com/RQi_G-l3CadNwi4PNc...,3.9,71.0,http://www.optcrm.com/privacypolicy.html,Tools,Call Tracker caller ID helps you identify numb...,Everyone,...,"[""Thanks"", ""Chutiya App and which people has c...","[""Improved search result""]",https://play.google.com/store/apps/details?id=...,"[""com.truecaller"", ""com.caller.id.location.gps...","[""read your contacts"", ""modify your contacts"",...",0,1,0,1,0
4,allcall.location.tracker,All Call Location Tracker,All Calls Location Tracker shows the location ...,//lh4.ggpht.com/GJHw8FTW0PqpDnLoHDJvg1MIQHZwjV...,4.0,3406.0,0,Communication,All Calls Location Tracker shows the location ...,Everyone,...,"[""Very Slow and looks like a spam :-("", ""Too m...",[],https://play.google.com/store/apps/details?id=...,"[""com.caller.id.location.gps.maps.phone.number...","[""read sensitive log data"", ""read your Web boo...",0,1,0,0,1


In [20]:
# Display the column labels that the text-building cell below works from.
data.columns

Index(['appId', 'title', 'summary', 'icon', 'score', 'reviews',
       'developerWebsite', 'genre', 'description', 'contentRating',
       'screenshots', 'comments', 'recentChanges', 'url', 'similar',
       'permissions', 'headerImage', 'not_privacy', 'privacy',
       'not_discontinued', 'discontinued'],
      dtype='object')

In [21]:
def _json_list_items(raw):
    """Return the text fragments stored in one JSON-list cell.

    Parameters
    ----------
    raw : object
        Cell value taken from a JSON-encoded column. Anything that is not a
        ``str`` (for example the ``0`` that ``fillna(0)`` leaves in empty
        cells) contributes no fragments.

    Returns
    -------
    list of str
        The decoded strings when ``raw`` is a JSON list of strings; otherwise
        ``[raw]`` so that text which cannot be decoded is still kept verbatim.
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return [raw]
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return parsed
    # Valid JSON of an unexpected shape: keep the raw text exactly once instead of
    # a partial decode followed by the raw text again.
    return [raw]


# Columns are selected by name so the loop does not depend on column positions.
_plain_text_columns = ['title', 'summary', 'genre', 'description', 'contentRating']
_json_list_columns = ['comments', 'recentChanges', 'permissions']

# One space-joined document per row: the plain-text fields first, then every
# comment, change-log entry and permission string.
text = []
for plain_values, json_values in zip(data[_plain_text_columns].values,
                                     data[_json_list_columns].values):
    # Non-string placeholders are skipped rather than failing on `str + int`.
    fragments = [value for value in plain_values if isinstance(value, str)]
    for raw in json_values:
        fragments.extend(_json_list_items(raw))
    # Stored as UTF-8 bytes, as before, so the resulting `text` column is unchanged.
    text.append(' '.join(fragments).encode('utf-8'))

In [22]:
# These columns have been folded into `text`, so remove them from the table.
data = data.drop(columns=[
    'title', 'summary', 'description', 'contentRating',
    'comments', 'recentChanges', 'permissions',
])

In [23]:
# Wrap the per-row documents in a one-column frame named `text`.
text = pd.DataFrame({'text': text})

In [24]:
# Append the `text` column to the right-hand side of the table.
data = pd.concat([data, text], axis='columns')

In [25]:
# Remove the identifier, URL and image columns that are not used as features.
data = data.drop(columns=[
    'appId', 'icon', 'reviews', 'developerWebsite', 'genre',
    'screenshots', 'url', 'similar', 'headerImage',
])

In [26]:
# Put the label columns back next to the features ahead of the train/test split.
data = pd.concat([data, labels], axis='columns')

In [27]:
# Preview the table of features, text and labels that gets split below.
data.head()

Unnamed: 0,score,not_privacy,privacy,not_discontinued,discontinued,text,true label,category
0,4.0,1,0,0,1,"b""2nd Account for Whatsapp 2 Whatsapps on Same...",surveillance:social-media,surveillance
1,4.2,1,0,1,0,"b""Dumb Ways to Die 2: The Games A new set of d...",none:misc,none
2,4.5,0,1,1,0,b'Time Out - Behaviour Meter Live Behaviour Me...,control:use-limitation,control
3,3.9,1,0,1,0,"b""Call Tracker Call Tracker helps you identify...",callerid:misc,callerid
4,4.0,1,0,0,1,"b""All Call Location Tracker All Calls Location...",callerid:location,callerid


In [28]:
# Hold out 25% of the rows, stratified on `category`, with a fixed seed.
train_data, test_data = train_test_split(
    data,
    test_size=0.25,
    random_state=28,
    stratify=data['category'].tolist(),
)

In [29]:
# Pull the raw document column out of each split for vectorisation.
train_text, test_text = train_data['text'], test_data['text']

In [30]:
# Hand-written English stop-word list; it is passed to the TF-IDF vectorizer
# through its `stop_words` argument in the next cell.
# NOTE(review): "the" appears twice (mid-list and as the final entry), and "fify"
# looks like a typo for "fifty" — confirm before relying on either.
stopwords = ["a", "about", "across", "after", "afterwards", "again", "all", 
             "almost", "alone", "along", "already", "also","although","always","am","among", "amongst",
             "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway",
             "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", 
             "becoming", "been", "before", "beforehand", "behind", "being", "beside", "besides", 
             "between", "beyond", "bill", "both", "but", "by", "call", "can", "cannot", "cant", 
             "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
             "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", 
             "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", 
             "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", 
             "forty", "found", "four", "from", "further", "get", "give", "go", "had", "has", 
             "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", 
             "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", 
             "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less",
             "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", 
             "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", 
             "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", 
             "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
             "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed",
             "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", 
             "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system",
             "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", 
             "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those",
             "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards",
             "twelve", "twenty", "two", "un", "until", "upon", "us", "very", "via", "was", "we", "well", 
             "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
             "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", 
             "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", 
             "the"]



In [31]:
# Configure the TF-IDF vectorizer: drop the custom stop words and ignore terms
# that occur in fewer than 5 training documents.
# The instance is built from the class's real name, not from the `tfidf` import
# alias this cell overwrites, so re-running the cell no longer raises
# "TypeError: 'TfidfVectorizer' object is not callable".
# `tfidf` stays bound to the configured instance because the next cell calls `tfidf.fit`.
tfidf = TfidfVectorizer(stop_words = stopwords, min_df = 5)

In [32]:
# Learn the vocabulary and IDF weights from the training documents only.
vocabulary = tfidf.fit(raw_documents=train_text.values)

In [33]:
# Project the training documents onto the fitted TF-IDF vocabulary.
train_text_matrix = vocabulary.transform(raw_documents=train_text.values)

In [34]:
# Convert the transformed training matrix to a plain array; a later cell wraps it
# in a DataFrame.
train_text_matrix = train_text_matrix.toarray()

In [35]:
# Look at the TF-IDF row of the first training document.
train_text_matrix[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
# Project the held-out documents onto the vocabulary fitted on the training set.
test_text_matrix = vocabulary.transform(raw_documents=test_text.values)

In [37]:
# Convert the transformed test matrix to a plain array; the next cell wraps it
# in a DataFrame.
test_text_matrix = test_text_matrix.toarray()

In [38]:
# Wrap both TF-IDF arrays in frames (default integer column labels) for concatenation.
tfidf_train = pd.DataFrame(data=train_text_matrix)
tfidf_test = pd.DataFrame(data=test_text_matrix)

In [39]:
# Renumber both splits from 0; the previous row labels are kept in an `index` column.
test_data = test_data.reset_index(drop=False)
train_data = train_data.reset_index(drop=False)

In [40]:
# Place the TF-IDF columns to the right of the existing columns in each split.
train_data = pd.concat([train_data, tfidf_train], axis='columns')
test_data = pd.concat([test_data, tfidf_test], axis='columns')

In [41]:
# Preview the training split with the TF-IDF columns attached.
train_data.head()

Unnamed: 0,index,score,not_privacy,privacy,not_discontinued,discontinued,text,true label,category,0,...,3488,3489,3490,3491,3492,3493,3494,3495,3496,3497
0,292,4.3,1,0,1,0,b'GPS Tracker By FollowMee Locate and Track Yo...,surveillance:location,surveillance,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,688,4.2,1,0,1,0,"b'Message Spy Remover (Anti Spy) Detect, Ident...",defense:anti-surveillance,defense,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,3.9,1,0,1,0,"b""Unlock Phone ( Unlock Codes ) Unlock your ce...",tutorial:settings,tutorial,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,236,4.3,1,0,1,0,"b""Equalizer music player booster The best musi...",none:misc,none,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,290,4.3,1,0,1,0,b'Fake Call & SMS Make prank call or SMS and t...,spoof:burner-phone,spoof,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Take `category` as the target first, then strip every non-feature column from both splits.
train_labels, test_labels = train_data['category'].values, test_data['category'].values
train_data = train_data.drop(columns=['text', 'true label', 'category', 'index'])
test_data = test_data.drop(columns=['text', 'true label', 'category', 'index'])

In [43]:
# Preview the final training feature columns.
train_data.head()

Unnamed: 0,score,not_privacy,privacy,not_discontinued,discontinued,0,1,2,3,4,...,3488,3489,3490,3491,3492,3493,3494,3495,3496,3497
0,4.3,1,0,1,0,0.0,0.0,0.0,0.0,0.021401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.2,1,0,1,0,0.0,0.0,0.0,0.0,0.0305,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.9,1,0,1,0,0.0,0.0,0.0,0.0,0.035811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.3,1,0,1,0,0.0,0.0,0.0,0.0,0.020444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.3,1,0,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Replace both feature frames with their underlying arrays for the model.
train_data, test_data = train_data.values, test_data.values

In [46]:
# Fit a logistic-regression classifier with library-default settings on the training split.
lr = LogisticRegression()
lr.fit(X=train_data, y=train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
# Score the fitted model on the held-out split.
lr.score(X=test_data, y=test_labels)

0.7435897435897436

In [48]:
# Score the fitted model on the training split, for comparison with the held-out score.
lr.score(X=train_data, y=train_labels)

0.8325183374083129

In [49]:
# Predicted category for every held-out row; the metric cells below reuse this.
predictions = lr.predict(X=test_data)

In [65]:
# Macro-averaged precision on the held-out split.
precision(y_true=test_labels, y_pred=predictions, average='macro')

0.6842356739305891

In [66]:
# Macro-averaged recall on the held-out split.
recall(y_true=test_labels, y_pred=predictions, average='macro')

0.4138886173311871

In [67]:
# Macro-averaged F1 on the held-out split.
f1_score(y_true=test_labels, y_pred=predictions, average='macro')

0.4717869232113626