In [377]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from urllib.parse import urlparse
import math
import json
import langid
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.model_selection import cross_validate as cv
from sklearn.model_selection import RandomizedSearchCV as rs

In [2]:
data = pd.read_csv('1000_labeled_and_features.csv')

In [3]:
data.columns

Index(['appId', 'true label', 'category', 'id', 'title', 'summary', 'icon',
       'price', 'free', 'minInstalls', 'maxInstalls', 'score', 'reviews',
       'developer', 'developerId', 'developerEmail', 'developerWebsite',
       'updated', 'version', 'genre', 'genreId', 'familyGenre',
       'familyGenreId', 'size', 'description', 'descriptionHTML', 'histogram',
       'offersIAP', 'adSupported', 'androidVersionText', 'androidVersion',
       'contentRating', 'screenshots', 'video', 'comments', 'recentChanges',
       'preregister', 'url', 'appId.1', 'similar', 'permissions', 'time',
       'lastseen', 'discontinued', 'installs', 'scoreText', 'ratings',
       'currency', 'priceText', 'developerAddress', 'privacyPolicy',
       'headerImage', 'videoImage', 'contentRatingDescription', 'released'],
      dtype='object')

In [4]:
data = data.head(1091)

In [5]:
data = data.drop(['id', 'price', 'free', 'minInstalls', 'maxInstalls', 'developer', 'developerId', 'developerEmail', 'genreId', 
'familyGenre', 'familyGenreId', 'size', 'histogram', 'offersIAP', 'adSupported', 'androidVersionText','preregister', 
'time', 'lastseen', 'currency', 'priceText', 'developerAddress', 'contentRatingDescription', 'released',
'updated', 'version', 'descriptionHTML', 'video', 'installs', 'scoreText', 'ratings', 'videoImage', 'appId.1', 
'androidVersion'], axis =1)
           

In [6]:
data.head()

Unnamed: 0,appId,true label,category,title,summary,icon,score,reviews,developerWebsite,genre,...,contentRating,screenshots,comments,recentChanges,url,similar,permissions,discontinued,privacyPolicy,headerImage
0,a2ndappwhats.sdkw.com,surveillance:social-media,surveillance,2nd Account for Whatsapp,2 Whatsapps on Same Device | Same Whatsapps on...,//lh3.googleusercontent.com/nCOacYEXACIuhJWb9J...,4.0,6939.0,http://jh-jewelry.in/sureshkheni.html,Communication,...,Teen,"[""//lh3.googleusercontent.com/e32FVC6gtcvUj1B1...","[""Very good applications but not suppose to al...","[""Bug Fixes""]",https://play.google.com/store/apps/details?id=...,"[""com.lbe.parallel.intl"", ""com.app.tiki.multim...","[""read the contents of your USB storage"", ""mod...",20171111:1135,,
1,air.au.com.metro.DumbWaysToDie2,none:misc,none,Dumb Ways to Die 2: The Games,A new set of dumb characters are here to take ...,//lh3.googleusercontent.com/-ckfFIgLg_qkKbVVgv...,4.2,1584158.0,http://dumbwaystodie.com,Casual,...,Teen,"[""//lh3.googleusercontent.com/G_-nQwn58EEoZbof...","[""It's a good game, and there's variety, but t...","[""AMERICALAND"", ""We\u2019re celebrating the Fo...",https://play.google.com/store/apps/details?id=...,"[""com.popreach.dumbways"", ""com.zynga.looney"", ...","[""find accounts on the device"", ""read the cont...",,,
2,air.com.applauz.timeoutkids,control:use-limitation,control,Time Out - Behaviour Meter,Live Behaviour Meter companion app for &quot;T...,https://lh3.googleusercontent.com/uRLr8icEKEhV...,4.5,0.0,http://www.applauz-media.com/timeout,Parenting,...,Everyone,"[""https://lh3.googleusercontent.com/Fon1L3syRL...",[],,https://play.google.com/store/apps/details?id=...,"[""com.jeesmon.malayalambible"", ""com.bn.speacki...","[""full network access""]",,http://applauz-media.com/privacypolicy,https://lh3.googleusercontent.com/3_G29zRlpVOA...
3,ajx.com.calltracker,callerid:misc,callerid,Call Tracker,Call Tracker helps you identify numbers real-t...,//lh3.googleusercontent.com/RQi_G-l3CadNwi4PNc...,3.9,71.0,http://www.optcrm.com/privacypolicy.html,Tools,...,Everyone,"[""//lh3.googleusercontent.com/kIVoX7kgHvKld7LP...","[""Thanks"", ""Chutiya App and which people has c...","[""Improved search result""]",https://play.google.com/store/apps/details?id=...,"[""com.truecaller"", ""com.caller.id.location.gps...","[""read your contacts"", ""modify your contacts"",...",,,
4,allcall.location.tracker,callerid:location,callerid,All Call Location Tracker,All Calls Location Tracker shows the location ...,//lh4.ggpht.com/GJHw8FTW0PqpDnLoHDJvg1MIQHZwjV...,4.0,3406.0,,Communication,...,Everyone,"[""//lh3.googleusercontent.com/UZ80Bvz4FwbnfU4I...","[""Very Slow and looks like a spam :-("", ""Too m...",[],https://play.google.com/store/apps/details?id=...,"[""com.caller.id.location.gps.maps.phone.number...","[""read sensitive log data"", ""read your Web boo...",20171214:1228,,


In [7]:
data['discontinued'] = data['discontinued'].notnull().astype('int')
data['privacyPolicy'] = data['privacyPolicy'].notnull().astype('int')

In [8]:
data.columns

Index(['appId', 'true label', 'category', 'title', 'summary', 'icon', 'score',
       'reviews', 'developerWebsite', 'genre', 'description', 'contentRating',
       'screenshots', 'comments', 'recentChanges', 'url', 'similar',
       'permissions', 'discontinued', 'privacyPolicy', 'headerImage'],
      dtype='object')

In [9]:
discontinued = pd.get_dummies(data['discontinued'])
privacy = pd.get_dummies(data['privacyPolicy'])

In [10]:
privacy.columns = ['not_privacy', 'privacy']
discontinued.columns = ['not_discontinued', 'discontinued']

In [11]:
discontinued.head()

Unnamed: 0,not_discontinued,discontinued
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [12]:
data = data.drop(['discontinued', 'privacyPolicy'], axis  = 1)

In [13]:
data = pd.concat([data, privacy, discontinued], axis = 1)

In [14]:
data.columns

Index(['appId', 'true label', 'category', 'title', 'summary', 'icon', 'score',
       'reviews', 'developerWebsite', 'genre', 'description', 'contentRating',
       'screenshots', 'comments', 'recentChanges', 'url', 'similar',
       'permissions', 'headerImage', 'not_privacy', 'privacy',
       'not_discontinued', 'discontinued'],
      dtype='object')

In [15]:
labels = data[['true label', 'category']]

In [16]:
labels.head()

Unnamed: 0,true label,category
0,surveillance:social-media,surveillance
1,none:misc,none
2,control:use-limitation,control
3,callerid:misc,callerid
4,callerid:location,callerid


In [17]:
data = data.drop(['true label', 'category'], axis = 1)

In [18]:
data = data.fillna(0)

In [19]:
data.head()

Unnamed: 0,appId,title,summary,icon,score,reviews,developerWebsite,genre,description,contentRating,...,comments,recentChanges,url,similar,permissions,headerImage,not_privacy,privacy,not_discontinued,discontinued
0,a2ndappwhats.sdkw.com,2nd Account for Whatsapp,2 Whatsapps on Same Device | Same Whatsapps on...,//lh3.googleusercontent.com/nCOacYEXACIuhJWb9J...,4.0,6939.0,http://jh-jewelry.in/sureshkheni.html,Communication,use & Control another WhatsApp with the same d...,Teen,...,"[""Very good applications but not suppose to al...","[""Bug Fixes""]",https://play.google.com/store/apps/details?id=...,"[""com.lbe.parallel.intl"", ""com.app.tiki.multim...","[""read the contents of your USB storage"", ""mod...",0,1,0,0,1
1,air.au.com.metro.DumbWaysToDie2,Dumb Ways to Die 2: The Games,A new set of dumb characters are here to take ...,//lh3.googleusercontent.com/-ckfFIgLg_qkKbVVgv...,4.2,1584158.0,http://dumbwaystodie.com,Casual,There’s a whole new set of dumb characters tha...,Teen,...,"[""It's a good game, and there's variety, but t...","[""AMERICALAND"", ""We\u2019re celebrating the Fo...",https://play.google.com/store/apps/details?id=...,"[""com.popreach.dumbways"", ""com.zynga.looney"", ...","[""find accounts on the device"", ""read the cont...",0,1,0,1,0
2,air.com.applauz.timeoutkids,Time Out - Behaviour Meter,Live Behaviour Meter companion app for &quot;T...,https://lh3.googleusercontent.com/uRLr8icEKEhV...,4.5,0.0,http://www.applauz-media.com/timeout,Parenting,"You already have our app ""Time Out - Time Out ...",Everyone,...,[],0,https://play.google.com/store/apps/details?id=...,"[""com.jeesmon.malayalambible"", ""com.bn.speacki...","[""full network access""]",https://lh3.googleusercontent.com/3_G29zRlpVOA...,0,1,1,0
3,ajx.com.calltracker,Call Tracker,Call Tracker helps you identify numbers real-t...,//lh3.googleusercontent.com/RQi_G-l3CadNwi4PNc...,3.9,71.0,http://www.optcrm.com/privacypolicy.html,Tools,Call Tracker caller ID helps you identify numb...,Everyone,...,"[""Thanks"", ""Chutiya App and which people has c...","[""Improved search result""]",https://play.google.com/store/apps/details?id=...,"[""com.truecaller"", ""com.caller.id.location.gps...","[""read your contacts"", ""modify your contacts"",...",0,1,0,1,0
4,allcall.location.tracker,All Call Location Tracker,All Calls Location Tracker shows the location ...,//lh4.ggpht.com/GJHw8FTW0PqpDnLoHDJvg1MIQHZwjV...,4.0,3406.0,0,Communication,All Calls Location Tracker shows the location ...,Everyone,...,"[""Very Slow and looks like a spam :-("", ""Too m...",[],https://play.google.com/store/apps/details?id=...,"[""com.caller.id.location.gps.maps.phone.number...","[""read sensitive log data"", ""read your Web boo...",0,1,0,0,1


In [20]:
data.columns

Index(['appId', 'title', 'summary', 'icon', 'score', 'reviews',
       'developerWebsite', 'genre', 'description', 'contentRating',
       'screenshots', 'comments', 'recentChanges', 'url', 'similar',
       'permissions', 'headerImage', 'not_privacy', 'privacy',
       'not_discontinued', 'discontinued'],
      dtype='object')

In [21]:
text = []
for el in data.values:
    all_string = el[1] + ' ' + el[2] + ' ' + el[7] + ' ' + el[8]
    if type(el[11]) == type('a'):
        try:
            for comment in json.loads(el[11]):
                all_string += ' ' + comment
        except:
            all_string += ' ' + el[11]
    text.append(all_string.encode('utf-8'))

In [22]:
data = data.drop(['title', 'summary', 'description', 'contentRating', 'comments', 'recentChanges', 'permissions'], axis = 1)

In [23]:
text = pd.DataFrame(text, columns = ['text'])

In [24]:
data = pd.concat([data, text], axis = 1)

In [25]:
data = data.drop(['icon', 'reviews', 'developerWebsite', 'genre', 
                  'screenshots', 'url', 'similar', 'headerImage', 'score', 
                  'not_privacy', 'privacy', 'not_discontinued', 'discontinued'], axis = 1)

In [26]:
data = pd.concat([data, labels], axis = 1)

In [27]:
data.head()

Unnamed: 0,appId,text,true label,category
0,a2ndappwhats.sdkw.com,"b""2nd Account for Whatsapp 2 Whatsapps on Same...",surveillance:social-media,surveillance
1,air.au.com.metro.DumbWaysToDie2,"b""Dumb Ways to Die 2: The Games A new set of d...",none:misc,none
2,air.com.applauz.timeoutkids,b'Time Out - Behaviour Meter Live Behaviour Me...,control:use-limitation,control
3,ajx.com.calltracker,"b""Call Tracker Call Tracker helps you identify...",callerid:misc,callerid
4,allcall.location.tracker,"b""All Call Location Tracker All Calls Location...",callerid:location,callerid


In [28]:
train_data, test_data = train_test_split(data, test_size = .25, 
                                         random_state = 28, 
                                         stratify = list(data['category']))

In [29]:
train_text = train_data['text']
test_text = test_data['text']

In [30]:
stopwords = ["a", "about", "across", "after", "afterwards", "again", "all", 
             "almost", "alone", "along", "already", "also","although","always","am","among", "amongst",
             "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway",
             "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", 
             "becoming", "been", "before", "beforehand", "behind", "being", "beside", "besides", 
             "between", "beyond", "bill", "both", "but", "by", "call", "can", "cannot", "cant", 
             "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
             "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", 
             "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", 
             "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", 
             "forty", "found", "four", "from", "further", "get", "give", "go", "had", "has", 
             "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", 
             "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", 
             "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less",
             "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", 
             "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", 
             "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", 
             "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
             "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed",
             "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", 
             "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system",
             "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", 
             "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those",
             "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards",
             "twelve", "twenty", "two", "un", "until", "upon", "us", "very", "via", "was", "we", "well", 
             "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
             "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", 
             "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", 
             "the"]



In [31]:
tfidf = tfidf(stop_words = stopwords, min_df = 5)

In [32]:
vocabulary = tfidf.fit(train_text.values)

In [33]:
train_text_matrix = vocabulary.transform(train_text.values)

In [34]:
train_text_matrix = train_text_matrix.toarray()

In [35]:
train_text_matrix[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
test_text_matrix = vocabulary.transform(test_text.values)

In [37]:
test_text_matrix = test_text_matrix.toarray()

In [38]:
tfidf_test = pd.DataFrame(test_text_matrix)
tfidf_train = pd.DataFrame(train_text_matrix)

In [39]:
train_data = train_data.reset_index()
test_data = test_data.reset_index()

In [40]:
train_data = pd.concat([train_data, tfidf_train], axis = 1)
test_data = pd.concat([test_data, tfidf_test], axis = 1)

In [41]:
train_data.head()

Unnamed: 0,index,appId,text,true label,category,0,1,2,3,4,...,3379,3380,3381,3382,3383,3384,3385,3386,3387,3388
0,292,com.fmee.fmeeserv,b'GPS Tracker By FollowMee Locate and Track Yo...,surveillance:location,surveillance,0.0,0.0,0.0,0.021414,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,688,com.skibapps.messagespyremover,"b'Message Spy Remover (Anti Spy) Detect, Ident...",defense:anti-surveillance,defense,0.0,0.0,0.0,0.032309,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,appinventor.ai_oscar_rdzmty7.UnlockPhonePlus_copy,"b""Unlock Phone ( Unlock Codes ) Unlock your ce...",tutorial:settings,tutorial,0.0,0.0,0.0,0.036652,0.040718,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,236,com.djit.equalizerplusforandroidfree,"b""Equalizer music player booster The best musi...",none:misc,none,0.0,0.0,0.0,0.021145,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,290,com.flexsolution.fakecallsms,b'Fake Call & SMS Make prank call or SMS and t...,spoof:burner-phone,spoof,0.0,0.0,0.0,0.0,0.05428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
train_labels = train_data['category'].values
test_labels = test_data['category'].values
train_data = train_data.drop(['text', 'true label', 'category', 'index'], axis = 1)
test_data = test_data.drop(['text', 'true label', 'category', 'index'], axis = 1)

In [43]:
train_data.head()

Unnamed: 0,appId,0,1,2,3,4,5,6,7,8,...,3379,3380,3381,3382,3383,3384,3385,3386,3387,3388
0,com.fmee.fmeeserv,0.0,0.0,0.0,0.021414,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,com.skibapps.messagespyremover,0.0,0.0,0.0,0.032309,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,appinventor.ai_oscar_rdzmty7.UnlockPhonePlus_copy,0.0,0.0,0.0,0.036652,0.040718,0.0,0.0,0.0,0.051928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,com.djit.equalizerplusforandroidfree,0.0,0.0,0.0,0.021145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,com.flexsolution.fakecallsms,0.0,0.0,0.0,0.0,0.05428,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
train_data = train_data.drop(['appId'], axis = 1).values
test_data = test_data.drop(['appId'], axis = 1).values

In [45]:
lr = LogisticRegression()
lr.fit(train_data, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
lr.score(test_data, test_labels)

0.7655677655677655

In [47]:
lr.score(train_data, train_labels)

0.8447432762836186

In [48]:
predictions = lr.predict(test_data)

In [49]:
precision(test_labels, predictions, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.6904992947127778

In [50]:
recall(test_labels, predictions, average = 'macro')

0.4399946168367648

In [51]:
f1_score(test_labels, predictions, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.49913830061939296

In [99]:
unique_labels = list(set(test_labels))

In [173]:
rf = SGDClassifier(learning_rate = 'optimal', early_stopping = True, penalty = 'l2', loss = 'log')
rf.fit(train_data, train_labels)
rf.score(test_data, test_labels)

0.8095238095238095

In [174]:
rf_predictions = rf.predict(test_data)

In [175]:
precision(test_labels, rf_predictions, average =None, labels = unique_labels)

  'precision', 'predicted', average, warn_for)


array([0.        , 1.        , 0.83333333, 0.7       , 0.88235294,
       0.        , 0.84313725, 0.7       , 0.64705882, 0.7826087 ])

In [273]:
recall(test_labels, rf_predictions, average = 'macro')

ValueError: Found input variables with inconsistent numbers of samples: [164, 273]

In [177]:
f1_score(test_labels, rf_predictions, average = None, labels = unique_labels)

  'precision', 'predicted', average, warn_for)


array([0.        , 0.57142857, 0.66666667, 0.68292683, 0.90909091,
       0.        , 0.8989547 , 0.66666667, 0.57894737, 0.7826087 ])

In [178]:
cm(test_labels, rf_predictions, labels = unique_labels)

array([[  0,   0,   0,   0,   0,   0,   1,   0,   1,   0],
       [  0,   4,   0,   0,   0,   0,   4,   0,   2,   0],
       [  0,   0,   5,   0,   1,   0,   2,   0,   0,   1],
       [  0,   0,   0,  14,   1,   0,   3,   0,   0,   3],
       [  0,   0,   0,   0,  15,   0,   1,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0,   1,   0,   0,   1],
       [  0,   0,   0,   1,   0,   0, 129,   1,   2,   1],
       [  0,   0,   0,   0,   0,   0,   2,   7,   0,   2],
       [  0,   0,   0,   2,   0,   0,   5,   1,  11,   2],
       [  0,   0,   1,   2,   0,   0,   5,   1,   1,  36]])

In [93]:
unique_labels

['control',
 'callerid',
 'harassment',
 'defense',
 'spoof',
 'evasion',
 'surveillance',
 'tutorial',
 'info-extraction',
 'none']

## Model Selection

In [108]:
data.head()

Unnamed: 0,appId,text,true label,category
0,a2ndappwhats.sdkw.com,"b""2nd Account for Whatsapp 2 Whatsapps on Same...",surveillance:social-media,surveillance
1,air.au.com.metro.DumbWaysToDie2,"b""Dumb Ways to Die 2: The Games A new set of d...",none:misc,none
2,air.com.applauz.timeoutkids,b'Time Out - Behaviour Meter Live Behaviour Me...,control:use-limitation,control
3,ajx.com.calltracker,"b""Call Tracker Call Tracker helps you identify...",callerid:misc,callerid
4,allcall.location.tracker,"b""All Call Location Tracker All Calls Location...",callerid:location,callerid


In [117]:
jaco = tfidf(stop_words = stopwords, min_df = 5)
vocabulary = jaco.fit(data.text.values)
all_labeled_data = vocabulary.transform(data.text.values)
all_labeled_data = all_labeled_data.toarray()

In [119]:
data = pd.concat([data, pd.DataFrame(all_labeled_data)], axis = 1)

In [153]:
workable_data = data.drop(['appId', 'text', 'true label', 'category'], axis = 1)
#workable_labels = label_binarize(data.category.values, classes = list(set(data.category.values)))
workable_labels = data.category.values

In [154]:
workable_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4031,4032,4033,4034,4035,4036,4037,4038,4039,4040
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096136,0.0,0.0,...,0.0,0.0,0.0,0.0,0.104762,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [181]:
lr = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs')
rf = RandomForestClassifier(random_state = 28, n_estimators = 10, max_depth = 30)
sgd = SGDClassifier(learning_rate = 'optimal', early_stopping = True, penalty = 'l2', loss = 'log')
nb = MultinomialNB()
gnb = GaussianNB()
bnb = BernoulliNB()
classifiers = [lr, rf, sgd, nb, gnb, bnb]

In [183]:
for classifier in classifiers:
    print(cv(classifier, workable_data, workable_labels, cv = 5, scoring = ['f1_macro', 
                                                                            'precision_macro', 
                                                                            'recall_macro']))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'fit_time': array([0.90408802, 0.88309503, 1.01952505, 0.89368796, 1.2008779 ]), 'score_time': array([0.01546001, 0.01393008, 0.03205991, 0.01433802, 0.03427386]), 'test_f1_macro': array([0.47668015, 0.5396781 , 0.55115593, 0.53133438, 0.5536105 ]), 'test_precision_macro': array([0.68542159, 0.72510396, 0.67129997, 0.63252963, 0.67551948]), 'test_recall_macro': array([0.41088045, 0.47196286, 0.51064792, 0.48001159, 0.49399827])}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'fit_time': array([0.09039378, 0.10471606, 0.13263202, 0.18938494, 0.17267704]), 'score_time': array([0.01559329, 0.02955008, 0.02418685, 0.02998805, 0.01983094]), 'test_f1_macro': array([0.31685069, 0.4166458 , 0.41318696, 0.43001814, 0.38115066]), 'test_precision_macro': array([0.55851397, 0.62502712, 0.57224603, 0.60744253, 0.54701929]), 'test_recall_macro': array([0.27371285, 0.35775074, 0.36429661, 0.39000595, 0.32715351])}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'fit_time': array([1.01831388, 1.12172127, 0.93912482, 0.77782702, 0.71575308]), 'score_time': array([0.02432394, 0.02788997, 0.01032805, 0.01015925, 0.01089096]), 'test_f1_macro': array([0.57933908, 0.60844719, 0.71203229, 0.64759565, 0.6045433 ]), 'test_precision_macro': array([0.65749472, 0.70566897, 0.77233186, 0.71598806, 0.67892184]), 'test_recall_macro': array([0.53515773, 0.55986716, 0.69218305, 0.61188516, 0.56733922])}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'fit_time': array([0.02024198, 0.02706623, 0.0644412 , 0.0685339 , 0.04116917]), 'score_time': array([0.015769  , 0.01879191, 0.01959395, 0.02164388, 0.02421474]), 'test_f1_macro': array([0.12994505, 0.18397757, 0.15848421, 0.18410698, 0.17753268]), 'test_precision_macro': array([0.14241486, 0.24482234, 0.24959184, 0.35376884, 0.34543551]), 'test_recall_macro': array([0.14594595, 0.18198198, 0.16388889, 0.1753268 , 0.17013889])}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'fit_time': array([0.13381004, 0.14830303, 0.13528395, 0.11153698, 0.08465815]), 'score_time': array([0.21042395, 0.19650507, 0.26577997, 0.13869405, 0.14177704]), 'test_f1_macro': array([0.20940229, 0.28302345, 0.2728355 , 0.33582727, 0.21402205]), 'test_precision_macro': array([0.31154691, 0.4098661 , 0.56167932, 0.61428091, 0.29838509]), 'test_recall_macro': array([0.21173164, 0.27354345, 0.24124949, 0.29736702, 0.20382269])}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'fit_time': array([0.06813383, 0.10046005, 0.06436777, 0.06857228, 0.06237793]), 'score_time': array([0.04230309, 0.05515075, 0.03435802, 0.0576117 , 0.03508997]), 'test_f1_macro': array([0.33787002, 0.47444734, 0.44449339, 0.39353898, 0.41596869]), 'test_precision_macro': array([0.44571522, 0.61684524, 0.59627158, 0.46817405, 0.51749257]), 'test_recall_macro': array([0.31299557, 0.42813815, 0.40277621, 0.35600393, 0.37155623])}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [127]:
import sklearn.metrics

sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

## Hyperparameter Tuning

In [338]:
data = data[['appId', 'text', 'category']]

In [339]:
data.head()

Unnamed: 0,appId,text,category
0,a2ndappwhats.sdkw.com,"b""2nd Account for Whatsapp 2 Whatsapps on Same...",surveillance
1,air.au.com.metro.DumbWaysToDie2,"b""Dumb Ways to Die 2: The Games A new set of d...",none
2,air.com.applauz.timeoutkids,b'Time Out - Behaviour Meter Live Behaviour Me...,control
3,ajx.com.calltracker,"b""Call Tracker Call Tracker helps you identify...",callerid
4,allcall.location.tracker,"b""All Call Location Tracker All Calls Location...",callerid


In [340]:
hpt_data = data[['appId', 'text', 'category']]

In [347]:
hpt_data.head()

Unnamed: 0,appId,text,category
0,a2ndappwhats.sdkw.com,"b""2nd Account for Whatsapp 2 Whatsapps on Same...",surveillance
1,air.au.com.metro.DumbWaysToDie2,"b""Dumb Ways to Die 2: The Games A new set of d...",none
2,air.com.applauz.timeoutkids,b'Time Out - Behaviour Meter Live Behaviour Me...,control
3,ajx.com.calltracker,"b""Call Tracker Call Tracker helps you identify...",callerid
4,allcall.location.tracker,"b""All Call Location Tracker All Calls Location...",callerid


In [348]:
train_data, test_data = train_test_split(hpt_data, test_size = .25, 
                                         random_state = 28, 
                                         stratify = list(hpt_data['category']))

In [349]:
print(len(train_data))
print(len(test_data))

818
273


In [350]:
train_data = train_data.reset_index()
test_data = test_data.reset_index()

In [351]:
train_data.head()

Unnamed: 0,index,appId,text,category
0,292,com.fmee.fmeeserv,b'GPS Tracker By FollowMee Locate and Track Yo...,surveillance
1,688,com.skibapps.messagespyremover,"b'Message Spy Remover (Anti Spy) Detect, Ident...",defense
2,12,appinventor.ai_oscar_rdzmty7.UnlockPhonePlus_copy,"b""Unlock Phone ( Unlock Codes ) Unlock your ce...",tutorial
3,236,com.djit.equalizerplusforandroidfree,"b""Equalizer music player booster The best musi...",none
4,290,com.flexsolution.fakecallsms,b'Fake Call & SMS Make prank call or SMS and t...,spoof


In [352]:
tfidf_init = tfidf(stop_words = stopwords, min_df = 5)
vocabulary = tfidf_init.fit(train_data.text)
train_matrix = vocabulary.transform(train_data.text).toarray()
test_matrix = vocabulary.transform(test_data.text).toarray()

In [353]:
train_data = pd.concat([train_data, pd.DataFrame(train_matrix)], axis = 1)
test_data = pd.concat([test_data, pd.DataFrame(test_matrix)], axis = 1)

In [354]:
train_data.head()

Unnamed: 0,index,appId,text,category,0,1,2,3,4,5,...,3379,3380,3381,3382,3383,3384,3385,3386,3387,3388
0,292,com.fmee.fmeeserv,b'GPS Tracker By FollowMee Locate and Track Yo...,surveillance,0.0,0.0,0.0,0.021414,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,688,com.skibapps.messagespyremover,"b'Message Spy Remover (Anti Spy) Detect, Ident...",defense,0.0,0.0,0.0,0.032309,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,appinventor.ai_oscar_rdzmty7.UnlockPhonePlus_copy,"b""Unlock Phone ( Unlock Codes ) Unlock your ce...",tutorial,0.0,0.0,0.0,0.036652,0.040718,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,236,com.djit.equalizerplusforandroidfree,"b""Equalizer music player booster The best musi...",none,0.0,0.0,0.0,0.021145,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,290,com.flexsolution.fakecallsms,b'Fake Call & SMS Make prank call or SMS and t...,spoof,0.0,0.0,0.0,0.0,0.05428,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [355]:
train_labels = train_data.category.values
test_labels = test_data.category.values

In [356]:
train_id = train_data.appId
test_id = test_data.appId

In [357]:
train_data = train_data.drop(['index', 'appId', 'text', 'category'], axis = 1)
test_data = test_data.drop(['index', 'appId', 'text', 'category'], axis = 1)

In [358]:
params = {
    'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [.000001, .00001, .0001, .0005, .001, .005, .01, .05],
    'l1_ratio': [0, .15, .3, .45, .6, .75, .9, 1],
    'fit_intercept': [True, False],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [.001]
}

In [359]:
len(params['loss'])*len(params['penalty'])*len(params['alpha'])*len(params['l1_ratio'])*len(params['fit_intercept'])*len(params['learning_rate'])

6144

In [360]:
sgd = SGDClassifier()

In [361]:
random_search = rs(sgd, params, n_iter = 30, scoring = 'recall_macro', cv = 3, random_state = 28)

In [362]:
random_search.fit(train_data.values, train_labels)



RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=SGDClassifier(alpha=0.0001, average=False,
                                           class_weight=None,
                                           early_stopping=False, epsilon=0.1,
                                           eta0=0.0, fit_intercept=True,
                                           l1_ratio=0.15,
                                           learning_rate='optimal',
                                           loss='hinge', max_iter=1000,
                                           n_iter_no_change=5, n_jobs=None,
                                           penalty='l2', power_t=0.5,
                                           random_state=None, shuffle=True,
                                           tol=0.001, va...
                                        'eta0': [0.001],
                                        'fit_intercept': [True, False],
                                        'l1_rati

In [363]:
best_sgd = random_search.best_estimator_

In [364]:
best_sgd.fit(train_data, train_labels)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,
              l1_ratio=0.75, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l1',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [366]:
best_sgd.score(test_data, test_labels)

0.7948717948717948

In [368]:
sgd_predictions = best_sgd.predict(test_data)

In [370]:
precision(test_labels, sgd_predictions, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.6588745083517088

In [371]:
recall(test_labels, sgd_predictions, average = 'macro')

0.6346025729624042

In [372]:
f1_score(test_labels, sgd_predictions, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.6400802081860059

In [374]:
cm(test_labels, sgd_predictions, labels = list(set(test_labels)))

array([[  0,   0,   0,   0,   0,   0,   0,   0,   2,   0],
       [  0,   5,   0,   0,   0,   0,   3,   0,   2,   0],
       [  0,   0,   6,   0,   1,   0,   1,   0,   0,   1],
       [  0,   0,   1,  15,   1,   0,   3,   0,   0,   1],
       [  0,   0,   0,   1,  14,   0,   1,   0,   0,   0],
       [  0,   0,   1,   0,   0,   2,   0,   0,   0,   0],
       [  0,   1,   1,   1,   1,   0, 121,   1,   6,   2],
       [  0,   0,   0,   1,   0,   0,   1,   7,   0,   2],
       [  0,   0,   0,   2,   0,   0,   1,   2,  14,   2],
       [  0,   1,   1,   3,   1,   0,   4,   2,   1,  33]])

In [375]:
with open('allMktData.txt', 'r') as f:
    app_data = []
    for line in f:
        app_data.append(line.split('\t'))

In [376]:
app_data[0]

['esp.espana.chat',
 'Chat España: Chatear, ligar y conocer gente',
 'Chat citas españa y conocer gente nueva en la app de citas de chat amor españa es un hobby. Busca chat hot fotos en la mejor pagina para buscar pareja. Novedoso sistema de chat citas españa y buscar pareja gratis. Entra a nuestro chat app y encuentra citas gratis para conocer gente y mirar chat hot fotos en línea. Además, encontrar pareja +40 en este chat citas españa, utilizando frases para ligar del 2018<br><br>¡Es súper fácil acceder en el chat hot app gratis y conocer gente en españa! Descárgala y empieza a ligar mujeres en la sección de chat hot fotos. Es una manera instantánea para buscar pareja gratis y ligar en Madrid. Es un chat ligar españa y encontrar pareja +15 más famoso<br><br>Chat España es una red social que ha crecido y que tiene más de 80k de usuarios actualmente para chat hot fotos. Es una de las &quot;chat citas españa&quot; donde buscar pareja gratis es más fácil con los juegos para buscar novia 