In [33]:
import numpy as np
import pandas as pd
import gensim
from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from gensim.utils import simple_preprocess
import spacy
from gensim.models import CoherenceModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nissani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('/Users/nissani/Desktop/IPV/concatenated_text.csv')

In [3]:
data = data.drop(['Unnamed: 0'], axis = 1)

In [4]:
data.head()

Unnamed: 0,appId,text,true label,category
0,a2ndappwhats.sdkw.com,"b""2nd Account for Whatsapp 2 Whatsapps on Same...",surveillance:social-media,surveillance
1,air.au.com.metro.DumbWaysToDie2,"b""Dumb Ways to Die 2: The Games A new set of d...",none:misc,none
2,air.com.applauz.timeoutkids,b'Time Out - Behaviour Meter Live Behaviour Me...,control:use-limitation,control
3,ajx.com.calltracker,"b""Call Tracker Call Tracker helps you identify...",callerid:misc,callerid
4,allcall.location.tracker,"b""All Call Location Tracker All Calls Location...",callerid:location,callerid


In [5]:
stop_words = stopwords.words('english')
stop_words.append('phone')
stop_words.append('device')
stop_words.append('app')
stop_words.append('application')

In [6]:
text_data = list(data['text'])

In [7]:
text_data = [re.sub('\S*@\S*\s?', '', sent) for sent in text_data]

In [8]:
text_data = [re.sub('\s+', ' ', sent) for sent in text_data]

In [9]:
text_data = [re.sub("\'", "", sent) for sent in text_data]

In [10]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [11]:
text_data_list = sent_to_words(text_data)

In [12]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
no_stopwords_data = remove_stopwords(text_data_list)

In [14]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [15]:
lemmatized_data = lemmatization(no_stopwords_data)

In [16]:
id2word = corpora.Dictionary(lemmatized_data)

In [17]:
texts = lemmatized_data

In [18]:
corpus = [id2word.doc2bow(text) for text in texts]

In [29]:
lda_model = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=10, 
                   random_state=100,
                   update_every=1,
                   chunksize=100,
                   passes=10,
                   alpha='auto',
                   per_word_topics=True)

In [30]:
lda_model

<gensim.models.ldamodel.LdaModel at 0x123efbcd0>

In [31]:
lda_model.print_topics()

[(0,
  '0.056*"video" + 0.045*"hide" + 0.041*"lock" + 0.037*"camera" + 0.036*"photo" + 0.021*"screen" + 0.018*"password" + 0.017*"private" + 0.016*"unlock" + 0.016*"privacy"'),
 (1,
  '0.018*"use" + 0.015*"file" + 0.015*"work" + 0.008*"delete" + 0.008*"feature" + 0.007*"support" + 0.007*"android" + 0.007*"find" + 0.007*"download" + 0.007*"great"'),
 (2,
  '0.018*"work" + 0.016*"use" + 0.016*"time" + 0.014*"get" + 0.014*"friend" + 0.014*"whatsapp" + 0.013*"make" + 0.011*"see" + 0.011*"free" + 0.010*"account"'),
 (3,
  '0.018*"use" + 0.011*"tool" + 0.011*"control" + 0.010*"datum" + 0.010*"information" + 0.009*"spy" + 0.009*"monitor" + 0.009*"system" + 0.008*"app" + 0.008*"access"'),
 (4,
  '0.207*"call" + 0.062*"number" + 0.062*"record" + 0.027*"caller" + 0.026*"would" + 0.022*"recorder" + 0.021*"incoming" + 0.019*"recording" + 0.014*"contact" + 0.013*"free"'),
 (5,
  '0.122*"message" + 0.060*"text" + 0.059*"send" + 0.029*"number" + 0.029*"email" + 0.027*"receive" + 0.024*"sms" + 0.023*"

In [40]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized_data, dictionary=id2word, coherence='c_v')

In [41]:
coherence_model_lda.get_coherence()

0.42774197388184343