## Setting up 

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim as gs
import slugify as sg
import nltk


Slow version of gensim.models.doc2vec is being used


In [2]:
data = pd.read_table('SMSSpamCollection',header= None, names = ('outcome', 'content'))

### 1. Feature Selection

In [3]:
# lower case it
clean = data["content"] = list(data["content"].str.lower())

# stopwords setup
words_to_remove = set('for a of the and to in or'.split())
# words_to_remove = set(nltk.corpus.stopwords.words('english'))

# this will tokenize
clean = [[word for word in document.split()] for document in clean]

from re import sub # import sub to replace items in the followiong list comprehension
x = [[sub(pattern='[a-z0-9]',repl='', string=w) for w in x] for x in clean] # remove all regular alphanumeric and integers
x = set([item for sublist in x for item in sublist]) # list comp. to flatten
x = set(list(''.join(x))) # first, concatenate all characters, then use list to separate, then set to gain uniques
x = '\\'+'|\\'.join(x)


In [4]:
# remove symbols
# improvement: "raw" by r"... will let you use any characters inside the pattern argument.
clean = [[sub(x,'',word) for word in text] for text in clean]
# remove common words
clean = [[word for word in document if word not in words_to_remove] for document in clean]


In [5]:
# Count Freqs

from collections import defaultdict
frequency = defaultdict(int)
for text in clean:
    for token in text:
        frequency[token] += 1

# get freq > 1
clean = [[x for x in words if frequency[x] > 1] for words in clean]



### 3. Create Dictionary 

In [6]:
dictionary = gs.corpora.Dictionary(clean)
# test
dictionary.doc2bow("'for a of the and to in or".split())



[(6, 1), (27, 1), (30, 1), (74, 1), (143, 1), (148, 1), (346, 1)]

### 4. Create Corpus

In [7]:
corpus = [dictionary.doc2bow(text) for text in clean]

### 5. Create xform

In [8]:
tfidf = gs.models.TfidfModel(corpus) 
corp_tf = tfidf[corpus] 

In [9]:
lsi = gs.models.LsiModel(corp_tf, id2word=dictionary, num_topics = 300)
corp_topics = lsi[corp_tf] 

### create dataframe

In [10]:
# training and test set
X = pd.DataFrame([dict(row) for row in corp_topics]) 
Y = (data["outcome"] == "spam").astype(int) 
from sklearn.lda import LDA
lda = LDA()
mask = np.array([~np.isnan(row).any() for row in X.values])
lda = lda.fit(X[mask], Y[mask])



In [11]:

sum(lda.predict(X[mask]) == Y[mask])/len(Y[mask]) 

0.97770986877584043

In [12]:
new_doc = ["hey dude where are you",
           "text 444 for a promotional treat",
           "dont know what time it is", 
           "Our records indicate your Pension is under performing to see higher growth and up to 25% cash release reply PENSION for a free review. To opt out reply STOP",
          "To start the process please reply YES. To opt out text STOP",
          "i'm going to be 10 mins late"]

new_doc = [sub(x,'',str.lower(line)) for line in new_doc]

query = [[new_word for new_word in document.split()] for document in new_doc]
query_bow = [dictionary.doc2bow(corp) for corp in query]
query_tf = tfidf[query_bow] 
x_2 = pd.DataFrame([dict(tf) for tf in lsi[query_tf]])
lda.predict(x_2)

array([0, 1, 0, 1, 1, 0])

In [13]:
#%qtconsole