In [5]:
%matplotlib inline

# Basic
import io
import math
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices

# SKLearn
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

# NLTK
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [6]:
# Load the tab-separated SMS corpus; the file has no header row, so the two
# column names are supplied explicitly.
data = pd.read_csv(
    "training-data/spamcollectiondata.tsv",
    sep='\t',
    names=["Category", "Message"],
)

In [7]:
# Preview the first rows to confirm the Category / Message columns parsed as expected.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Lower-case every message once up front, so the stop-word test below can compare
# tokens directly without lowering them a second time.
message_data = [message.lower() for message in data['Message']]
category = data['Category'].tolist()
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# One [stemmed_tokens, label] pair per message. Messages and labels are paired
# with zip rather than a hand-maintained index counter.
# NOTE(review): split(" ") keeps empty strings when a message contains consecutive
# spaces, and those empty tokens survive the stop-word filter. Tokenisation is left
# unchanged here so the feature space stays the same; consider split() instead.
training_set = [
    [
        [stemmer.stem(word) for word in message.split(" ") if word not in stop],
        label,
    ]
    for message, label in zip(message_data, category)
]

train_df = pd.DataFrame(training_set, columns=['Lists', 'Labels'])

# Binary target: 1 for 'ham', 0 for every other label. Built as a vectorised
# comparison so the result is a 1-D integer array instead of an object-dtype
# array patched element by element.
# NOTE(review): an object-dtype label array is, as far as I know, rejected by
# scikit-learn as an "unknown" target type -- presumably why the later split
# passes train_df.Labels instead of y. Confirm before switching the model to y.
y = (train_df['Labels'] == 'ham').astype(int).values

In [23]:
def list_to_dict(words_list):
    """Return a dict that maps each word in ``words_list`` to ``True``.

    Repeated words collapse into a single key, so the result records presence
    only, not how often a word occurred.
    """
    return {word: True for word in words_list}
 
# Re-shape every [tokens, label] pair into a (token -> True dict, label) tuple.
# NOTE(review): the dict keeps each token only once, so any per-message word
# count computed from it downstream can never exceed 1 -- confirm that a
# presence-only representation is what is wanted here.
training_set_formatted = [
    (list_to_dict(tokens), label)
    for tokens, label in training_set
]

def generate_words_vector(training_set):
    """Collect the vocabulary of a training set in order of first appearance.

    Parameters
    ----------
    training_set : iterable of (words, label) pairs
        Only element 0 of each pair is read. It may be any iterable of hashable
        words, e.g. a list of tokens or a dict keyed by token.

    Returns
    -------
    list
        Every distinct word exactly once, ordered by first occurrence.
    """
    words_vector = []
    # Membership is tested against a set: checking the growing list itself made
    # every lookup linear in the vocabulary size.
    seen = set()
    for review in training_set:
        for word in review[0]:
            if word not in seen:
                seen.add(word)
                words_vector.append(word)
    return words_vector

def generate_X_matrix(training_set, words_vector):
    """Build a term-frequency feature matrix with a leading column of ones.

    Parameters
    ----------
    training_set : sequence of (words, label) pairs
        Only element 0 of each pair is read; it is materialised with ``list()``,
        so a list of tokens or a dict keyed by token both work.
    words_vector : list
        Vocabulary; the word at position ``k`` owns column ``k + 1``.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(training_set), len(words_vector) + 1)``. Column 0 is 1 for
        every row; every other entry is the number of times the word occurs in
        the review divided by the review's total number of words.

    Raises
    ------
    ValueError
        If a review contains a word that is not in ``words_vector``.
    """
    no_reviews = len(training_set)
    no_words = len(words_vector)

    # Resolve each word's column once instead of scanning the vocabulary list for
    # every token. setdefault keeps the first position of a repeated vocabulary
    # entry, which is what list.index() returned.
    column_of = {}
    for position, word in enumerate(words_vector):
        column_of.setdefault(word, position + 1)

    X = np.zeros(shape=(no_reviews, no_words + 1))
    X[:, 0] = 1  # constant column
    for ii in range(no_reviews):
        review_words = list(training_set[ii][0])
        total_words_in_review = len(review_words)

        # Tally occurrences in a single pass rather than calling list.count()
        # once per token.
        occurrences = {}
        for word in review_words:
            occurrences[word] = occurrences.get(word, 0) + 1

        for word, count in occurrences.items():
            if word not in column_of:
                raise ValueError("{!r} is not in words_vector".format(word))
            X[ii, column_of[word]] = count / float(total_words_in_review)
    return X

# Vocabulary of the whole corpus, then the matching feature matrix: one row per
# message, a leading column of ones, and one column per vocabulary word.
words_vector = generate_words_vector(training_set_formatted)
# generate_X_matrix already returns an ndarray, so np.array(...) here only copies it.
X = np.array(generate_X_matrix(training_set_formatted, words_vector))

In [24]:
# Inspect the feature matrix (numpy abbreviates large arrays when printing).
print(X)

[[ 1.          0.0625      0.0625     ...,  0.          0.          0.        ]
 [ 1.          0.          0.         ...,  0.          0.          0.        ]
 [ 1.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 1.          0.          0.         ...,  0.16666667  0.16666667  0.        ]
 [ 1.          0.          0.         ...,  0.          0.          0.        ]
 [ 1.          0.          0.         ...,  0.          0.          0.33333333]]


In [25]:
# Inspect the numeric label vector built earlier (1 = ham, 0 = anything else).
print(y)

[1 1 0 ..., 1 1 1]


In [27]:
# What fraction of the messages is ham? y holds 1 for ham and 0 otherwise, so its
# mean is the ham share as a value in [0, 1] (multiply by 100 for a percentage).
print(y.mean())

0.8659368269921034


In [37]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is
# reproducible. The targets are the original string labels in train_df.Labels,
# not the numeric y computed earlier.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at the
# top of the notebook; that module was removed in scikit-learn 0.20 and the
# function now lives in sklearn.model_selection.
X_train, X_test, y_train, y_test = train_test_split(X, train_df.Labels, test_size=0.3, random_state=0)
# NOTE(review): X already carries a constant column of ones, while the model is
# created with default settings (the repr below shows fit_intercept=True), so the
# intercept is effectively modelled twice -- confirm whether that is intended.
model = LogisticRegression()
# Kept as the last expression so the fitted estimator's repr is the cell output.
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
# Predict a class label for every held-out message; the labels come back as the
# same strings the model was fitted on.
predicted = model.predict(X_test)
print(predicted)

['ham' 'ham' 'ham' ..., 'ham' 'ham' 'ham']


In [45]:
# Per-class probability estimates for the held-out messages: one row per message,
# one column per class (column order follows model.classes_).
probs = model.predict_proba(X_test)
print(probs)

[[ 0.86990832  0.13009168]
 [ 0.76532914  0.23467086]
 [ 0.85426455  0.14573545]
 ..., 
 [ 0.87185066  0.12814934]
 [ 0.86848991  0.13151009]
 [ 0.86817002  0.13182998]]


In [40]:
# Summarise the predicted labels: count, number of distinct labels, the most
# frequent label and how often it occurs.
print(pd.DataFrame(predicted).describe())

           0
count   1672
unique     2
top      ham
freq    1669


In [42]:
# Same summary for the true held-out labels, for comparison with the predictions.
print(pd.DataFrame(y_test).describe())

       Labels
count    1672
unique      2
top       ham
freq     1451


In [46]:
# Share of 'ham' among the true test labels vs. among the model's predictions.
# Both are computed from the arrays themselves rather than from counts copied by
# hand out of the describe() outputs above, so the cell stays correct when the
# data, the split or the model changes.
actual_ham_rate = float(np.mean(np.asarray(y_test) == 'ham'))
predicted_ham_rate = float(np.mean(np.asarray(predicted) == 'ham'))
print(actual_ham_rate, predicted_ham_rate)

0.867822966507177 0.9982057416267942
