#### 1. Importing Libraries and Modules

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim as gs
#import slugify as sg
import nltk
from re import sub # import sub to replace items in the followiong list comprehension
from collections import defaultdict
from sklearn.lda import LDA
import matplotlib.pyplot as plt
import re
from scipy.stats import ttest_ind


Slow version of gensim.models.doc2vec is being used


#### 2. Read Data, Split into "Train", "Test", "Validate"

In [2]:
data = pd.read_table('SMSSpamCollection',header= None, names = ('outcome', 'content'))

In [3]:
n = 3000
train = data.sample(n)
test = data[~data.index.isin(train.index)]
validate = test.sample(1000)
test= test[~test.index.isin(validate.index)]
split_correctly = 0 == sum(validate.index.isin(test.index)) + sum(test.index.isin(train.index)) + sum(validate.index.isin(train.index))
set_n_sizes = 'N\'s in .. train:', train.shape,'test:', test.shape,'validate:', validate.shape

print(
    'Data Split Correct?',
    split_correctly,
    '\n'*2,
    set_n_sizes)

Data Split Correct? True 

 ("N's in .. train:", (3000, 2), 'test:', (1572, 2), 'validate:', (1000, 2))


#### 3. Data Preprocessing

In [4]:
def prep_nlp(data_to_prep, stop_words_in):
    # lower case it
    clean = list(data_to_prep.str.lower())
    # this will tokenize
    clean = [[word for word in document.split()] for document in clean]
    #stopwords_set1 = set(nltk.corpus.stopwords.words('english'))
    words_to_remove = '|'.join(stopwords_set3)
    symbol_remover = '[^A-Za-z0-9]+'
    clean = [[sub(symbol_remover,'',word) for word in text] for text in clean]
    clean = [[sub(words_to_remove,'',word) for word in text] for text in clean]
    return clean

In [5]:
stopwords_set2 = set('for a of the and to in or'.split())
stopwords_set3 = ''
train_prepped = prep_nlp(data_to_prep = train.content, stop_words_in= stopwords_set2)

#### Data Exploration /  Feature Selection
[Contents Located Here](https://github.com/chrisgian/SB8-Statistical-Inference/blob/master/8.6%20-%20inference%20on%20capstone/8.6%20Apply%20inferential%20statistics%20to%20Capstone%20Project.ipynb) 

#### 4. Build unsupervised Model

In [6]:
def build_model(train_data, topic_n):
    frequency = defaultdict(int)
    for text in train_data:
        for token in text:
            frequency[token] += 1
    # get freq > 1
    word_freq_1plus = [[x for x in words if frequency[x] > 1] for words in train_data]
    # Create dictionary
    dictionary = gs.corpora.Dictionary(word_freq_1plus)
    # Create Corpus
    corpus = [dictionary.doc2bow(text) for text in train_data]
    # corpus to tfidf
    tfidf = gs.models.TfidfModel(corpus) 
    corp_tf = tfidf[corpus] 
    # Unsupervised Component. Reduce space into 300 topics. 
    topic_n = topic_n
    lsi = gs.models.LsiModel(corp_tf, id2word=dictionary, num_topics = topic_n)
    corp_topics = lsi[corp_tf] 
    return corp_topics, dictionary, tfidf, lsi    

In [7]:
built_model = build_model(train_data = train_prepped, topic_n = 300)

#### 5. Build supervised Model

In [8]:
def train_model(topic_vec):
    x = pd.DataFrame([dict(row) for row in topic_vec[0]])
    y = (train["outcome"] == "spam").astype(int) 
    lda = LDA()
    mask = np.array([~np.isnan(row).any() for row in x.values])
    x_masked = x[mask]
    y_masked = y[mask]
    lda = lda.fit(x_masked,y_masked)
    return lda,x_masked,y_masked, topic_vec[1],topic_vec[2], topic_vec[3]

In [9]:
trained_model = train_model(topic_vec  = built_model)
sum(trained_model[0].predict(trained_model[1])==trained_model[2])/len(trained_model[2]) 

0.98329435349148009

#### 6. Test Model on Unseen Data

In [10]:
def predict_unseen(new_doc_in, stop_words_in, trained_model_in):
    
    dictionary_in = trained_model_in[3]
    tfidf_in = trained_model_in[4]
    lsi_in = trained_model_in[5]
    lda_in = trained_model_in[0]
    new_doc_in_content = pd.Series(new_doc_in.content)
    new_doc_in_outcome = pd.Series(new_doc_in.outcome)
    
    query = prep_nlp(new_doc_in_content, stop_words_in= stop_words_in)
    query_bow = [dictionary_in.doc2bow(corp) for corp in query]
    query_tf = tfidf_in[query_bow] 
    
    x_2 = pd.DataFrame([dict(tf) for tf in lsi_in[query_tf]])
    mask = np.array([~np.isnan(row).any() for row in x_2.values])
    x_2masked = x_2[mask]
    y_2 = (new_doc_in_outcome == "spam").astype(int) 
    
    y_2masked = np.array(y_2[mask])
    x_2masked = lda_in.predict(x_2masked)
    
    return x_2masked,y_2masked


In [11]:
new_doc = ["hey dude where are you",
           "text 444 for a promotional treat",
           "dont know what time it is", 
           "Our records indicate your Pension is under performing to see higher growth and up to 25% cash release reply PENSION for a free review. To opt out reply STOP",
          "To start the process please reply YES. To opt out text STOP",
          "i'm going to be 10 mins late"]
new_doc_results = ['ham','spam','ham','spam','spam','ham']
external_data = pd.DataFrame({'content':new_doc, 'outcome':new_doc_results})

predicted_test = predict_unseen(new_doc_in=test, stop_words_in = stopwords_set2, trained_model_in = trained_model)
predicted_external = predict_unseen(new_doc_in=external_data, stop_words_in = stopwords_set2, trained_model_in = trained_model)

##### 7.Performance
There are three performance metrics:
1. "Accuracy" which tells us, what percent of predicted results equal the actual results
2. "Precision": Of all all observations we predicted as spam, what is actually spam?
3. "Recall": Of all observations actually spam, what percent did we predict?

In [12]:
def performance(result_x, result_y):
    
    actual_positive = result_y == 1
    actual_negative = result_y ==0
    
    true_positives = result_x[actual_positive] == 1
    false_positives = result_x[actual_negative] == 1
    true_negatives = result_x[actual_negative] == 0
    false_negatives = result_x[actual_positive] == 0
    
    #A. Accuracy = (TP + TN)/(TP + TN + FP + FN)
    #B. Precision = TP/(TP + FP)
    #C. Recall = TP/(TP + FN)
    accuracy = sum((result_x == result_y))/len(result_y)
    precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
    recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
    return [accuracy, precision, recall, len(result_x)]

In [13]:
performance_on_train = performance(result_x=trained_model[0].predict(trained_model[1]),result_y=trained_model[2])
performance_on_test = performance(result_x=predicted_test[0],result_y=predicted_test[1])
performance_on_external = performance(result_x=predicted_external[0],result_y=predicted_external[1])



In [14]:
results_out = pd.DataFrame({
    'Train':performance_on_train,
    'Test':performance_on_test,
    'external':performance_on_external

}).set_index(
    [['Accuracy','Precision','Recall','N Size'],
     ['% Spam / Ham Correct','% Predicted Spam Actually Spam','% Spam Detected','']])
results_out

Unnamed: 0,Unnamed: 1,Test,Train,external
Accuracy,% Spam / Ham Correct,0.975734,0.983294,1.0
Precision,% Predicted Spam Actually Spam,0.962617,0.873705,1.0
Recall,% Spam Detected,0.872881,0.873705,1.0
N Size,,1566.0,2993.0,6.0


#### 8. Confidence Interval for estimates

Question: What kind of accuracy, precision, and recall do we expect to see in future samples given that our text messages are acquired and do not differ from the 5000 text messages utilized here? 

Approach: Using resampling, resample 100 text messages 10,000 times. Use these to build a sampling distribution for Accuracy, Precision, and Recall that we will see with 90% liklihood in future cases. 



The two functions below:
    a. Samples 100 text messages and sorts them into the TFIDF, then predicts class membership
    b. Iterates the first point 10,000 in order to plot, 

In [15]:
def sample_predictions(sample_n):    
    predict_i = predict_unseen(new_doc_in=test.sample(sample_n), stop_words_in = stopwords_set2, trained_model_in = trained_model)
    return performance(result_x=predict_i[0],result_y=predict_i[1]) 

    
def sample_predict_distributions(sample_n, iters):
    stats = [sample_predictions(sample_n) for i in range(iters)]
    return stats

sample_predictions_iterated = sample_predict_distributions(100,10000)

In [16]:
sampled_results_df = pd.DataFrame(sample_predictions_iterated)
sampled_results_df.columns = ['Accuracy','Precision','Recall','N-Size']
sampled_results_df = sampled_results_df

def ci_generate(measure):
    sampled_results_df[measure].plot(kind='hist')
    low,high, estimate = sampled_results_df[measure].quantile(.05),sampled_results_df[measure].quantile(.95),sampled_results_df[measure].mean()
    low_line = plt.plot([low, low], [0, 2500], 'k-', lw=1)
    high_line = plt.plot([high, high], [0, 2500], 'k-', lw=1)
    estimate_line = plt.plot([estimate,estimate],[0,2500],'k-',lw=1)
    return low_line, high_line, print('the 90% CI for ',measure,':',estimate,' is between ',low, ' and ', high,'. Where the measurement error is ',round(estimate-low,3))
    

##### Results
For the three metrics
- Accuracy has a range of .97 to 1
- Recall has a range of .66  to 1
- Precision has a range of .83 to 1

In [17]:
_ = ci_generate('Accuracy')

the 90% CI for  Accuracy : 0.9758883057557632  is between  0.95  and  1.0 . Where the measurement error is  0.026


In [18]:
_ = ci_generate('Recall')

the 90% CI for  Recall : 0.8739089633673349  is between  0.7272727272727273  and  1.0 . Where the measurement error is  0.147


In [19]:
_ = ci_generate('Precision')

the 90% CI for  Precision : 0.9627479321803593  is between  0.8636363636363636  and  1.0 . Where the measurement error is  0.099
