In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import email as em
import langdetect
from elasticsearch import Elasticsearch, helpers
import kibana
from pprint import pprint
import json
import random
from tqdm.notebook import tqdm
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
from IPython.display import Markdown as md

## Objective
Build a Spam Classifier using Machine Learning and ElasticSearch.

## Data

Consider the trec07_spam set of documents annotated for spam, available “data resources”.
First read and accept agreement at http://plg.uwaterloo.ca/~gvcormac/treccorpus07/. Then download the 255 MB Corpus (trec07p.tgz). The html data is in data/; the labels ("spam" or "ham") are in full/.  

You have to think a bit about storage (recommended ElasticSearch, but not required). Definitely use library to clean the html into plain text before storage. You dont have to do stemming or skipping stopwords (up to you); eliminating some punctuation might be useful.

Cleaning Data is Required: By "unigram" we mean an English word, so as part of reading/processing data there will be a filter step to remove anything that doesnt look like an English word or small number. Some mistake unigrams passing the filter are acceptable, if they look like words (e.x. "artist_", "newyork", "grande") as long as they are not overwhelming the set of valid unigrams. 

You can use any library/script/package for cleaning, or share your cleaning code (but only the cleaning code) with the other students. 

Make sure to have a field “label” with values “yes” or “no” (or "spam"/"ham") for each document.  

Partition the spam data set into TRAIN 80% and TEST 20%. One easy way to do so is to add to each document in ES a field "split" with values either "train" or "test" randomly, following the 80%-20% rule. 

Thus there will be 2 feature matrices, one for training and one for testing (different documents, same exact columns/features). The spam/ham distribution is roughly a third ham and two thirds spam; you should have a similar distribution in both TRAIN and TEST sets.

In [2]:
# Build spam dictionary

spam = {}

with open('/Users/gk/Desktop/Northeastern/MSDS/CS6200_Information_Retrieval/HW7/trec07p/full/index') as f:
    text = f.readlines()
    for line in text:
        line_list = line.split('/')
        spam_ham = line_list[0][:-3]
        e_mail = line_list[2].rstrip('\n')
        
        spam[e_mail] = spam_ham

In [3]:
def get_clean_body(message):
    
    ctype = message.get_content_type()
    cdispo = str(message.get('Content-Disposition'))

    if ctype == 'text/plain' and 'attachment' not in cdispo:
        try:
            body = message.get_payload()
        except:
            body = message.get_payload().decode()

        return ' '.join(body.split())

In [4]:
directory = '/Users/gk/Desktop/Northeastern/MSDS/CS6200_Information_Retrieval/HW7/trec07p/data'
random.seed(43)

def docs_to_index():

        """Program to parse email files and, for each email, provide a format for Elasticsearch 
    indexing of some basic email data"""
    
        for filename in tqdm(os.listdir(directory), desc = "files"):
            file_path = os.path.join(directory, filename)
            with open(file_path, encoding="ISO-8859-1") as f:

                msg = em.message_from_file(f)            

                if langdetect.detect(str(msg)) != 'en': # Skip if not English
                    continue

                subject = msg['Subject']
                msg_from = msg['From']
                msg_to = msg['To']
                spam_ham = spam[filename]

                if msg.is_multipart():
                    for part in msg.walk():
                        clean_body = get_clean_body(part)
                else:
                    clean_body = get_clean_body(msg)

                # label the data for train and test
                split_tuple = ('train','test')
                train_test = random.choices(split_tuple, weights=(80, 20))

                yield {"_index": "sham","_id": filename,"content": clean_body, "msg_from": msg_from, "msg_to": msg_to, 
                       "subject": subject, "spam_ham": spam_ham, "split": train_test[0]}

In [6]:
# for output in docs_to_index():
#     print(output,"\n")

In [6]:
# Connect to Elasticsearch
ES_HOST = {"host": "localhost", "port": 9200}
es = Elasticsearch(hosts=[ES_HOST], verify_certs=True)
if not es.ping():
    raise ValueError("Connection failed")

In [7]:
# Delete existing Elasticsearch index, if exists, and Create an index
# request_body = {
#     "settings" : {
#         "number_of_shards": 1,
#         "number_of_replicas": 1,
#         "analysis": {
#             "filter": {
#                 "english_stop": {
#                     "type": "stop",
#                     "stopwords_path": "my_stoplist.txt"
#                 }
#             },
#             "analyzer": {
#                 "stopped": {
#                     "type": "custom",
#                     "tokenizer": "standard",
#                     "filter": [
#                         "lowercase",
#                         "english_stop",
#                         "porter_stem"
#                     ]
#                 }
#             }
#       }
#     },
#     "mappings": {
#         "properties": {
#             "content": {
#                 "type": "text",
#                 "fielddata": True,
#                 "analyzer": "stopped",
#                 "index_options": "positions"
#             },
#             "msg_from":{
#                 "type": "text"
#             },
#             "msg_to":{
#                 "type": "text"
#             },
#             "subject":{
#                 "type": "text",
#                 "fielddata": True,
#                 "analyzer": "stopped",
#                 "index_options": "positions"
#             },
#             "spam_ham":{
#                 "type": "text"
#             },
#             "split":{
#                 "type": "text"
#             }
#         }
#     }
# }

# if es.indices.exists('sham'):
#     es.indices.delete(index = 'sham')

# es.indices.create(index = 'sham', body = request_body)

In [8]:
# Parse emails, with docs_to_index(), and load them to the index

# helpers.bulk(es, docs_to_index())

In [9]:
email_list = [data['_id'] for data in helpers.scan(client = es, query = {"query": {"match_all": {}}}, index="sham")]

## Part1: Manual Spam Features

### Train a learning algorithm

The label, or outcome, or target are the spam annotation “yes” / “no” or you can replace that with 1/0.

Using the “train” queries static matrix, train a learner to compute a model relating labels to the features on TRAIN set. You can use a learning library like SciPy/NumPy, C4.5, Weka, LibLinear, SVM Light, etc. The easiest models are linear regression and decision trees.

### Test the spam model

Test the model on TEST set. You will have to create a testing data matrix with feature values in the same exact way as you created the training matrix: use ElasticSearch (or as approrpiate for your storage) to query for your features, use the scores are feature values.Remember that features have to be consistent across train and test data.

    Run the model to obtain scores
    Treat the scores as coming from an IR function, and rank the documents
    Display first few “spam” documents and visually inspect them. You should have these ready for demo. IMPORTANT : Since they are likely to be spam, if you display these in a browser, you should turn off javascript execution to protect your computer.

### Train/Test 3 Algorithms

    (1) decision tree-based
    (2) regression-based
    (3) Naive-Bayes


In [10]:
def get_term_freq(term,doc):
    
    body = {"fields" : ["content"],"offsets" : False,"positions" : True,"term_statistics" : True,\
            "field_statistics" : True}

    results = es.termvectors(index = "sham", body = body, id = doc)

    output = results['term_vectors']['content']['terms'][term]

    return output['term_freq']  #, output['doc_freq'], output['ttf']


def get_w_docs(term):
    
    search_body = {"query": {"match":{"content": term}},}
    
    return [item['_id'] for item in helpers.scan(es, index="sham", query=search_body, _source = False)]

In [11]:
def get_email_data(email_id):
    
    search_body = {"query": {"match":{"_id": email_id}},}

    result = helpers.scan(es, index="sham", query=search_body)

    for i in result:
        
        s = i['_source']

        return i['_id'], s['msg_from'], s['msg_to'], s['subject'],  s['content'] 

In [12]:
def get_tf_per_term(term_list):

    doc_data = {}

    for i in tqdm(term_list, desc = "terms"):
        for j in get_w_docs(i):
            try:
                tf = get_term_freq(i,j)
            except:
                tf = 0

            if j in doc_data:
                doc_data[j].append((i,tf))
            else:
                doc_data[j] = [(i,tf)]
    
    return doc_data

In [13]:
def build_feature_matrix(doc_dict, emails):
    
    feature_list = []
    spam_list = []

    for email in tqdm(emails, desc = "emails"):
        if email in doc_dict:
            feat_dict = {item[0] : item[1] for item in doc_dict[email]}
            spam_list.append(spam[email])
            feature_list.append(feat_dict)

    df = pd.DataFrame(feature_list, index = [x for x in emails if x in doc_dict]).fillna(0)
    df = df.assign(spam_ham=spam_list)
    
    return df

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score

def run_trials(x_tr, y_tr, x_tst, y_tst, num, model):

    if model == 'rand_forest':
        result = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=43)

    if model == 'log_reg':
        result = LogisticRegression(solver='lbfgs',class_weight = 'auto', max_iter=200, random_state=43)

    if model == 'naive_b':
        result = BernoulliNB()

    result_cross_val = cross_val_score(result, x_tr, y_tr, scoring='accuracy', cv=5).mean()
    result.fit(x_tr, y_tr)
    result.predict(x_tst)
    predictions = result.predict_proba(x_tst)
    score = result.score(x_tst,y_tst)
    
    rel_pred = [i[1] for i in predictions]
    doc_proba_list  = [(doc, rel_prob) for doc, rel_prob in zip(x_tst.index.tolist(), rel_pred)]
    output = sorted(doc_proba_list, key=lambda tup: tup[1], reverse = True)[:3]

    for i in output:
        email_id, email_from, email_to, email_subject, email_content = get_email_data(i[0])
        
        print(email_id, "\n", "From: ", email_from, "\n","To: ", email_to, "\n", "Subject: ",email_subject,
              "\n", email_content, "\n")
        
    print("Score: ", score)

## Trial A
Manually create a list of ngrams (unigrams, bigrams, trigrams, etc) that you think are related to spam. For example : “free” , “win”, “porn”, “click here”, etc. These will be the features (columns) of the data matrix. 

In [34]:
trial_a_list = ['direct', 'singles', 'babes', 'additional', 'income', 'boss', 'affordable', 'bargain',
               'fast','cash', 'bankruptcy', 'creditors', 'acceptance', 'accordingly', 'avoid', 'baldness', 
                'diagnostics', 'viagra', 'free','amazing', 'access', 'apply']

In [35]:
# doc_data_a = get_tf_per_term(trial_a_list)

In [36]:
# with open('doc_data_a.json', 'w', encoding='utf-8') as f:
#     json.dump(doc_data_a, f, ensure_ascii=False, indent=4)

In [37]:
with open('doc_data_a.json') as json_file:
    doc_data_a = json.load(json_file)

In [38]:
trial_a_df = build_feature_matrix(doc_data_a, email_list)

HBox(children=(IntProgress(value=0, description='emails', max=73588, style=ProgressStyle(description_width='in…




In [39]:
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(trial_a_df.drop('spam_ham',axis = 1),
                                                    trial_a_df['spam_ham'], test_size=0.2, random_state=43)

#### Random Forest

In [40]:
run_trials(X_train_a, y_train_a, X_test_a, y_test_a, 3, 'rand_forest')

inmail.49357 
 From:  "Malisa Longgood" <fulanistillbirth@sebraego.com.br> 
 To:  <theplg@flax9.uwaterloo.ca> 
 Subject:  Re: 
 http://planshow.hk Viagra 4 pills X 100 mg FREE for every order! Special packs: - Soma 30 pills X 350 mg $ 42.08- Viagra 10 pills x 100 mg + Cialis 10 pills x 20 mg $68.72 - Viagra 30 pills X 100 mg $ 88.5 Viagra Our price $1.86 Viagra Soft Tabs Our price $1.08 Cialis Soft Tabs Our price $3.93 Levitra Our price $2.71 Viagra Jelly Our price $2.31 Cialis Our price $2.69 Lipitor Our price $1.38 Ultram Our price $0.71 Soma Our price $0.42 Zocor Our price $0.95 Zoloft Our price $0.92 Prozac Our price $1.99 Herbal Phentermine Our price $0.83 Testosterone Our price $3.67 Celebrex Our price $0.75 http://planshow.hk 

inmail.32012 
 From:  "Francisco Bishop" <gcraig@1-call.com> 
 To:  <catchall@flax9.uwaterloo.ca> 
 Subject:  Re: 
 Glad to see you!Look at the assortment of our new virtual pharmacy store=20= and save upto 85%We have special offers for you: VIAGRA FOR AS

#### Logistic Regression

In [41]:
run_trials(X_train_a, y_train_a, X_test_a, y_test_a, 3, 'log_reg')

inmail.52647 
 From:  "StockUpTicks" <bounce-stockupticks-9828040G@lyris.stockupticks.com> 
 To:  <gnitpick@flax9.uwaterloo.ca> 
 Subject:  Expanding the Brand Naturally 
 [Stockupticks.com] [Image] Welcome to the Stockupticks.com Newsletter! Issue 471 June 06, 2007 [Image] [Image] [Image] [Image] [Image] Clearly Canadian Beverage Corporation [Image] (OTCBB: CCBEF) Dear StockUpTicks Reader: A recent acquisition by a leading beverage company of an organic baby food producer would seem at first glance an odd pairing. This may well have been the perspective of some investors when Clearly Canadian (OTCBB:CCBEF) announced that the company completed its acquisition of My Organic Baby, which has national distribution in Canada for its organic baby food products. But when you consider the market for both products, the health-conscious consumer, it appears that My Organic Baby might be the perfect extension of the Clearly Canadian brand, literally introducing consumers to the company in their i

#### Naive Bayes (Bernoulli)

In [42]:
run_trials(X_train_a, y_train_a, X_test_a, y_test_a, 3, 'naive_b')

inmail.69548 
 From:  K Carpenter <gpov@bluedolphin.com> 
 To:  soundtrackdeficient@flax9.uwaterloo.ca 
 Subject:  Digital cameras and camcorders 
 FDA approved on-line pharmacies.click here Chose your product and site below: Canadian pharmacy - Cialis Soft Tabs - $5.78, Viagra Professional - $4.07, Viagra Soft Tabs - $4.1, Cialis - $5.67, Generic Viagra - $3.5, Soma - $1.38, Human Growth Hormone - $43.37, Meridia - $3.32, Tramadol - $2.17, Levitra - $11.97. HerbalKing - Herbal pills for Penis enlargement. Techniques, procedures, videos and tutorials. Don't waste your money on ineffective and possibly dangerous pumps, exercises and surgeries. Anatrim - Are you ready for Summer? Use Anatrim, the most powerful fat loss blend available anywhere. Safe, fast, effective! 

inmail.19413 
 From:  Zachary Benson <walsh@golf67.net> 
 To:  Captains <captains@flax24.uwaterloo.ca> 
 Subject:  FDA approved on-line pharmacies 
 FDA approved on-line pharmacies.click here Chose your product and site be

## Trial B

Instead of using your unigrams, use the ones from this list; rerun the training and testing.

In [43]:
trial_b_list = ['free','spam','click','buy','clearance','shopper','order','earn','cash','extra','money','double',
               'collect','credit','check','affordable','fast','price','loans','profit','refinance','hidden','freedom',
               'chance','miracle','lose','home','remove','success','virus','malware','ad','subscribe','sales',
               'performance','viagra','valium','medicine','diagnostics','million','join','deal','unsolicited','trial',
               'prize','now','legal','bonus','limited','instant','luxury','legal','celebrity','only','compare','win',
               'viagra','$$$','$discount','click','here','meet','singles','incredible','deal','lose','weight','act',
               'now','100%','free','fast','cash','million','dollars','lower','interest','rate','visit','our',
               'website','no','credit','check']

In [44]:
# doc_data_b = get_tf_per_term(trial_b_list)

In [45]:
# with open('doc_data_b.json', 'w', encoding='utf-8') as f:
#     json.dump(doc_data_b, f, ensure_ascii=False, indent=4)

In [46]:
with open('doc_data_b.json') as json_file:
    doc_data_b = json.load(json_file)

In [47]:
trial_b_df = build_feature_matrix(doc_data_b, email_list)

HBox(children=(IntProgress(value=0, description='emails', max=73588, style=ProgressStyle(description_width='in…




In [48]:
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(trial_b_df.drop('spam_ham',axis = 1),
                                                    trial_b_df['spam_ham'], test_size=0.2, random_state=43)

#### Random Forest

In [49]:
run_trials(X_train_b, y_train_b, X_test_b, y_test_b, 3, 'rand_forest')

inmail.134 
 From:  "leonidas kai-yuen" <chris@ifg.com> 
 To:  <catchall@speedy.uwaterloo.ca> 
 Subject:  Rich Bonus 
 Online Casinos are known for giving generous welcome bonuses to their = players. But such a rich Bonus. You never received! 300% on your first deposit up to 300$!!! A real Royal Bonus!!! Royal VIP Casino offers you the latest generation software, and an = elegant gaming atmosphere. With over 100 Casino Games to choose from and = a support team always available you cannot ask for more. Come and Play at Royal VIP Casino! http://luoarods.com/sp/ 

inmail.64057 
 From:  "kevlyne kouma" <kevkouma@moon.expression.edu> 
 To:  None 
 Subject:  confidenciely  help me 
 FROM: KEVLYNE KOUMA ABIDJAN , cote d'ivoire . email/(kev_kou@yahoo.fr) Dear Friend, I am Mrs. KEVLYNE KOUMA from cote d'ivoire. I am a widow; I lost my husband a couple of months ago. My husband was the director of Cocoa exporting board until his death. He was assassinated last January by the rebels following the

#### Logistic Regression

In [50]:
run_trials(X_train_b, y_train_b, X_test_b, y_test_b, 3, 'log_reg')

inmail.13625 
 From:  "Mr. Clive Buthelezi" <info@imdlotto.com> 
 To:  None 
 Subject:  INVESTMENT READ PLEASE 
 Dear friend I make this appeal with the understanding that any facts or opinion expressed by me are giving in confidence and solely for your personal information and use. I am Mr. Clive Buthelezi, the manager of bill and exchange at the foreign remittance department where I work. In my department I discovered an abandoned sum of amount in few millions in American currency, (US Dollars) with account number 8-32902701722 that belongs to one of our foreign customers who died along with his wife and only daughter in a plane crash of Alaska Airlines Flight number 261 which crashed on 31 January 2000. You shall read more about the crash on visiting this site http://www.cnn.com/2000/US/02/01/alaska.airlines.list/ The owner of this account is Mr. Morris Thompson an American and great in industrialist and a resident of Alaska. It is therefore upon this discovery that I now decided to

#### Naive Bayes (Bernoulli)

In [52]:
run_trials(X_train_b, y_train_b, X_test_b, y_test_b, 3, 'naive_b')

inmail.24368 
 From:  "Catalina Singleton" <c_singleton_uq@superonline.com> 
 To:  adtrevor@plg2.math.uwaterloo.ca, adtrevors@plg2.math.uwaterloo.ca,
   the00@plg2.math.uwaterloo.ca 
 Subject:  Cheap Prices for Meds. Viagra,Cialis,Xanax,Valium,Ambien    7xa5w 
 Dear adtrevor@plg2.math.uwaterloo.ca http://leturri.com Our New offshore pharmacy is open. Fast Worldwide delivery Anonymouse shipping Low prices http://leturri.com Thanks Rebeca Anniston adtrevor@plg2.math.uwaterloo.ca wrote: > Cheap Prices for Meds. Viagra,Cialis,Xanax,Valium,Ambien 1pwurw15l6- 

inmail.1723 
 From:  "Autobytel Newsletter" <Autobytel@enews.autobytel.com> 
 To:  <ktwarwic@speedy.uwaterloo.ca> 
 Subject:  Your April automotive updates from Autobytel 
 Dear warwick@plg.uwaterloo.ca http://dedekkio.com Our New offshore pharmacy is open. Fast Worldwide delivery Anonymouse shipping Low prices http://dedekkio.com Thanks Jennifer Cassidy warwick@plg.uwaterloo.ca wrote: > Cheap Prices for Meds. Viagra,Cialis,Xanax,Vali

## Part 2: All unigrams as features

A feature matrix should contain a column/feature for every unigram extracted from training documents. You will have to use a particular data format described in class (note, toy example), since the full matrix becomes too big. Write the matrix and auxiliary files on disk. 

Given the requirements on data cleaning, you should not have too many unigrams, but still many enough to have to use a sparse representation.

### Extracting all unigrams using Elastic Search calls

This is no diffeernt than part1 in terms of the ES calls, but you'd have to first generate a list with all unigrams.
If you dont use ES, this can be a tricky step, but there are python (poor) or java (better) libraries to extract all unigrams from all docs. Keep in mind that extracting all ngrams (say up to n=5) is a difficult problem at scale.

In [53]:
# part2_doc_data = {}
# part2_spam_data = {}

# body = {"fields" : ["content"],"offsets" : False,"positions" : False,"term_statistics" : True,\
#             "field_statistics" : True}

# for i in tqdm(email_list, desc = "emails"):

#     try:
#         result = es.termvectors(index = "sham", body = body, id = i)


#         term_list = [(key, value['term_freq']) for key,
#                      value in result['term_vectors']['content']['terms'].items() if key.isalpha()]
        
#         if spam[i] == 'spam': # Make spam/ham binary 0/1
#             spam_res = 0
#         else:
#             spam_res = 1
        
#         part2_spam_data[i] = spam_res

#         part2_doc_data[i] = term_list
        
#     except:
#         continue
        

In [54]:
# with open('part2_doc_data.json', 'w', encoding='utf-8') as f:
#     json.dump(part2_doc_data, f, ensure_ascii=False, indent=4)
    
# with open('part2_spam_data.json', 'w', encoding='utf-8') as f:
#     json.dump(part2_spam_data, f, ensure_ascii=False, indent=4)

In [55]:
with open('part2_doc_data.json') as json_file:
    part2_doc_data = json.load(json_file)
    
with open('part2_spam_data.json') as json_file:
    part2_spam_data = json.load(json_file)

### Build a set of all features from ES and a numerical index of features for the Compact Sparse Row matrix

In [56]:
feature_set = set()
feature_index = {}

# get unique features from part2_doc_data (add list items to a set)
for key,value in tqdm(part2_doc_data.items()):
    for item in value:
        feature_set.add(item[0])

# add each feature to a dictionary and provide an index number for it
f_index = 0
for i in feature_set:
    feature_index[i] = f_index
    f_index += 1

HBox(children=(IntProgress(value=0, max=69921), HTML(value='')))




### Build sparse matrix

In [81]:
from scipy.sparse import csr_matrix

data = []
indices = []
indptr = []

prev_idx = 0
total = 0
for key, value in tqdm(part2_doc_data.items()):
    for item in value:
        data.append(item[1])
        indices.append(feature_index[item[0]]) # get numerical index for term
    indptr.append(prev_idx)
    prev_idx += len(value)
    total += len(value)
indptr.append(total) # sum of values
                  
data = np.array(data)
indices = np.array(indices)
indptr = np.array(indptr)

part2_sparse = csr_matrix((data, indices, indptr), shape=(len(part2_doc_data), len(feature_set)))

HBox(children=(IntProgress(value=0, max=69921), HTML(value='')))




### Build "sparse" spam matrix (y values)

In [82]:
spam_list = list(part2_spam_data.values())
spam_array = np.array(spam_list)

### Train / Test Split on CSR data

In [83]:
# Added a "split" field to ES, but forgot to use it
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(part2_sparse, spam_array, test_size=0.2, stratify = None, random_state=43)

### Data Format
In Pyramid, each dataset is stored as a folder with two files in it, feature_matrix.txt and config.txt. 

In [93]:
# feature_matrix.txt
# Label Feature:Value Feature:Value
# 1 0:4 2:8 3:2
# 0 1:3 3:5

# config.txt
# numDataPoints=7077
# numFeatures=49060
# missingValue=false
# numClasses=2

# with open("pyramid/train/feature_matrix.txt", "w+") as train_feat,\
# open("pyramid/test/feature_matrix.txt", "w+") as test_feat, open("pyramid/train/config.txt", "w+") as train_config,\
# open("pyramid/test/config.txt", "w+") as test_config:

#     numDataPoints_train = 0
#     numDataPoints_test = 0
#     numFeatures_train = 0
#     numFeatures_test = 0
    
#     train_test_list = [X_train_2, X_test_2]
    
#     count = 0
#     for i in train_test_list:
        
#         if count == 0:
#             numFeatures_train = i.shape[1]
#         else:
#             numFeatures_test = i.shape[1]
        
#         for j in range(i.shape[0]):
#             sparse_row_mat = i.getrow(j)
#             row_mat = sparse_row_mat.todense()
#             row_values = row_mat[np.nonzero(row_mat)].tolist()[0]

#             row_indicies = sparse_row_mat.indices
#             idx_val_tuples = list(zip(row_indicies, row_values))
#             pyramid_features = [str(j[0]) + ':' + str(j[1]) for j in idx_val_tuples]
#             spam_status = [str(spam_list[j])]
#             pyramid_row = ' '.join(spam_status + pyramid_features)
            
#             if count == 0: # Train
#                 numDataPoints_train += len(row_values)
                
#                 train_feat.write(pyramid_row + "\n")
                
#             else: # Test
#                 numDataPoints_test += len(row_values)
                                
#                 test_feat.write(pyramid_row + "\n")

#         count += 1
        
            
#     train_config_list = ['numDataPoints='+str(numDataPoints_train), "numFeatures="+str(numFeatures_train), 
#                          "missingValue=false", "numClasses=2"]
    
#     for train in train_config_list:
#         train_config.write(train + "\n")
        
#     test_config_list = ['numDataPoints='+str(numDataPoints_test), "numFeatures="+str(numFeatures_test), 
#                          "missingValue=false", "numClasses=2"]
    
#     for test in test_config_list:
#         test_config.write(test + "\n")

### Training and testing
Once the feature matrices are ready (one for training, the second for testing), run either LibLinear Regression (with sparse input)  or a learning algorithm implemented by us to take advantage of the sparse data representations.

In [94]:
rand_forest2 = RandomForestClassifier(n_estimators=10, max_depth=50, min_samples_split=2, random_state=43)

rand_forest2_cross_val = cross_val_score(rand_forest2, X_train_2, y_train_2, scoring='accuracy', cv=5).mean()
rand_forest2.fit(X_train_2, y_train_2)
rand_forest2.predict(X_test_2)
rand_forest2_predictions = rand_forest2.predict_proba(X_test_2)
rand_forest2_score = rand_forest2.score(X_test_2,y_test_2)

In [95]:
rand_forest2_score 

0.856417590275295

### Feature analysis
Identify from the training log/model the top (most important) spam unigrams. Do they match your manual spam features from part 1?

In [111]:
# The two lists from Part 1 combined, made distinct, and snowball stemmed
# from nltk.stem.snowball import SnowballStemmer
# sno = SnowballStemmer('english')

from nltk.stem import PorterStemmer
ps = PorterStemmer()

part1_list = [ps.stem(i) for i in list(set(trial_a_list + trial_b_list))]

#### This is a function to print the feature analysis of the sklearn random forest implementation

In [130]:
def feature_analysis(top_n, part_1_list):

    importances = list(rand_forest2.feature_importances_)
    features = list(feature_set)
    feature_importances = list(zip(features, importances))
    sorted_feature_importances = sorted(feature_importances, key=lambda tup: tup[1], reverse = True )[:top_n]
    
    feature_importantance_list = [i[0] for i in sorted_feature_importances]
        
    count = 0
    crossover_list = []

    for i in part1_list:
        if i in feature_importantance_list:
            crossover_list.append(i)
            count +=1
    
    crossover_percent = str(round((count / len(part1_list)) * 100,1))+'%'
    
    print("Top ", top_n, ", Crossover Percent: ", crossover_percent, crossover_list)  

In [131]:
feature_analysis(20, part1_list)

Top  20 , Crossover Percent:  1.1% ['price']


In [132]:
feature_analysis(100, part1_list)

Top  100 , Crossover Percent:  5.7% ['price', 'fast', 'click', 'unsolicit', 'subscrib']


In [133]:
feature_analysis(1000, part1_list)

Top  1000 , Crossover Percent:  36.8% ['join', 'win', 'free', 'order', 'ad', 'addit', 'celebr', 'appli', 'limit', 'price', 'dollar', 'check', 'home', 'credit', 'medicin', 'visit', 'fast', 'amaz', 'success', 'million', 'collect', 'weight', 'remov', 'singl', 'click', 'websit', 'legal', 'interest', 'chanc', 'unsolicit', 'access', 'subscrib']


In [134]:
feature_analysis(5000, part1_list)

Top  5000 , Crossover Percent:  72.4% ['trial', 'instant', 'accept', 'join', 'win', 'direct', 'lower', 'freedom', 'free', 'order', 'ad', 'lose', 'addit', 'rate', 'celebr', 'malwar', 'perform', 'appli', 'limit', 'extra', 'clearanc', 'act', 'incred', 'price', 'sale', 'refin', 'avoid', 'dollar', 'afford', 'check', 'home', 'credit', 'prize', 'incom', 'medicin', 'visit', 'fast', 'amaz', 'viagra', 'success', 'million', 'collect', 'viru', 'bonu', 'weight', 'luxuri', 'remov', 'singl', 'click', 'compar', 'cash', 'websit', 'meet', 'legal', 'interest', 'chanc', 'hidden', 'earn', 'unsolicit', 'access', 'deal', 'spam', 'subscrib']


In [135]:
feature_analysis(10000, part1_list)

Top  10000 , Crossover Percent:  80.5% ['trial', 'instant', 'accept', 'join', 'win', 'direct', 'lower', 'loan', 'profit', 'freedom', 'shopper', 'free', 'order', 'ad', 'lose', 'addit', 'rate', 'celebr', 'malwar', 'perform', 'appli', 'limit', 'diagnost', 'extra', 'clearanc', 'act', 'incred', 'price', 'sale', 'bald', 'refin', 'avoid', 'dollar', 'afford', 'check', 'home', 'credit', 'prize', 'bargain', 'incom', 'medicin', 'visit', 'fast', 'amaz', 'viagra', 'boss', 'success', 'million', 'collect', 'viru', 'bonu', 'weight', 'luxuri', 'remov', 'singl', 'click', 'compar', 'cash', 'websit', 'meet', 'legal', 'interest', 'chanc', 'hidden', 'earn', 'unsolicit', 'access', 'deal', 'spam', 'subscrib']
