In [5]:
"""
Origem dos dados: https://archive.ics.uci.edu/ml/datasets/Spambase

Attribute Information:

| SPAM E-MAIL DATABASE ATTRIBUTES (in .names format)
|
| 48 continuous real [0,100] attributes of type word_freq_WORD 
| = percentage of words in the e-mail that match WORD,
| i.e. 100 * (number of times the WORD appears in the e-mail) / 
| total number of words in e-mail.  A "word" in this case is any 
| string of alphanumeric characters bounded by non-alphanumeric 
| characters or end-of-string.
|
| 6 continuous real [0,100] attributes of type char_freq_CHAR
| = percentage of characters in the e-mail that match CHAR,
| i.e. 100 * (number of CHAR occurences) / total characters in e-mail
|
| 1 continuous real [1,...] attribute of type capital_run_length_average
| = average length of uninterrupted sequences of capital letters
|
| 1 continuous integer [1,...] attribute of type capital_run_length_longest
| = length of longest uninterrupted sequence of capital letters
|
| 1 continuous integer [1,...] attribute of type capital_run_length_total
| = sum of length of uninterrupted sequences of capital letters
| = total number of capital letters in the e-mail
|
| 1 nominal {0,1} class attribute of type spam
| = denotes whether the e-mail was considered spam (1) or not (0), 
| i.e. unsolicited commercial e-mail.  
|
| For more information, see file 'spambase.DOCUMENTATION' at the
| UCI Machine Learning Repository: http://www.ics.uci.edu/~mlearn/MLRepository.html
"""
0

0

In [4]:
pwd

u'C:\\Users\\Lenovo\\Documents\\BigData\\machine learning algorithms'

## Download the data

In [6]:
# Downloading the datasets:
import requests as re
resp = re.get('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names')
with open('data/spambase.names', 'w') as f:
    f.write(resp.text)
    

data_header = ['word_freq_make',
    'word_freq_address',
    'word_freq_all',
    'word_freq_3d',
    'word_freq_our',
    'word_freq_over',
    'word_freq_remove',
    'word_freq_internet',
    'word_freq_order',
    'word_freq_mail',
    'word_freq_receive',
    'word_freq_will',
    'word_freq_people',
    'word_freq_report',
    'word_freq_addresses',
    'word_freq_free',
    'word_freq_business',
    'word_freq_email',
    'word_freq_you',
    'word_freq_credit',
    'word_freq_your',
    'word_freq_font',
    'word_freq_000',
    'word_freq_money',
    'word_freq_hp',
    'word_freq_hpl',
    'word_freq_george',
    'word_freq_650',
    'word_freq_lab',
    'word_freq_labs',
    'word_freq_telnet',
    'word_freq_857',
    'word_freq_data',
    'word_freq_415',
    'word_freq_85',
    'word_freq_technology',
    'word_freq_1999',
    'word_freq_parts',
    'word_freq_pm',
    'word_freq_direct',
    'word_freq_cs',
    'word_freq_meeting',
    'word_freq_original',
    'word_freq_project',
    'word_freq_re',
    'word_freq_edu',
    'word_freq_table',
    'word_freq_conference',
    'char_freq_;',
    'char_freq_(',
    'char_freq_[',
    'char_freq_!',
    'char_freq_$',
    'char_freq_#',
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
    'spam']
    
resp = re.get('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data')
with open('data/spambase.data', 'w') as f:
    f.write(','.join(data_header) + '\n')
    f.write(resp.text)

In [7]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-notebook')
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from IPython.display import display

## Read the data

In [8]:
df = pd.read_csv('data/spambase.data', sep=',')
df = df.reset_index().rename(columns={'index': 'id'})
df.info()
display(df.head())
display(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 59 columns):
id                            4601 non-null int64
word_freq_make                4601 non-null float64
word_freq_address             4601 non-null float64
word_freq_all                 4601 non-null float64
word_freq_3d                  4601 non-null float64
word_freq_our                 4601 non-null float64
word_freq_over                4601 non-null float64
word_freq_remove              4601 non-null float64
word_freq_internet            4601 non-null float64
word_freq_order               4601 non-null float64
word_freq_mail                4601 non-null float64
word_freq_receive             4601 non-null float64
word_freq_will                4601 non-null float64
word_freq_people              4601 non-null float64
word_freq_report              4601 non-null float64
word_freq_addresses           4601 non-null float64
word_freq_free                4601 non-null float64
word_freq_b

Unnamed: 0,id,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Unnamed: 0,id,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,2300.0,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,1328.338624,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,1150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,2300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,3450.0,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4600.0,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [None]:
# import the scatter_matrix functionality
from pandas.tools.plotting import scatter_matrix

# define colors list, to be used to plot survived either red (=0) or green (=1)
colors=['red','green']

# make a scatter plot
#scatter_matrix(df,figsize=[20,20],marker='o',c=df.spam.apply(lambda x:colors[x]))
df.hist()
plt.show()

## Select train/test dataset

In [11]:
split_ratio = 0.90
test_len = np.int(df.shape[0]*split_ratio)
train_len = df.shape[0] - test_len
df_train = df.sample(test_len)
print 'Training dataset size: %d' % df_train.shape[0]
df_test = df.sample(train_len)
print 'Test dataset size: %d' % df_test.shape[0]

Training dataset size: 4140
Test dataset size: 461


## Summarize data

1.Separate Data By Class

2.Calculate Probabilities for multinomial distribution

In [66]:
def mean(x):
    return np.mean(x/100)

df_summary = df_train.drop('id', axis=1).groupby('spam').agg([mean])
display(df_summary)

Unnamed: 0_level_0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.000724,0.00238,0.001961,1e-05,0.001814,0.000451,9.5e-05,0.000392,0.00039,0.001709,...,0.000479,0.00051,0.001587,0.000234,0.001009,0.000111,0.000233,0.023848,0.182069,1.597437
1,0.001523,0.001649,0.004004,0.001818,0.005094,0.001766,0.002674,0.002097,0.00168,0.003467,...,2e-05,0.000211,0.001104,7.7e-05,0.005202,0.001744,0.000799,0.096317,1.034542,4.616976


## Make Prediction

We are now ready to make predictions using the summaries prepared from our training data. Making predictions involves calculating the probability that a given data instance belongs to each class, then selecting the class with the largest probability as the prediction.

We can divide this part into the following tasks:

1.Calculate Multinomial Probability Mass Function

2.Calculate Class Probabilities

3.Make a Prediction

4.Estimate Accuracy

In [89]:
var_cols = df_summary.stack().columns.tolist()

cols = ['id', 'spam'] + df_test.columns[(df_test.columns.str.contains('word_'))  | (df_test.columns.str.contains('char_'))].tolist()
df_test_sample = df_test[cols]
display(df_test_sample.head())
df_test_sample = pd.melt(df_test_sample, id_vars=['id', 'spam'], value_vars=var_cols)
display(df_test_sample.head())

df_aux = df_summary.stack().reset_index()
df_aux.rename(columns={'level_1':'measures'}, inplace=True)
df_metled_summary = pd.melt(df_aux, id_vars=['spam', 'measures'], value_vars=var_cols)
df_metled_summary = df_metled_summary.pivot_table(values='value', index=['spam', 'variable'], columns=['measures'])
df_metled_summary = df_metled_summary.reset_index()
display(df_metled_summary.head())

df_test_sample = df_test_sample.merge(df_metled_summary, on=['variable'], how='left')
display(df_test_sample.head())


Unnamed: 0,id,spam,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,...,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#
4332,4332,0,0.26,0.0,0.26,0.0,0.52,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038,0.0,0.038
4054,4054,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.07,0.0,0.0,0.0,0.197,0.0,0.0,0.0,0.0
3195,3195,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.58,0.0,0.0,0.0,0.0,0.107,0.0,0.107,0.0,0.0
2598,2598,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.61,0.0,0.0,0.0,0.095,0.38,0.19,0.19,0.0,0.0
1588,1588,1,0.52,0.0,1.05,0.0,0.0,1.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.295,0.698,0.0


Unnamed: 0,id,spam,variable,value
0,4332,0,word_freq_make,0.26
1,4054,0,word_freq_make,0.0
2,3195,0,word_freq_make,0.0
3,2598,0,word_freq_make,0.0
4,1588,1,word_freq_make,0.52


measures,spam,variable,mean
0,0,capital_run_length_average,0.023848
1,0,capital_run_length_longest,0.182069
2,0,capital_run_length_total,1.597437
3,0,char_freq_!,0.001009
4,0,char_freq_#,0.000233


Unnamed: 0,id,spam_x,variable,value,spam_y,mean
0,4332,0,word_freq_make,0.26,0,0.000724
1,4332,0,word_freq_make,0.26,1,0.001523
2,4054,0,word_freq_make,0.0,0,0.000724
3,4054,0,word_freq_make,0.0,1,0.001523
4,3195,0,word_freq_make,0.0,0,0.000724


In [90]:
df_test_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52554 entries, 0 to 52553
Data columns (total 6 columns):
id          52554 non-null int64
spam_x      52554 non-null int64
variable    52554 non-null object
value       49788 non-null float64
spam_y      52554 non-null int64
mean        52554 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 2.8+ MB


In [91]:
spam_prob = {0: df[df.spam == 0].shape[0]/np.float(df.shape[0]),
             1: df[df.spam == 1].shape[0]/np.float(df.shape[0])}
print 'Spam probability: %.2f' % spam_prob[1] 
print 'Not Spam probability: %.2f' % spam_prob[0] 

Spam probability: 0.39
Not Spam probability: 0.61


In [92]:
import pdb

#df_test_sample['odds'] = df_test_sample.groupby(['id'])['log_prob'].transform(sum)
#display(df_test_sample.head())
#df_test_sample['odds'] = df_test_sample['log_prob']/df_test_sample['odds']
#display(df_test_sample.head())

def foo(df):
    #pdb.set_trace()
    df['log_prob'] = np.log10(spam_prob[df.spam_x.max()]) + (df['value']*np.log10(df['mean'])).sum()
    df['prob'] = (spam_prob[df.spam_x.max()])*((df['value']*(df['mean'])).product())
    return df

df_test_sample = df_test_sample.groupby(['id', 'spam_x', 'spam_y']).apply(foo)
display(df_test_sample.head())

Unnamed: 0,id,spam_x,variable,value,spam_y,mean,log_prob,prob
0,4332,0,word_freq_make,0.26,0,0.000724,-10.646883,0.0
1,4332,0,word_freq_make,0.26,1,0.001523,-9.661821,0.0
2,4054,0,word_freq_make,0.0,0,0.000724,-21.399211,0.0
3,4054,0,word_freq_make,0.0,1,0.001523,-27.546315,0.0
4,3195,0,word_freq_make,0.0,0,0.000724,-10.994313,0.0


In [93]:
df_test_sample = df_test_sample.groupby([
        'id', 'spam_x', 'spam_y'], as_index=False)[['log_prob']].max()
display(df_test_sample.head())

###############
#df_test_sample['odds'] = df_test_sample.groupby(['id'])['log_prob'].transform(sum)
#display(df_test_sample.head())
#df_test_sample['odds'] = df_test_sample['log_prob']/df_test_sample['odds']
#display(df_test_sample.head())
###############

idx = (df_test_sample.groupby(['id', 'spam_x'])['log_prob'].transform(max) == df_test_sample['log_prob'])
df_test_sample = df_test_sample[idx]
df_test_sample['right_prediction'] = (df_test_sample.spam_x - df_test_sample.spam_y == 0)
display(df_test_sample.head())

Unnamed: 0,id,spam_x,spam_y,log_prob
0,1,1,0,-31.922324
1,1,1,1,-26.667379
2,9,1,0,-16.73365
3,9,1,1,-14.022204
4,20,1,0,-4.629386


Unnamed: 0,id,spam_x,spam_y,log_prob,right_prediction
1,1,1,1,-26.667379,True
3,9,1,1,-14.022204,True
5,20,1,1,-4.225109,True
7,23,1,1,-16.994399,True
8,24,1,0,-6.348003,False


In [94]:
df_result = df_test_sample.groupby('right_prediction')[['id']].count()
display(df_result)
print 'Success rate: %.1f%%' % ((df_result.ix[True]/df_result.sum()).values[0]*100)

Unnamed: 0_level_0,id
right_prediction,Unnamed: 1_level_1
False,73
True,403


Success rate: 84.7%
