<a href="https://colab.research.google.com/github/delicate99/Python_ML/blob/main/BayesClassifier_trainng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Call Training and Test data set

In [1]:
!wget --output-document=SpamData.zip "https://github.com/delicate99/Python_ML/blob/main/SpamData_new.zip?raw=true"

--2021-01-22 01:37:01--  https://github.com/delicate99/Python_ML/blob/main/SpamData_new.zip?raw=true
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/delicate99/Python_ML/raw/main/SpamData_new.zip [following]
--2021-01-22 01:37:01--  https://github.com/delicate99/Python_ML/raw/main/SpamData_new.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/delicate99/Python_ML/main/SpamData_new.zip [following]
--2021-01-22 01:37:01--  https://raw.githubusercontent.com/delicate99/Python_ML/main/SpamData_new.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting resp

In [2]:
import zipfile
from os import walk
from os.path import join
import os.path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [3]:
fname = "SpamData.zip"
if os.path.isfile(fname):
  with zipfile.ZipFile(fname,'r') as zip_file:
    zip_file.extractall('')
  print('Extraction Finished....')

Extraction Finished....


In [4]:
% cd 'SpamData'

/content/SpamData


In [5]:
!ls

01_Processing  02_Training  03_Testing


In [6]:
TRAINING_DATA_FILE='./02_Training/train-data.txt'
TEST_DATA_FILE='./02_Training/test-data.txt'

VOCAB_SIZE =2500

## Lead and load features from .txt files to Numpy array 

In [7]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter =' ', dtype =int)

In [8]:
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter =' ', dtype =int)

In [9]:
print('Nr of rows in the train file is ', sparse_train_data.shape[0])
print('Nr of rows in the test file is ', sparse_test_data.shape[0])

Nr of rows in the train file is  265427
Nr of rows in the test file is  110522


In [10]:
print('Nr of emails in the train file is ', np.unique(sparse_train_data[:, 0]).size)
print('Nr of  emails in the test file is ', np.unique(sparse_test_data[:, 0]).size) 

Nr of emails in the train file is  4015
Nr of  emails in the test file is  1724


### How to create empty Dataframe

In [11]:
column_names =['DOC_ID']+['CATEGORY']+list(range(0, VOCAB_SIZE))
column_names[: 5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [12]:
len(column_names)

2502

In [13]:
index_names =np.unique(sparse_train_data[:,0])
index_names

array([   0,    1,    2, ..., 5791, 5794, 5795])

In [14]:
full_train_data = pd.DataFrame(index =index_names, columns = column_names)
full_train_data.fillna(value =0, inplace =True)

In [15]:
full_train_data.shape

(4015, 2502)

### Create a full matrix from sparse matrix

In [16]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx =3):
    """
    Form a full  matrix from  Sparse matrix. Retun a pandas DataFrame
    key arguments
    sprase_matrix --- numpy array
    nr_words - size of vocabulary. Total numbers of tokens
    doc_idx  -position of the document in the sparse matrix (default :1st column)
    word_idx - position of the word in the sparse matrix (default : 2nd column)
    cat_idx  - position of the label in the sparse matrix (default ; 3rd column)
    freq_idx - position of the asccurence of words(default:4th column)
    """

    column_names =['DOC_ID']+['CATEGORY']+list(range(0, VOCAB_SIZE))
    doc_id_names =np.unique(sparse_train_data[:,0])
    full_matrix = pd.DataFrame(index =index_names, columns = column_names)
    full_matrix.fillna(value =0, inplace =True)


    for i in range(sparse_matrix.shape[0]):
        doc_nr= sparse_matrix [i][doc_idx]
        word_id = sparse_matrix [i][word_idx]
        label = sparse_matrix [i][cat_idx]
        occurence = sparse_matrix [i][freq_idx]

        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id]= occurence

    full_matrix.set_index('DOC_ID', inplace= True)
    return full_matrix


In [17]:
%time
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs


## Training the Naive Bayes Model

### Calculating the Pobability of sapm

In [18]:
prob_spam = len(full_train_data[full_train_data.CATEGORY==1])/len(full_train_data['CATEGORY'])
prob_spam

0.31133250311332505

In [19]:
#prob_spam= full_train_data.CATEGORY.sum()/full_train_data.CATEGORY.size


### Total number of words/Count Tokens

In [20]:
full_train_features = full_train_data.loc[:, full_train_data.columns !='CATEGORY']


In [21]:
email_lengths = full_train_features.sum(axis=1)
email_lengths.shape

(4015,)

In [22]:
total_wc = email_lengths.sum()
total_wc

445891

## Number of tokens in spam & ham mails

In [23]:
spam_lengths = email_lengths[full_train_data.CATEGORY == 1]
spam_lengths

DOC_ID
0        50
1        76
2        87
3        76
4       136
       ... 
1885    135
1887     33
1889     65
1890     75
1895     22
Length: 1250, dtype: int64

In [24]:
spam_wc = spam_lengths.sum()
spam_wc

195645

In [25]:
ham_lengths = email_lengths[full_train_data.CATEGORY == 0]
ham_wc = ham_lengths.sum()
ham_wc


250246

In [26]:
nonspam_wc = ham_lengths.sum()
nonspam_wc

250246

In [27]:
spam_wc + nonspam_wc - total_wc

0

In [28]:
print('Average of Nr of in spam mail {:.0f}'.format(spam_wc/spam_lengths.shape[0]))
print('Average of Nr of in nonspam mail {:.3f}'.format(nonspam_wc/ham_lengths.shape[0]))

Average of Nr of in spam mail 157
Average of Nr of in nonspam mail 90.505


## Summing the Tokens coouring in Spam and Ham

In [29]:
full_train_features.shape

(4015, 2500)

In [30]:
train_spam_tokens = full_train_features[full_train_data.CATEGORY==1]
train_spam_tokens.shape

(1250, 2500)

In [31]:
summed_spam_tokens =train_spam_tokens.sum(axis=0)+1
summed_spam_tokens.shape

(2500,)

In [32]:
train_ham_tokens = full_train_features[full_train_data.CATEGORY==0]

In [33]:
summed_ham_tokens = train_ham_tokens.sum(axis=0)+1
summed_ham_tokens.shape

(2500,)

###P(Token|Spam) : Conditional Prob.





In [34]:
prob_tokens_spam = summed_spam_tokens / (spam_wc + VOCAB_SIZE)
prob_tokens_spam

0       0.009049
1       0.004885
2       0.006838
3       0.010593
4       0.006803
          ...   
2495    0.000096
2496    0.000010
2497    0.000116
2498    0.000050
2499    0.000020
Length: 2500, dtype: float64

###P(Token|Ham) : Conditional Prob.

In [35]:
prob_tokens_ham = summed_ham_tokens / (nonspam_wc + VOCAB_SIZE)
prob_tokens_ham

0       0.020863
1       0.009887
2       0.008063
3       0.003735
4       0.006311
          ...   
2495    0.000051
2496    0.000075
2497    0.000004
2498    0.000063
2499    0.000138
Length: 2500, dtype: float64

### P(Token) - probability that token accurs

In [36]:
#prob_tokens_all = (summed_spam_tokens +summed_ham_tokens)/(nonspam_wc + VOCAB_SIZE)


In [37]:
prob_tokens_all = full_train_features.sum(axis=0)/(nonspam_wc + VOCAB_SIZE)
prob_tokens_all

0       0.027949
1       0.013709
2       0.013417
3       0.012032
4       0.011636
          ...   
2495    0.000119
2496    0.000075
2497    0.000087
2498    0.000095
2499    0.000146
Length: 2500, dtype: float64

In [38]:
! ls

01_Processing  02_Training  03_Testing


In [49]:
# path 
TOKEN_SPAM_BROB_FILE = './03_Testing/prob-spam.txt'
TOKEN_HAM_BROB_FILE = './03_Testing/prob-ham.txt'
TOKEN_ALL_BROB_FILE = './03_Testing/prob-all-tokens.txt'

TEST_FEATURE_MATRIX = './03_Testing/test-features.txt'
TEST_TARGET_FILE = './03_Testing/test-target.txt'

In [40]:
np.savetxt(TOKEN_SPAM_BROB_FILE, prob_tokens_spam)
np.savetxt(TOKEN_HAM_BROB_FILE, prob_tokens_ham)
np.savetxt(TOKEN_ALL_BROB_FILE, prob_tokens_all)

## Prepare test Data

In [42]:
sparse_test_data.shape

(110522, 4)

In [43]:
%%time
full_test_data = make_full_matrix(sparse_test_data, VOCAB_SIZE)

CPU times: user 1min 2s, sys: 1.28 s, total: 1min 3s
Wall time: 1min 3s


In [46]:
X_test= full_test_data.loc[:, full_test_data.columns !='CATEGORY']
y_test =full_test_data.CATEGORY

In [50]:
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX, X_test)