In [10]:
import re
from collections import Counter

In [11]:
# CONSTANTS
# Path of the file from which the word feature names are extracted.
NAMES_FILE = "spambase.names"
# Regex patterns, one per counted character. Characters that are regex
# metacharacters carry a leading backslash; raw strings are used so that the
# backslash is not treated as an (invalid) string escape sequence.
# The last character of each pattern is used to build the 'char_freq_<c>' key.
CHARS_FOR_REGEX = [';', r'\(', r'\[', '!', r'\$', '#']

In [12]:
# collects the word feature names declared in the 'names' file. takes a file address, returns a list
def feature_name_extractor(file):
    """Return every word captured by 'word_freq_<word>:' in the given file.

    The file is scanned line by line; matches are returned in the order they
    appear, and a line may contribute more than one name. Only lowercase
    letters and digits are accepted inside a name.
    """
    name_pattern = re.compile('word_freq_([a-z0-9]+):')
    with open(file, 'r') as handle:
        return [found for row in handle for found in name_pattern.findall(row)]
    
# Word feature names read once from NAMES_FILE; this list is passed to extract_fature for every mail.
feature_names = feature_name_extractor(NAMES_FILE)

In [13]:
# extract word, character and capital-run features from the file. takes in a file address, list of feature names and list of chars. returns dict
def extract_fature(file, f_names, chars):
    """Compute word, character and capital-run features for one text file.

    Parameters
    ----------
    file : path of the text file to analyse.
    f_names : iterable of words whose frequency is measured. The text is
        lowercased before it is split into words, so names should be lowercase.
    chars : iterable of regex patterns, one per counted character. The last
        character of each pattern names the key ('char_freq_' + pattern[-1]).

    Returns
    -------
    dict, in insertion order:
      * one entry per name in f_names: 100 * occurrences / total words
      * one 'char_freq_<c>' entry per pattern: 100 * matches / total characters
      * 'capital_run_length_average', 'capital_run_length_longest' and
        'capital_run_length_total', computed over runs of A-Z letters.

    A summary line with the word and character totals is printed.
    When the file has no words, no counted characters or no capital letters,
    the affected features are 0 instead of raising an exception.
    """
    with open(file, 'r') as f:
        whole_txt = f.read()

    words = re.findall(r'\w+', whole_txt.lower())
    n_words = len(words)                            # number of all words
    # number of all characters except for the space character
    # (newlines, tabs and other whitespace ARE counted)
    n_chars = len(re.findall('[^ ]', whole_txt))
    print(f"Total number of: words={n_words}    characters={n_chars}")

    # Count every word once instead of rescanning the word list per feature.
    word_counts = Counter(words)
    feature_freq = {}
    for name in f_names:
        feature_freq[name] = (word_counts[name] / n_words) * 100 if n_words else 0.0

    for char in chars:
        count = len(re.findall(char, whole_txt))
        # char[-1] drops the backslash that some patterns need for regex
        feature_freq['char_freq_' + char[-1]] = (count / n_chars) * 100 if n_chars else 0.0

    run_lengths = [len(run) for run in re.findall('[A-Z]+', whole_txt)]
    total_run_length = sum(run_lengths)
    feature_freq['capital_run_length_average'] = (
        total_run_length / len(run_lengths) if run_lengths else 0.0
    )
    feature_freq['capital_run_length_longest'] = max(run_lengths, default=0)
    feature_freq['capital_run_length_total'] = total_run_length

    return feature_freq

# prints a dictionary as tab-separated 'key=value' cells, a custom number of cells per row, floats rounded to 3 decimals
def print_n_by_n(my_dict, n):
    """Print the entries of my_dict as 'key=value' cells, n cells per row.

    Each cell is followed by a tab and values are formatted with three
    decimals. A line break is emitted after every n-th cell only, so a
    trailing, incomplete row is not terminated by a newline.
    """
    last_in_row = n - 1
    for index, (label, value) in enumerate(my_dict.items()):
        cell = f'{label}={value:.3f}'
        print(cell, end='\t')
        if index % n == last_in_row:
            print()


In [14]:
# First mail: build the feature dict for 'spam_or_no_spam.txt' and print it six entries per row
first_mail = extract_fature('spam_or_no_spam.txt', feature_names, CHARS_FOR_REGEX)
print_n_by_n(first_mail, 6)

Total number of: words=223    characters=1362
make=0.000	address=0.000	all=0.000	3d=0.000	our=0.000	over=0.000	
remove=0.000	internet=0.000	order=0.000	mail=0.000	receive=0.000	will=0.897	
people=0.000	report=0.000	addresses=0.000	free=0.000	business=0.000	email=0.000	
you=0.448	credit=0.000	your=0.448	font=0.000	000=0.000	money=0.000	
hp=0.000	hpl=0.000	george=0.000	650=0.000	lab=0.000	labs=0.000	
telnet=0.000	857=0.000	data=0.000	415=0.000	85=0.000	technology=0.000	
1999=0.000	parts=0.000	pm=0.000	direct=0.000	cs=0.000	meeting=0.000	
original=0.000	project=0.000	re=0.000	edu=0.000	table=0.000	conference=0.000	
char_freq_;=0.000	char_freq_(=0.220	char_freq_[=0.220	char_freq_!=0.073	char_freq_$=0.000	char_freq_#=0.073	
capital_run_length_average=1.167	capital_run_length_longest=2.000	capital_run_length_total=56.000	

In [15]:
# Second mail: build the feature dict for 'spam_or_no_spam_2.txt' and print it six entries per row
second_mail = extract_fature('spam_or_no_spam_2.txt', feature_names, CHARS_FOR_REGEX)
print_n_by_n(second_mail, 6)

Total number of: words=71    characters=369
make=0.000	address=0.000	all=0.000	3d=0.000	our=0.000	over=0.000	
remove=0.000	internet=0.000	order=0.000	mail=1.408	receive=0.000	will=0.000	
people=0.000	report=0.000	addresses=0.000	free=0.000	business=0.000	email=4.225	
you=1.408	credit=0.000	your=1.408	font=0.000	000=0.000	money=0.000	
hp=0.000	hpl=0.000	george=0.000	650=0.000	lab=0.000	labs=0.000	
telnet=0.000	857=0.000	data=0.000	415=0.000	85=0.000	technology=0.000	
1999=0.000	parts=0.000	pm=0.000	direct=0.000	cs=0.000	meeting=0.000	
original=0.000	project=0.000	re=1.408	edu=0.000	table=0.000	conference=0.000	
char_freq_;=0.000	char_freq_(=0.000	char_freq_[=0.000	char_freq_!=0.000	char_freq_$=0.000	char_freq_#=0.000	
capital_run_length_average=1.364	capital_run_length_longest=5.000	capital_run_length_total=45.000	

In [16]:
# Third mail: build the feature dict for 'spam_or_no_spam_3.txt' and print it six entries per row
third_mail = extract_fature('spam_or_no_spam_3.txt', feature_names, CHARS_FOR_REGEX)
print_n_by_n(third_mail, 6)

Total number of: words=131    characters=711
make=0.000	address=0.000	all=0.000	3d=0.000	our=0.763	over=0.000	
remove=0.000	internet=0.000	order=0.000	mail=0.000	receive=0.000	will=0.000	
people=0.000	report=0.000	addresses=0.000	free=0.000	business=3.053	email=0.000	
you=1.527	credit=0.000	your=1.527	font=0.000	000=0.000	money=0.000	
hp=0.000	hpl=0.000	george=0.000	650=0.000	lab=0.000	labs=0.000	
telnet=0.000	857=0.000	data=0.000	415=0.000	85=0.000	technology=0.000	
1999=0.000	parts=0.000	pm=0.000	direct=0.000	cs=0.000	meeting=0.000	
original=0.000	project=1.527	re=0.000	edu=0.000	table=0.000	conference=0.000	
char_freq_;=0.141	char_freq_(=0.141	char_freq_[=0.000	char_freq_!=0.000	char_freq_$=0.000	char_freq_#=0.000	
capital_run_length_average=1.657	capital_run_length_longest=10.000	capital_run_length_total=58.000	