**Programming Exercise 1**

In [1]:
import json
import os
import re

In [2]:
class FeatureExtractor:
    """Compute word, character and capital-run features for one email text.

    Typical use: call ``load(path)`` (or assign ``text`` directly) and then
    ``extract()``, which returns a flat dict with the keys

    * ``word_freq_<word>`` for every entry of ``FEATURE_WORDS``,
    * ``char_freq_<char>`` for every entry of ``FEATURE_CHARACTERS``,
    * ``capital_run_length_average`` / ``_longest`` / ``_total``.

    NOTE(review): the feature names look like the UCI Spambase attributes;
    confirm against the exercise statement. Word matching is case-sensitive
    (``'Free'`` does not count towards ``'free'``) -- verify that is intended.
    """

    # Words whose relative frequency is reported, in output order.
    FEATURE_WORDS = (
        'make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
        'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses',
        'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000',
        'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
        'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
        'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
        'conference',
    )

    # Characters whose relative frequency is reported, in output order.
    FEATURE_CHARACTERS = (';', '(', '[', '!', '$', '#')

    def __init__(self):
        self.text = ''
        self._reset_counts()

    def _reset_counts(self):
        """Zero every counter while leaving ``text`` untouched."""
        # Raw occurrence counts, stored as floats like the original literals.
        self.word_frequency = dict.fromkeys(self.FEATURE_WORDS, 0.0)
        self.character_frequency = dict.fromkeys(self.FEATURE_CHARACTERS, 0.0)

        self.capital_length_average = 0
        self.capital_length_longest = 0
        self.capital_length_total = 0

    def load(self, path):
        """Read the email at *path* into ``text`` and reset all counters.

        Surrounding whitespace is stripped and the lines are joined with a
        single space. Errors raised by ``open`` propagate to the caller; in
        that case ``text`` is left empty.
        """
        self.text = ''
        self._reset_counts()

        with open(path, 'r') as email:
            lines = email.read().strip().splitlines()
        self.text = ' '.join(lines)

    @staticmethod
    def _capital_run_lengths(words):
        """Yield the length of every run of consecutive capitals in *words*."""
        for word in words:
            run_length = 0
            for ch in word:
                if ch.isupper():
                    run_length += 1
                elif run_length:
                    # A non-capital character closes the current run.
                    yield run_length
                    run_length = 0
            if run_length:
                # The word ended while a run was still open.
                yield run_length

    @staticmethod
    def _percentage(count, total):
        """Return *count* as a percentage of *total*, or 0.0 if *total* is 0."""
        return (count / total) * 100 if total else 0.0

    def extract(self):
        """Compute the features of ``text`` and return them as a dict.

        The counters on the instance are recomputed from scratch on every
        call, so calling ``extract()`` repeatedly gives the same result.
        An empty ``text`` yields all-zero features instead of raising.

        Returns:
            dict mapping feature name to value. Word frequencies are a
            percentage of the number of words, character frequencies a
            percentage of ``len(text)``.
        """
        # Start from zero so a second call does not add to earlier counts.
        self._reset_counts()

        for ch in self.text:
            if ch in self.character_frequency:
                self.character_frequency[ch] += 1

        # Every character that is neither alphanumeric nor a space becomes a
        # separator; the words are then the alphanumeric runs between them.
        clean_text = ''.join(
            ch if ch.isalnum() or ch == ' ' else ' ' for ch in self.text
        )
        # split() without arguments drops the empty tokens that leading,
        # trailing or repeated separators would otherwise produce, so they
        # do not inflate the word count used as denominator below.
        words = clean_text.split()

        for word in words:
            if word in self.word_frequency:
                self.word_frequency[word] += 1

        run_lengths = list(self._capital_run_lengths(words))
        if run_lengths:
            self.capital_length_total = sum(run_lengths)
            self.capital_length_longest = max(run_lengths)
            self.capital_length_average = (
                self.capital_length_total / len(run_lengths)
            )

        word_total = len(words)
        char_total = len(self.text)

        features = {
            'word_freq_' + word: self._percentage(count, word_total)
            for word, count in self.word_frequency.items()
        }
        features.update({
            'char_freq_' + ch: self._percentage(count, char_total)
            for ch, count in self.character_frequency.items()
        })
        features["capital_run_length_average"] = self.capital_length_average
        features["capital_run_length_longest"] = self.capital_length_longest
        features["capital_run_length_total"] = self.capital_length_total

        return features

In [3]:
# Print the extracted features of every email file in the "emails" directory.
featureExtractor = FeatureExtractor()

# os.listdir returns entries in arbitrary order, so sort them to keep the
# "Email #N" numbering reproducible. Entries that are not regular files
# (e.g. subdirectories) are skipped because load() cannot open them.
emails = sorted(
    name for name in os.listdir("emails/")
    if os.path.isfile(os.path.join("emails", name))
)
for i, email in enumerate(emails):
    print(f'Email #{i + 1}: {email}')
    featureExtractor.load(os.path.join("emails", email))
    print(json.dumps(featureExtractor.extract(), indent=2))

Email #1: spam_or_no_spam.txt
{
  "word_freq_make": 0.0,
  "word_freq_address": 0.0,
  "word_freq_all": 0.0,
  "word_freq_3d": 0.0,
  "word_freq_our": 0.0,
  "word_freq_over": 0.0,
  "word_freq_remove": 0.0,
  "word_freq_internet": 0.0,
  "word_freq_order": 0.0,
  "word_freq_mail": 0.0,
  "word_freq_receive": 0.0,
  "word_freq_will": 0.8849557522123894,
  "word_freq_people": 0.0,
  "word_freq_report": 0.0,
  "word_freq_addresses": 0.0,
  "word_freq_free": 0.0,
  "word_freq_business": 0.0,
  "word_freq_email": 0.0,
  "word_freq_you": 0.0,
  "word_freq_credit": 0.0,
  "word_freq_your": 0.4424778761061947,
  "word_freq_font": 0.0,
  "word_freq_000": 0.0,
  "word_freq_money": 0.0,
  "word_freq_hp": 0.0,
  "word_freq_hpl": 0.0,
  "word_freq_george": 0.0,
  "word_freq_650": 0.0,
  "word_freq_lab": 0.0,
  "word_freq_labs": 0.0,
  "word_freq_telnet": 0.0,
  "word_freq_857": 0.0,
  "word_freq_data": 0.0,
  "word_freq_415": 0.0,
  "word_freq_85": 0.0,
  "word_freq_technology": 0.0,
  "word_freq_