In [32]:
import numpy as np
import reader

In [33]:
# Stopwords removed from the vocabulary before TF-IDF weighting.
# Entries are grouped by first letter; contraction fragments such as "'t" and
# "'ve" are kept exactly as in the source list. Repeated entries are written
# only once because a set ignores duplicates anyway.
stopwords = {
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren", "'t", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "can", "cannot", "could", "couldn", "did", "didn", "do", "does", "doesn", "doing", "don", "down", "during",
    "each", "few", "for", "from", "further",
    "had", "hadn", "has", "hasn", "have", "haven", "having",
    "he", "'d", "'ll", "'s", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "i", "'m", "'ve", "if", "in", "into", "is", "isn", "it", "its", "itself",
    "let", "me", "more", "most", "mustn", "my", "myself",
    "no", "nor", "not",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "shan", "she", "ll", "should", "shouldn", "so", "some", "such",
    "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "'re", "this", "those", "through", "to", "too",
    "under", "until", "up", "very",
    "was", "wasn", "we", "were", "weren", "what", "when", "where", "which", "while", "who", "whom", "why", "with", "won", "would", "wouldn",
    "you", "your", "yours", "yourself", "yourselves",
}


## Read in the spam and ham data

In [34]:
def load_data(trainingdir, stemming=False, lowercase=False, silently=False):
    """Load the e-mail corpus through ``reader.load_dataset_main``.

    The stemming and lowercase settings are echoed to stdout, then all four
    arguments are forwarded positionally to the reader.

    Args:
        trainingdir: Dataset location handed to the reader.
        stemming: Forwarded to the reader unchanged.
        lowercase: Forwarded to the reader unchanged.
        silently: Forwarded to the reader unchanged; it does not affect the
            two status lines printed here.

    Returns:
        A ``(documents, labels)`` tuple as produced by the reader.
    """
    # Echo the preprocessing switches so the cell output records them.
    for option_name, option_value in (("Stemming", stemming), ("Lowercase", lowercase)):
        print(f"{option_name} is {option_value}")

    documents, labels = reader.load_dataset_main(trainingdir, stemming, lowercase, silently)
    return documents, labels

In [35]:
# Relative path of the dataset directory that is handed to load_data.
training_dir = "data/enron2"

In [36]:
# Load the corpus with load_data's defaults (stemming=False, lowercase=False).
# Later cells join each train_set entry with spaces and treat a label of 1 as ham.
train_set, train_labels = load_data(training_dir)

Stemming is False
Lowercase is False


In [37]:
# Only the last expression of a cell is displayed, so a bare len(train_set)
# on its own line is silently discarded. Check instead that every document
# has exactly one label, then show the dataset size.
assert len(train_set) == len(train_labels), "documents and labels are misaligned"
len(train_labels)

5857

## Use TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer expects one string per document, so glue each document's
# tokens back together with single spaces.
corpus = [' '.join(tokens) for tokens in train_set]


In [39]:
# scikit-learn documents `stop_words` as 'english', a list, or None; recent
# releases validate the parameter type and reject a set. Pass a list instead
# (sorted so the argument is deterministic). The resulting vocabulary is the
# same because the vectorizer only uses the stop words for membership tests.
vectorizer = TfidfVectorizer(stop_words=sorted(stopwords))

# Sparse matrix with one row per document and one column per vocabulary term.
tf_idf_score = vectorizer.fit_transform(corpus)
print(tf_idf_score.shape)



(5857, 39970)


In [40]:
from sklearn.preprocessing import normalize

# Rescale every row (axis=1, i.e. every document) to unit L1 norm so that each
# document contributes the same total weight when columns are summed later.
normalized_matrix = normalize(tf_idf_score, norm='l1', axis=1)

print(normalized_matrix.shape)

(5857, 39970)


In [41]:
# Sum the normalized TF-IDF weights over all documents (axis=0) to get one
# aggregate score per vocabulary term. The result is 2-D with shape
# (1, n_terms), as the printed shape shows, which is why later cells index it
# as normalized_tf_idf[0, i].
normalized_tf_idf = np.sum(normalized_matrix, axis=0)

print(normalized_tf_idf.shape)

(1, 39970)


In [42]:
# Vocabulary terms learned by the vectorizer; words[i] is paired with score
# column i in the cells that follow.
words = vectorizer.get_feature_names_out()
print(words.shape)

(39970,)


## Save words and their TF-IDF scores to JSON

In [63]:
import json

# Step 1: Map every vocabulary term to its aggregate TF-IDF score.
# str()/float() turn the NumPy scalars into builtin types so the dictionary is
# plain Python data and always JSON-serializable.
# The dictionary has one entry per vocabulary term, so it is deliberately not
# printed in full; inspect a few items instead if needed.
word_freq_dict = {
    str(word): float(normalized_tf_idf[0, i]) for i, word in enumerate(words)
}

# Step 2: Save the dictionary to a JSON file
output_file_path = "word_tf-idf.json"

with open(output_file_path, "w") as json_file:
    json.dump(word_freq_dict, json_file)

print(f"The dictionary has been saved to {output_file_path}")

The dictionary has been saved to word_tf-idf.json


## Select the top N words with the highest TF-IDF scores

In [68]:
# Pair every vocabulary term with its aggregate TF-IDF score.
word_frequency_pairs = [
    (term, normalized_tf_idf[0, position]) for position, term in enumerate(words)
]

# Rank a copy of the pairs by score, highest first (the sort is stable, so
# ties keep their vocabulary order).
sorted_word_frequency_pairs = list(word_frequency_pairs)
sorted_word_frequency_pairs.sort(key=lambda pair: pair[1], reverse=True)

# Number of top-ranked words to keep.
N = 200

# Keep the N best-scoring (word, score) pairs and preview the first 20.
top_N_words = sorted_word_frequency_pairs[:N]

print(top_N_words[:20])

[('ect', 35.94399790144519), ('enron', 30.92573588844003), ('vince', 30.8748316648683), ('subject', 29.31466400160525), ('will', 20.731149371597464), ('hou', 18.01394101543068), ('please', 17.93445033506608), ('com', 17.431156098233238), ('kaminski', 16.53622061424596), ('2000', 16.22272182366876), ('re', 15.556103610695455), ('thanks', 13.969840502557894), ('research', 12.027757943258537), ('pm', 12.010420542115096), ('cc', 11.773726232181282), ('know', 11.315214911518169), ('shirley', 11.265758166001616), ('2001', 10.908095469407687), ('group', 10.12175561547176), ('time', 10.116420744736072)]


## Count how many ham and spam emails contain each word

In [51]:
# Document frequencies per class: word -> number of e-mails of that class
# that contain the word at least once.
ham_dict = {}
spam_dict = {}

# Set view of the vocabulary so membership tests are O(1). Scanning the whole
# vocabulary for every document (with a list lookup per word) costs roughly
# documents x vocabulary x document length operations.
vocabulary = set(words)

# NOTE(review): the vectorizer lowercases terms by default while the corpus was
# loaded with lowercase=False, so capitalized tokens presumably never match a
# vocabulary term here. That matching behavior is kept as-is; confirm intent.

# ham is 1; spam is 0 (any label other than 1 is counted as spam)
for i, (tokens, label) in enumerate(zip(train_set, train_labels)):
    class_counts = ham_dict if label == 1 else spam_dict

    # Each distinct vocabulary word counts once per document. Sorting keeps the
    # dictionary insertion order alphabetical within a document.
    for word in sorted(vocabulary.intersection(tokens)):
        class_counts[word] = class_counts.get(word, 0) + 1

    # Progress indicator every 100 documents.
    if i % 100 == 0:
        print(i)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800


In [69]:
# Write both per-class frequency dictionaries to JSON, ham first and spam
# second, reporting each file once it has been written.
for class_name, output_file_path, class_counts in (
    ("Ham", "ham_freq.json", ham_dict),
    ("Spam", "spam_freq.json", spam_dict),
):
    with open(output_file_path, "w") as json_file:
        json.dump(class_counts, json_file)

    print(f"{class_name} dictionary has been saved to {output_file_path}")

Ham dictionary has been saved to ham_freq.json
Spam dictionary has been saved to spam_freq.json
