<a href="https://colab.research.google.com/github/firmanmaulana123/eda-boston-housing/blob/main/TUGAS_5_PraktikumAlgoritma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk wordcloud
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Fetch the NLTK data used below.  `word_tokenize` relies on the Punkt
# tokenizer models: older NLTK releases load them from the `punkt` package,
# while newer releases (3.9+) look for `punkt_tab` instead.  Because the
# notebook installs whatever NLTK version pip resolves, request both so the
# tokenizer works on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list used to filter tokens during preprocessing.
nltk.download('stopwords')

from google.colab import files
uploaded = files.upload()  # upload 'spam.csv' and 'words.csv'

# Read the raw dataset, discard the three unnamed extra columns and give the
# remaining two columns descriptive names.
mails = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'spam', 'v2': 'message'})
)
# Turn the textual label into a boolean flag (True means spam).
mails['spam'] = mails['spam'].map({'ham': False, 'spam': True})

# Positional 70/30 split: the first 70% of the rows are used for training and
# the remainder for testing (rows are not shuffled).
all_mails_count = len(mails)
split_idx = int(all_mails_count * 0.7)

train_data = mails.iloc[:split_idx]
test_data = mails.iloc[split_idx:].reset_index(drop=True)

def count_words(data):
    """Count how often each whitespace-separated token occurs in *data*.

    Args:
        data: Iterable of message strings.

    Returns:
        A ``collections.OrderedDict`` mapping each token to its total number
        of occurrences, with keys in the order the tokens were first seen.
    """
    tally = collections.OrderedDict()
    # Flatten all messages into one stream of tokens.
    tokens = (token for text in data for token in text.split())
    for token in tokens:
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return tally

# Collect the distinct training messages of each class; building a set drops
# messages whose text is repeated within the same class.
spam_messages = set(train_data.loc[train_data['spam'] == True, 'message'])
ham_messages = set(train_data.loc[train_data['spam'] == False, 'message'])

# Raw (unprocessed) token counts per class, used for the bar charts.
spam_words, ham_words = count_words(spam_messages), count_words(ham_messages)

def bar_chart_words(words, top=10, messages_type="", color="#1f77b4"):
    """Show a horizontal bar chart of the most frequent entries in *words*.

    Args:
        words: Mapping of word to count.
        top: Number of highest-count words to display.
        messages_type: Label inserted into the chart title.
        color: Bar colour passed to matplotlib.
    """
    # Highest counts first; the sort is stable, so ties keep mapping order.
    ranked = sorted(words.items(), key=lambda pair: pair[1], reverse=True)
    # barh draws the first entry at the bottom, so flip the selection to put
    # the largest count at the top of the chart.
    ascending = ranked[:top][::-1]
    labels = [word for word, _ in ascending]
    values = [count for _, count in ascending]

    plt.figure(figsize=(10, 6))
    plt.barh(labels, values, color=color)
    plt.xlabel("Jumlah Kata")
    plt.title(f"Top {top} Kata Terbanyak pada Pesan {messages_type}")
    plt.show()

# Plot the 15 most frequent raw tokens of each class.
bar_chart_words(spam_words, top=15, messages_type="Spam", color="orange")
bar_chart_words(ham_words, top=15, messages_type="Ham", color="green")

# Load the vocabulary file as a single header-less column named 'words'.
# NOTE(review): with read_csv's default NA handling, entries such as "null",
# "NA" or "nan" are parsed as missing values rather than strings — confirm
# whether keep_default_na=False is wanted for this word list.
words_df = pd.read_csv('words.csv', encoding='utf-8', header=None)
words_df.columns = ['words']
# Sets give fast membership tests; both are consulted in process_message.
wordlist = set(words_df['words'])
stop_words = set(stopwords.words('english'))
# Porter stemmer used to reduce the accepted tokens to their stems.
stemmer = PorterStemmer()

def process_message(message):
    """Normalise *message* into a list of stemmed vocabulary tokens.

    The text is lower-cased and tokenized with ``word_tokenize``.  A token is
    kept only if it is longer than one character, is not an English stop
    word, and appears in the module-level ``wordlist``.  Each kept token is
    then reduced to its Porter stem.

    Args:
        message: Message text.

    Returns:
        List of stems in token order (duplicates are preserved).
    """
    stems = []
    for token in word_tokenize(message.lower()):
        # Skip single characters, stop words and out-of-vocabulary tokens.
        if len(token) <= 1 or token in stop_words or token not in wordlist:
            continue
        stems.append(stemmer.stem(token))
    return stems

def count_processed_words(data):
    """Count, for every stem, the number of messages in *data* that contain it.

    Each message is run through ``process_message`` and its stems are
    de-duplicated first, so a stem contributes at most one to its count per
    message.

    Args:
        data: Iterable of message strings.

    Returns:
        A ``collections.OrderedDict`` mapping stem to message count.
    """
    doc_freq = collections.OrderedDict()
    for text in data:
        unique_stems = set(process_message(text))
        for stem in unique_stems:
            doc_freq.setdefault(stem, 0)
            doc_freq[stem] += 1
    return doc_freq

# Rebind the per-class tables: from here on they hold per-message counts of
# processed stems instead of the raw token counts used for the bar charts.
spam_words, ham_words = (
    count_processed_words(spam_messages),
    count_processed_words(ham_messages),
)

def show_wordcloud(word_freq, title='', max_words=15):
    """Render a word cloud for a word-to-frequency mapping.

    Args:
        word_freq: Mapping of word to frequency, passed to
            ``WordCloud.generate_from_frequencies``.
        title: Title shown above the image.
        max_words: Maximum number of words drawn in the cloud (defaults to
            15, the value that was previously hard-coded).
    """
    wc = WordCloud(width=800, height=600, max_words=max_words).generate_from_frequencies(word_freq)
    plt.figure(figsize=(8, 6), facecolor='k')
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    # The figure background is black, so draw the title in white; with the
    # default dark text colour it would not be visible.
    plt.title(title, color='w')
    plt.tight_layout(pad=0)
    plt.show()

# Visualise the most common processed stems of each class.
show_wordcloud(spam_words, "Spam Words")
show_wordcloud(ham_words, "Ham Words")

# Per-message stem counts over the whole training corpus (spam followed by ham).
all_messages = [*spam_messages, *ham_messages]
all_words = count_processed_words(all_messages)

def spam(message, s=1, p=0.5, percentage=False):
    """Estimate whether *message* is spam from its per-word spam rates.

    The message is processed with ``process_message``; stems that do not
    occur in the module-level ``all_words`` table are ignored.  For each
    remaining stem the share of its training occurrences that are spam
    ("spaminess") is computed from ``spam_words`` / ``ham_words`` and pulled
    towards the prior ``p`` with strength ``s``::

        corrected = (s * p + N * spaminess) / (s + N)    # N = all_words[stem]

    The corrected values are combined as ``n = sum(log(1 - q) - log(q))`` and
    the final probability is ``1 / (1 + e**n)``.

    Args:
        message: Message text to classify.
        s: Strength given to the prior ``p``.
        p: Prior spam probability assumed for a word.
        percentage: When true, return a formatted string instead of a flag.

    Returns:
        If ``percentage`` is true, the string ``"Probabilitas spam: XX.XX%"``;
        otherwise a truth value that is true when the probability exceeds 0.5.
        A message with no known stems has probability exactly 0.5 and is
        therefore reported as not spam.
    """
    # Keep corrected probabilities strictly inside (0, 1).  With s == 0, or p
    # equal to 0 or 1, a value of exactly 0 or 1 would make np.log return
    # -inf, and opposite infinities would sum to NaN.
    eps = np.finfo(float).eps
    n = 0.0
    for word in process_message(message):
        if word not in all_words:
            continue
        total = all_words[word]
        spam_freq = spam_words.get(word, 0) / total
        ham_freq = ham_words.get(word, 0) / total

        if (spam_freq + ham_freq) == 0:
            continue

        spaminess = spam_freq / (spam_freq + ham_freq)
        corr_spaminess = (s * p + total * spaminess) / (s + total)
        corr_spaminess = min(max(corr_spaminess, eps), 1 - eps)
        n += np.log(1 - corr_spaminess) - np.log(corr_spaminess)

    # Evaluate 1 / (1 + e**n) without overflowing: for n >= 0 use the
    # equivalent form e**-n / (1 + e**-n), whose exponential never exceeds 1.
    if n >= 0:
        decay = np.exp(-n)
        prob = decay / (1 + decay)
    else:
        prob = 1 / (1 + np.exp(n))

    if percentage:
        return f"Probabilitas spam: {prob * 100:.2f}%"
    return prob > 0.5

def test(spam_test, ham_test, s=1, p=0.5):
    """Evaluate the ``spam`` classifier and print summary metrics.

    Args:
        spam_test: Messages whose true label is spam.
        ham_test: Messages whose true label is ham.
        s: Prior strength forwarded to ``spam``.
        p: Prior spam probability forwarded to ``spam``.

    Prints precision, recall, F1 score and balanced accuracy (the mean of
    recall and specificity).  Any ratio whose denominator is zero is
    reported as 0 instead of raising.
    """
    tp = sum(1 for m in spam_test if spam(m, s, p))
    fn = len(spam_test) - tp
    fp = sum(1 for m in ham_test if spam(m, s, p))
    tn = len(ham_test) - fp

    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    # Guarded like the other ratios: an empty ham_test would otherwise raise
    # ZeroDivisionError here.
    specificity = tn / (tn + fp) if (tn + fp) else 0
    fscore = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    acc = (recall + specificity) / 2

    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1 Score: {fscore:.2%}")
    print(f"Balanced Accuracy: {acc:.2%}")

# Keep only the test messages that still contain at least one vocabulary stem
# after preprocessing, separately for each class.
test_spam = [
    text
    for text in test_data.loc[test_data['spam'] == True, 'message']
    if process_message(text)
]
test_ham = [
    text
    for text in test_data.loc[test_data['spam'] == False, 'message']
    if process_message(text)
]
test(test_spam, test_ham)





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
