## masukan library yang digunakan

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [105]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\telog\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## load dataset

In [106]:
# --- Baca dua dataset ---
data1 = pd.read_csv('email_spam_indo.csv')
data2 = pd.read_csv('spam.csv')

# --- Samakan nama kolom agar bisa digabung otomatis ---
data1 = data1.rename(columns={'Kategori': 'Category', 'Pesan': 'Message'})
data2 = data2.rename(columns={'Category': 'Category', 'Message': 'Message'})

# --- Gabungkan dua dataset ---
data = pd.concat([data1, data2], ignore_index=True)

# --- Bersihkan nilai NaN ---
data['Message'] = data['Message'].fillna('')

print("Ukuran data:", data.shape)
print(data.head())

Ukuran data: (8208, 2)
  Category                                            Message
0     spam  Secara alami tak tertahankan identitas perusah...
1     spam  Fanny Gunslinger Perdagangan Saham adalah Merr...
2     spam  Rumah -rumah baru yang luar biasa menjadi muda...
3     spam  4 Permintaan Khusus Pencetakan Warna Informasi...
4     spam  Jangan punya uang, dapatkan CD perangkat lunak...


## Tetx Preprocessing

## Case Folding

In [107]:
import re

# membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower()                                 # merubah kalimat menjadi huruf kecil
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # menghapus url dari kalimat
    text = re.sub(r'[-+]?[0-9]+', '', text)              # menghapus angka dari kalimat
    text = re.sub(r'[^\w\s]', '', text)                 # menghapus tanda baca
    text = text.strip()
    return text

In [108]:
raw_sample = data['Message'].iloc[2]
case_folding = casefolding(raw_sample)

print("Raw data:", raw_sample)
print("Case Folding:", case_folding)

Raw data: Rumah -rumah baru yang luar biasa menjadi mudah saya ingin menunjukkan kepada Anda pemilik rumah ini, Anda telah disetujui untuk pinjaman rumah $ 454, 169 di A 3. 72 Tingkat Tetap. Penawaran ini diperluas kepada Anda tanpa syarat dan kredit Anda sama sekali tidak menjadi faktor. Untuk memanfaatkan kesempatan waktu terbatas ini, yang kami minta adalah Anda mengunjungi situs web kami dan melengkapi formulir persetujuan pos 1 menit, lihatlah untuk mendengar dari Anda, Dorcas Pittman
Case Folding: rumah rumah baru yang luar biasa menjadi mudah saya ingin menunjukkan kepada anda pemilik rumah ini anda telah disetujui untuk pinjaman rumah    di a   tingkat tetap penawaran ini diperluas kepada anda tanpa syarat dan kredit anda sama sekali tidak menjadi faktor untuk memanfaatkan kesempatan waktu terbatas ini yang kami minta adalah anda mengunjungi situs web kami dan melengkapi formulir persetujuan pos  menit lihatlah untuk mendengar dari anda dorcas pittman


## Word Normalization

In [109]:
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
    if (key_norm['singkat'] == word).any()
    else word for word in text.split()
    ])

    text = str.lower(text)
    return text

In [110]:
# membandingkan before dan after word normalization

raw_data = data['Message'].iloc[696]
word_normal = text_normalize(case_folding)

print('Raw Data\t :', raw_data)
print('Word Normalize\t :', word_normal)

Raw Data	 : Kami menantang Anda untuk menemukan panggilan anuitas yang lebih baik hari ini untuk informasi lebih lanjut! - atau - Silakan isi formulir di bawah ini untuk informasi lebih lanjut nama: E - Mail: Telepon: Kota: Negara Bagian: * 5. 40 % untuk setoran $ 100.000 dan lebih tinggi, 5. Bunga 25 % untuk setoran berjumlah $ 25.000 - $ 99, 999. Kami tidak ingin siapa pun menerima surat kami yang tidak ingin menerimanya. Ini adalah komunikasi profesional yang dikirim ke profesional asuransi. Untuk dihapus dari milis ini, jangan membalas pesan ini. Sebaliknya, buka di sini: http: / / www. asuransiq. Pemberitahuan hukum com / optout
Word Normalize	 : rumah rumah baru yang luar biasa menjadi mudah saya ingin menunjukkan kepada anda pemilik rumah ini anda telah disetujui untuk pinjaman rumah di a tingkat tetap penawaran ini diperluas kepada anda tanpa syarat dan kredit anda sama sekali tidak menjadi faktor untuk memanfaatkan kesempatan waktu terbatas ini yang kami minta adalah anda meng

## filtering (Stopword Removal)

In [111]:
from nltk.corpus import stopwords

# Ambil stopwords untuk Bahasa Indonesia dan Bahasa Inggris
stopwords_ind = stopwords.words('indonesian')
stopwords_eng = stopwords.words('english')

# Gabungkan keduanya jadi satu set agar tidak ada duplikasi
stop_words = set(stopwords_ind + stopwords_eng)

print("Total stopwords gabungan:", len(stop_words))


Total stopwords gabungan: 955


In [112]:
# melihat daftar stopword dari nltk
stop_words

{'a',
 'about',
 'above',
 'ada',
 'adalah',
 'adanya',
 'adapun',
 'after',
 'again',
 'against',
 'agak',
 'agaknya',
 'agar',
 'ain',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'all',
 'am',
 'amat',
 'amatlah',
 'an',
 'and',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'any',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'are',
 'aren',
 "aren't",
 'artinya',
 'as',
 'asal',
 'asalkan',
 'at',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'be',
 'beberapa',
 'because',
 'been',
 'before',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'being',
 'bekerja',
 'belakang',
 'belakangan',
 'below',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 

In [113]:
# membuat fungsi stopword removal
more_stopword = ['tsel', 'gb', 'rb', 'btw']
stop_words.update(more_stopword)  # ✅ cara yang benar

def remove_stop_word(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stop_words:
            clean_words.append(word)
    return " ".join(clean_words)


In [114]:
raw_sample = data['Message'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)

print('Raw Data \t\t :', raw_data)
print('Case Folding \t\t :', case_folding)
print('Stopword Reomoval \t\t', stopword_removal)

Raw Data 		 : Kami menantang Anda untuk menemukan panggilan anuitas yang lebih baik hari ini untuk informasi lebih lanjut! - atau - Silakan isi formulir di bawah ini untuk informasi lebih lanjut nama: E - Mail: Telepon: Kota: Negara Bagian: * 5. 40 % untuk setoran $ 100.000 dan lebih tinggi, 5. Bunga 25 % untuk setoran berjumlah $ 25.000 - $ 99, 999. Kami tidak ingin siapa pun menerima surat kami yang tidak ingin menerimanya. Ini adalah komunikasi profesional yang dikirim ke profesional asuransi. Untuk dihapus dari milis ini, jangan membalas pesan ini. Sebaliknya, buka di sini: http: / / www. asuransiq. Pemberitahuan hukum com / optout
Case Folding 		 : kami menantang anda untuk menemukan panggilan anuitas yang lebih baik hari ini untuk informasi lebih lanjut  atau  silakan isi formulir di bawah ini untuk informasi lebih lanjut nama e  mail telepon kota negara bagian     untuk setoran   dan lebih tinggi  bunga   untuk setoran berjumlah       kami tidak ingin siapa pun menerima surat ka

## Stemming

In [115]:
!pip -q install sastrawi

In [116]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
from langdetect import detect

# Inisialisasi stemmer untuk Bahasa Indonesia dan Inggris
factory = StemmerFactory()
stemmer_id = factory.create_stemmer()
stemmer_en = PorterStemmer()

def stemming_auto(text):
    try:
        # deteksi bahasa (id atau en)
        lang = detect(text)
    except:
        lang = 'unknown'
    
    if lang == 'id':
        return stemmer_id.stem(text)
    elif lang == 'en':
        words = text.split()
        stemmed_words = [stemmer_en.stem(word) for word in words]
        return " ".join(stemmed_words)
    else:
        return text  # tidak diubah jika bahasa tidak terdeteksi


In [117]:
raw_sample = data['Message'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming_auto(stopword_removal)

print('Raw Data \t\t :', raw_sample)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t\t :', stopword_removal)
print('Stemming \t\t :', text_stemming)

Raw Data 		 : Kami menantang Anda untuk menemukan panggilan anuitas yang lebih baik hari ini untuk informasi lebih lanjut! - atau - Silakan isi formulir di bawah ini untuk informasi lebih lanjut nama: E - Mail: Telepon: Kota: Negara Bagian: * 5. 40 % untuk setoran $ 100.000 dan lebih tinggi, 5. Bunga 25 % untuk setoran berjumlah $ 25.000 - $ 99, 999. Kami tidak ingin siapa pun menerima surat kami yang tidak ingin menerimanya. Ini adalah komunikasi profesional yang dikirim ke profesional asuransi. Untuk dihapus dari milis ini, jangan membalas pesan ini. Sebaliknya, buka di sini: http: / / www. asuransiq. Pemberitahuan hukum com / optout
Case Folding 		 : kami menantang anda untuk menemukan panggilan anuitas yang lebih baik hari ini untuk informasi lebih lanjut  atau  silakan isi formulir di bawah ini untuk informasi lebih lanjut nama e  mail telepon kota negara bagian     untuk setoran   dan lebih tinggi  bunga   untuk setoran berjumlah       kami tidak ingin siapa pun menerima surat ka

## text preprocessing pipeline

In [118]:
# membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
    text = casefolding(text)
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming_auto(text)
    return text

In [119]:
%%time
data['clean_teks']= data['Message'].apply(text_preprocessing_process)

CPU times: total: 15min 16s
Wall time: 15min 35s


In [120]:
data

Unnamed: 0,Category,Message,clean_teks
0,spam,Secara alami tak tertahankan identitas perusah...,alami tahan identitas usaha sulit usaha pasar ...
1,spam,Fanny Gunslinger Perdagangan Saham adalah Merr...,fanny gunslinger dagang saham merrill muzo col...
2,spam,Rumah -rumah baru yang luar biasa menjadi muda...,rumah rumah mudah milik rumah tuju pinjam ruma...
3,spam,4 Permintaan Khusus Pencetakan Warna Informasi...,minta khusus cetak warna informasi tambah klik...
4,spam,"Jangan punya uang, dapatkan CD perangkat lunak...",uang dapat cd perangkat lunak kompatibilitas p...
...,...,...,...
8203,spam,This is the 2nd time we have tried 2 contact u...,nd tri kontak pound prize claim easi telepon p...
8204,ham,Will ü b going to esplanade fr home?,ü bb going esplanade fr home
8205,ham,"Pity, * was in mood for that. So...any other s...",piti mood soani suggest
8206,ham,The guy did some bitching but I acted like i'd...,guy bitch act like id interest buy someth els ...


In [121]:
# simpan data yang sudah dipreprocessing ke dalam file csv
data.to_csv('clean_data.csv')

## Feature Engineering

In [122]:
# pisahkan kolom feature dan target
x = data['clean_teks']
y = data['Category'] 

In [123]:
x

0       alami tahan identitas usaha sulit usaha pasar ...
1       fanny gunslinger dagang saham merrill muzo col...
2       rumah rumah mudah milik rumah tuju pinjam ruma...
3       minta khusus cetak warna informasi tambah klik...
4       uang dapat cd perangkat lunak kompatibilitas p...
                              ...                        
8203    nd tri kontak pound prize claim easi telepon p...
8204                         ü bb going esplanade fr home
8205                              piti mood soani suggest
8206    guy bitch act like id interest buy someth els ...
8207                                       rofl true nama
Name: clean_teks, Length: 8208, dtype: object

In [124]:
y

0       spam
1       spam
2       spam
3       spam
4       spam
        ... 
8203    spam
8204     ham
8205     ham
8206     ham
8207     ham
Name: Category, Length: 8208, dtype: object

## Feature Extraction (TF-IDF dan N-Gram)

In [125]:
# save model
import pickle

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform(x)

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))

In [126]:
# menampilkan vocabulary dari tif-idf
vec_TF_IDF.vocabulary_

{'alami': 529,
 'tahan': 22222,
 'identitas': 9975,
 'usaha': 23937,
 'sulit': 21917,
 'pasar': 16685,
 'penuh': 16970,
 'suqestions': 22009,
 'informasi': 10272,
 'logo': 13012,
 'tarik': 22348,
 'statlonery': 21571,
 'gaya': 8237,
 'situs': 20847,
 'tugas': 23474,
 'mudah': 14891,
 'janji': 10819,
 'havinq': 9228,
 'mes': 14257,
 'iogo': 10537,
 'otomatis': 16306,
 'dunia': 6155,
 'ieader': 9987,
 'isguite': 10622,
 'ciear': 3858,
 'produk': 18047,
 'organisasi': 16217,
 'bisnis': 2358,
 'efektif': 6405,
 'tuju': 23478,
 'praktis': 17829,
 'upaya': 23867,
 'daftar': 4855,
 'manfaat': 13547,
 'kreativitas': 12119,
 'asli': 1294,
 'buat': 2877,
 'tangan': 22311,
 'khusus': 11688,
 'cermin': 3549,
 'citra': 3901,
 'khas': 11679,
 'nyaman': 15775,
 'alat': 536,
 'tulis': 23491,
 'sedia': 20078,
 'format': 7823,
 'sistem': 20825,
 'manajemen': 13526,
 'konten': 12021,
 'letsyou': 12702,
 'ubah': 23621,
 'struktur': 21765,
 'tepat': 22604,
 'draft': 6014,
 'kerja': 11599,
 'jangkau': 10815

In [127]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))

25673


In [128]:
# melihat fitur apa saja yang ada di dalam corpus
print(vec_TF_IDF.get_feature_names_out())

['____' 'aa' 'aaa' ... 'zzn' 'zzzz' 'üll']


In [129]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,____,aa,aaa,aah,aal,aaliyah,aall,aaniye,aaooooright,aaron,...,zxghlajf,zyada,zyban,zyc,zygoma,zymg,zzmacmac,zzn,zzzz,üll
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
data_tabular_tf_idf.iloc[10:20,60:70]

Unnamed: 0,abklaeren,abl,able,abler,abn,abnorm,abnormal,abook,abormalitas,abouta
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

In [131]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [132]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# untuk reduced features
print('Original Feature Number', x_train.shape[1])
print('Reduced feature Number', x_kbest_features.shape[1])

Original Feature Number 25673
Reduced feature Number 3000


In [133]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.188604
1,0.527398
2,0.067958
3,0.470213
4,0.184281
...,...
25668,1.650032
25669,0.077971
25670,0.257448
25671,2.645238


In [134]:
# menampilkan feature beserta nilainya

feature = vec_TF_IDF.get_feature_names_out()
feature

Data['Fitur'] = feature
Data

Unnamed: 0,Nilai,Fitur
0,0.188604,____
1,0.527398,aa
2,0.067958,aaa
3,0.470213,aah
4,0.184281,aal
...,...,...
25668,1.650032,zymg
25669,0.077971,zzmacmac
25670,0.257448,zzn
25671,2.645238,zzzz


In [135]:
# mengurutkan nilai feature terbaik
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
20847,1.960114e+02,situs
11804,9.911761e+01,klik
13252,9.494826e+01,lunak
16991,8.765521e+01,perangkat
3916,7.735402e+01,claim
...,...,...
10002,1.335443e-06,ignor
1463,7.921044e-07,aus
12761,4.056779e-07,liberty
12203,8.099756e-08,kue


In [136]:
mask = chi2_features.get_support()
mask

array([False, False, False, ..., False,  True, False], shape=(25673,))

In [137]:
# menampilkan fitur yang terpilih berdsarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi square

new_feature=[]
for bool, f in zip(mask, feature):
    if bool :
        new_feature.append(f)
    selected_feature=new_feature
selected_feature

['ab',
 'abacha',
 'abadi',
 'abai',
 'abdulla',
 'abiola',
 'abl',
 'abt',
 'abta',
 'ac',
 'acadia',
 'access',
 'accessories',
 'account',
 'aclpm',
 'acoentri',
 'acrobat',
 'adab',
 'adclick',
 'addamsfa',
 'adel',
 'adil',
 'admir',
 'admirer',
 'adob',
 'adobe',
 'adobechoose',
 'adon',
 'adp',
 'adqueen',
 'ads',
 'adv',
 'advantaqes',
 'advis',
 'advisor',
 'ae',
 'aeopublishing',
 'aerofoam',
 'afiliasi',
 'afrika',
 'aft',
 'afternoon',
 'ag',
 'agama',
 'age',
 'agen',
 'agenda',
 'ago',
 'agrocom',
 'ah',
 'aha',
 'ahmad',
 'aicohoi',
 'aicohol',
 'aight',
 'aii',
 'aiia',
 'aiiti',
 'aiity',
 'ail',
 'air',
 'aja',
 'ajaib',
 'ajar',
 'aju',
 'akar',
 'akibat',
 'akreditasi',
 'akses',
 'aksesori',
 'aksi',
 'aktif',
 'aktivitas',
 'aku',
 'akuisisi',
 'akun',
 'al',
 'alamat',
 'alami',
 'alas',
 'alat',
 'album',
 'alcohoi',
 'alert',
 'alex',
 'alfi',
 'algarv',
 'ali',
 'alia',
 'alias',
 'aliceposta',
 'alkohol',
 'almarhum',
 'alreadi',
 'already',
 'alright',
 'alt

In [138]:
# membuat vocabulary baru berdasarkan fitur yang terseleksi

new_selected_feature = {}

for (k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v

new_selected_feature

{'alami': 529,
 'tahan': 22222,
 'identitas': 9975,
 'usaha': 23937,
 'sulit': 21917,
 'pasar': 16685,
 'penuh': 16970,
 'suqestions': 22009,
 'informasi': 10272,
 'logo': 13012,
 'tarik': 22348,
 'statlonery': 21571,
 'gaya': 8237,
 'situs': 20847,
 'tugas': 23474,
 'mudah': 14891,
 'janji': 10819,
 'havinq': 9228,
 'mes': 14257,
 'iogo': 10537,
 'otomatis': 16306,
 'dunia': 6155,
 'ieader': 9987,
 'isguite': 10622,
 'ciear': 3858,
 'produk': 18047,
 'organisasi': 16217,
 'bisnis': 2358,
 'efektif': 6405,
 'tuju': 23478,
 'praktis': 17829,
 'upaya': 23867,
 'daftar': 4855,
 'manfaat': 13547,
 'kreativitas': 12119,
 'asli': 1294,
 'buat': 2877,
 'tangan': 22311,
 'khusus': 11688,
 'cermin': 3549,
 'citra': 3901,
 'khas': 11679,
 'nyaman': 15775,
 'alat': 536,
 'tulis': 23491,
 'sedia': 20078,
 'format': 7823,
 'sistem': 20825,
 'konten': 12021,
 'letsyou': 12702,
 'ubah': 23621,
 'struktur': 21765,
 'tepat': 22604,
 'draft': 6014,
 'kerja': 11599,
 'jangkau': 10815,
 'istirahat': 10652

In [139]:
len(new_selected_feature)

3000

In [140]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav","wb"))

In [141]:
# menampilkan fitur-fitur yang sudah diseleksi

data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,ab,abacha,abadi,abai,abdulla,abiola,abl,abt,abta,ac,...,za,zed,zimbabwe,zimin,zip,zipmail,zona,zuma,zymg,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
selected_x =x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(8208, 3000))

In [143]:
import random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [144]:
x = selected_x
y = data.Category

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [145]:
print('Banyaknya X_train:', len(x_train))
print('Banyaknya X_test:', len(x_test))
print('Banyaknya Y_train:', len(y_train))
print('Banyaknya Y_test:', len(y_test))

Banyaknya X_train: 6566
Banyaknya X_test: 1642
Banyaknya Y_train: 6566
Banyaknya Y_test: 1642


In [146]:
text_algorithm=MultinomialNB()

In [147]:
model = text_algorithm.fit(x_train, y_train)

In [185]:
data_input = ("perangkat lunak grafi tersedia versi oem murah selamat pagi menawarkan paket oem terbaru grafik perangkat lunak publishinq corel macromedia adob adob photoshop cs macromedia studio mx adob acrobat profesion adob premier pro corel design quickbook profestiveai edit adob paqemak xara x vl audisi adob discreet studio maksim adob goiiv cs adob effect standar adob premier eiement corei painter ix adob lilustr cs adob indesign cs adob creativ suit adob framemak ulead cool studio produksi aiia motion buiider profesion quicken premier home biz adob photoshop eiement adob premier pro pelajari sinceeiy clemmi")
data_input = text_preprocessing_process(data_input)

tfidf = TfidfVectorizer
loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav","rb"))))
hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil=='spam'):
    s = "Email Spam"
else:
    s = "Email Normal"

print("Hasil Prediksi : \n", s)



Hasil Prediksi : 
 Email Spam


In [149]:
from sklearn.metrics import classification_report, confusion_matrix

predicted = model.predict(x_test)
CM = confusion_matrix(y_test, predicted)
print(classification_report(y_test,predicted))

              precision    recall  f1-score   support

         ham       0.96      0.99      0.98      1229
        spam       0.97      0.88      0.93       413

    accuracy                           0.96      1642
   macro avg       0.97      0.94      0.95      1642
weighted avg       0.96      0.96      0.96      1642



In [150]:
pickle.dump(model, open("model_fraud.sav","wb"))