# IMPORT LIBRARY

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Find the path of the stopwords resource
stopwords_path = nltk.data.find('corpora/stopwords.zip')
print(f"The stopwords resource is located at: {stopwords_path}")

The stopwords resource is located at: /root/nltk_data/corpora/stopwords.zip


# LOADING DATA

In [8]:
import os
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/dataset_sms_spam_v1.csv')
print(data.shape)
data.head()

Mounted at /content/drive
(1143, 2)


Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


# EDA

In [9]:
# CLASS DISTRIBUTION
#if dataset is balanced or not
x=data['label'].value_counts()
x

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,569
1,335
2,239


In [10]:
#Missing values
data.isna().sum()

Unnamed: 0,0
Teks,0
label,0


# TEXT PREPROCESSING

## CASE FOLDING(TEXT CLEANING)

In [11]:
import re

# Membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower()                               # membuat kalimat menjadi huruf kecil
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # menghapus url dari kalimat
    text = re.sub(r'[-+]?[0-9]+', '', text)           # menghapus angka dari kalimat
    text = re.sub(r'[^\w\s]', '', text)               # menghapus tanda baca dari kalimat
    text = re.sub(r'\s+', ' ', text)                  # menghapus spasi ganda dari kalimat
    text = text.strip()                               # menghapus spasi di awal dan akhir kalimat

    return text

In [12]:
data_sample = data['Teks'].iloc[3]
case_folding = casefolding(data_sample)
print('Test Sample\t : ', data_sample)
print('Case Folding\t : ', case_folding)

Test Sample	 :  2016-08-07 11:29:47.Plg Yth, sisa kuota Flash Anda 7160KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case Folding	 :  plg yth sisa kuota flash anda kb download mytelkomsel apps di utk cek kuotabeli paket flash atau hub


## WORD NORMALIZATION

In [13]:
key_norm = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/key_norm.csv')
key_norm.head()

Unnamed: 0,_id,singkat,hasil
0,1,abis,habis
1,2,accent,tekanan
2,3,accept,terima
3,4,accident,kecelakaan
4,5,achievement,prestasi


In [14]:
def textnormalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
    if (key_norm['singkat'] == word).any()
    else word for word in text.split()])

    text = str.lower(text)
    return text

In [15]:
data_sample = data['Teks'].iloc[696]
word_normal = textnormalize(data_sample)
print('Data Sample\t : ', data_sample)
print('Word Normalizing\t : ', word_normal)


Data Sample	 :  Btw magicomnya yg sedang Gais, gaada yg gede
Word Normalizing	 :  ngomong ngomong magicomnya yang sedang gais, tidak ada yang besar


## STOPWORD REMOVAL

In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopwords_indonesia = stopwords.words('indonesian')

In [17]:
len(stopwords_indonesia)

758

In [18]:
stopwords_indonesia

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [19]:
# menambahkan kata dalam stopword
more_stopword = ['tsel', 'telkomsel', 'xl', 'rb', 'btw']
stopwords_indonesia = stopwords_indonesia + more_stopword

# mebuat fungsi stopword removal
def stopwordremoval(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_indonesia:
            clean_words.append(word)
    return ' '.join(clean_words)

In [20]:
data_sample = data['Teks'].iloc[500]
case_folding = casefolding(data_sample)
word_norm = textnormalize(case_folding)
stopword_removal = stopwordremoval(word_norm)

print('Test Sample\t\t : ', data_sample)
print('Case Folding\t\t : ', case_folding)
print('Word Nomalisasi\t\t : ', word_norm)
print('Stopword Removal\t : ', stopword_removal)

Test Sample		 :  Slmt No Anda mendptkan Hadiah 1 unit MOBIL HONDA jazz dr promo TRI CARE 2016 dgn PIN PEMENANG:454647LM lnfo Klik:www.kejutanbimatri.blogspot.com
Case Folding		 :  slmt no anda mendptkan hadiah unit mobil honda jazz dr promo tri care dgn pin pemenanglm lnfo klik
Word Nomalisasi		 :  selamat nomor anda mendapatkan hadiah unit mobil honda jazz dari promo tri care dengan pin pemenanglm lnfo klik
Stopword Removal	 :  selamat nomor hadiah unit mobil honda jazz promo tri care pin pemenanglm lnfo klik


## STEMMING

In [21]:
!pip -q install sastrawi

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/209.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
# merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [23]:
# membuat function untuk stemming bahasa indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text

In [24]:
data_sample = data['Teks'].iloc[696]
case_folding = casefolding(data_sample)
stopword_removal = stopwordremoval(case_folding)
stemming_text = stemming(stopword_removal)

print('Data Sample\t\t : ', data_sample)
print('Case Folding\t\t : ', case_folding)
print('Stopword Removal\t : ', stopword_removal)
print('Stemming\t\t : ', stemming_text)

Data Sample		 :  Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding		 :  btw magicomnya yg sedang gais gaada yg gede
Stopword Removal	 :  magicomnya yg gais gaada yg gede
Stemming		 :  magicomnya yg gais gaada yg gede


## TEST PREPROCESSING PIPELINE

In [25]:
# membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
  text = casefolding(text)
  text = textnormalize(text)
  text = stopwordremoval(text)
  text = stemming(text)
  return text

In [26]:
%%time
data['clean_teks'] = data['Teks'].apply(text_preprocessing_process)

CPU times: user 6min 46s, sys: 1.27 s, total: 6min 47s
Wall time: 7min 2s


In [27]:
data

Unnamed: 0,Teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash gb my app extra kuota g...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,gb rupiah ribu spesial pilih aktif promo sd no...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,gb rupiah ribu spesial pilih aktif buru skb
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo oke umumin grup kelas
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,nulis kerudung kirain warna jins
1140,Mba mau kirim 300 ya,0,mbak kirim ya
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


In [28]:
# simpan data yang sudah dipreprocessing ke dalam file csv
data.to_csv('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/clean_data.csv')

## FEATURE ENGINEERING

In [29]:
# Memisahkan kolom feature dan target

x = data['clean_teks']
y = data['label']

In [30]:
x

Unnamed: 0,clean_teks
0,promo beli paket flash gb my app extra kuota g...
1,gb rupiah ribu spesial pilih aktif promo sd no...
2,langgan hormat sisa kuota flash kb download my...
3,langgan hormat sisa kuota flash kb download my...
4,gb rupiah ribu spesial pilih aktif buru skb
...,...
1138,yooo oke umumin grup kelas
1139,nulis kerudung kirain warna jins
1140,mbak kirim ya
1141,nama beaok bwrangkat pagimau cas atay tranfer


In [31]:
y

Unnamed: 0,label
0,2
1,2
2,2
3,2
4,2
...,...
1138,0
1139,0
1140,0
1141,0


# FEATURE EXTRACTION (TF-IDF dan N-Gram)

In [32]:
# save model
import pickle

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Uningram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform(x)

pickle.dump(vec_TF_IDF.vocabulary_, open('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/feature_tf-idf.sav', 'wb'))

In [33]:
# menampilkan vocabulary dari tf-idf
vec_TF_IDF.vocabulary_

{'promo': 2296,
 'beli': 323,
 'paket': 2089,
 'flash': 870,
 'gb': 942,
 'my': 1881,
 'app': 162,
 'extra': 841,
 'kuota': 1550,
 'lte': 1653,
 'telpon': 2878,
 'mnthr': 1832,
 'buru': 480,
 'cek': 521,
 'tselmemytsel': 3013,
 'sk': 2692,
 'rupiah': 2504,
 'ribu': 2453,
 'spesial': 2751,
 'pilih': 2176,
 'aktif': 66,
 'sd': 2558,
 'november': 1990,
 'langgan': 1578,
 'hormat': 1117,
 'sisa': 2685,
 'kb': 1411,
 'download': 752,
 'mytelkomsel': 1883,
 'apps': 167,
 'kuotabeli': 1551,
 'hubung': 1141,
 'skb': 2693,
 'ekstra': 804,
 'pulsa': 2333,
 'internet': 1221,
 'bulan': 466,
 'sjk': 2691,
 'augsept': 217,
 'detail': 665,
 'iring': 1243,
 'tarif': 2842,
 'panjang': 2102,
 'hits': 1106,
 'armada': 180,
 'curi': 600,
 'hati': 1070,
 'tekan': 2870,
 'okcall': 2042,
 'informasi': 1192,
 'eks': 801,
 'loh': 1640,
 'internetan': 1222,
 'pakai': 2087,
 'volume': 3129,
 'ultima': 3066,
 'mbhr': 1739,
 'harga': 1058,
 'tariflokasi': 2844,
 'tselmefl': 3011,
 'coboy': 568,
 'jr': 1330,
 'heba

In [34]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))

3252


In [35]:
# melihat feature apa saja yang ada didalam corpus

print(vec_TF_IDF.get_feature_names_out())

['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [36]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1, columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
data_tabular_tf_idf.iloc[10:20,60:70]

Unnamed: 0,akang,akangteteh,akbar,akreditasi,akses,aksi,aktif,aktifasi,aktivasi,aktivitas
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.149444,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.262305,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.244053,0.0,0.382416,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# FEATURE SELECTION

In [38]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [39]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k = 3000)
X_kbest_features = chi2_features.fit_transform(x_train, y_train)

# untuk reduce feature
print('Original Feature Number', x_train.shape[1])
print('Reduced Feature Number', X_kbest_features.shape[1])

Original Feature Number 3252
Reduced Feature Number 3000


In [40]:
Data = pd.DataFrame(chi2_features.scores_, columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.835608
1,0.419698
2,1.558607
3,0.716455
4,0.800674
...,...
3247,1.180239
3248,0.503162
3249,0.716455
3250,2.918739


In [41]:
# menampilkan data feature beserta nilainya

feature = vec_TF_IDF.get_feature_names_out()
feature
Data['fitur'] = feature
Data

Unnamed: 0,Nilai,fitur
0,0.835608,aa
1,0.419698,aamiiiin
2,1.558607,aamiin
3,0.716455,ab
4,0.800674,abadi
...,...,...
3247,1.180239,zalora
3248,0.503162,zarkasi
3249,0.716455,zjt
3250,2.918739,zona


In [42]:
# mengurutkan nilai feature terbaik

Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,fitur
2089,47.917892,paket
942,47.483452,gb
1031,46.429890,hadiah
1550,45.216656,kuota
2179,39.383773,pin
...,...,...
1521,0.044714,kopi
307,0.044468,bca
1695,0.031579,maksimal
531,0.013294,cepat


In [43]:
# menampilkan mask pada feature yang akan diseleksi
mask = chi2_features.get_support()
mask


array([ True,  True,  True, ...,  True,  True,  True])

In [44]:
# menampilkan fitur yang terpilih berdasarkan nilai mask dan nilai tertinggi yang sudah ditetapkan pada chi square
new_feature = []
for bool, f in zip(mask, feature):
  if bool:
    new_feature.append(f)
  selected_feature = new_feature
selected_feature

['aa',
 'aamiiiin',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acaratks',
 'account',
 'ada',
 'adapromo',
 'adi',
 'adik',
 'adison',
 'admin',
 'administrasi',
 'adminlte',
 'ado',
 'adrian',
 'adu',
 'aduh',
 'advertising',
 'aea',
 'aesthetic',
 'afbe',
 'affc',
 'afr',
 'afrika',
 'agam',
 'agen',
 'agendain',
 'agenpulsa',
 'ags',
 'agst',
 'agsts',
 'agt',
 'agtskinfodlj',
 'agua',
 'agun',
 'agus',
 'agust',
 'agustuskunjungi',
 'ahaha',
 'ahub',
 'aidzin',
 'aigoo',
 'air',
 'aja',
 'ajaa',
 'ajaaa',
 'ajabri',
 'ajak',
 'ajeng',
 'akang',
 'akangteteh',
 'akbar',
 'akreditasi',
 'akses',
 'aksi',
 'aktif',
 'aktifasi',
 'aktivasi',
 'aktivitas',
 'akucintaislam',
 'akumulasi',
 'akun',
 'akurasi',
 'akurat',
 'alaikum',
 'alaikumsaya',
 'alaiqum',
 'alam',
 'alamat',
 'alamsyah',
 'alesannya',
 'algoritma',
 'alhamdulillah',
 'alhuda',
 'ali',
 'aliando',
 'all',
 'allah',
 'allahaamiin',
 'alphard',
 'alquran',
 'alur',
 'aman',
 'amanda',
 'ambil',
 'amin',


In [45]:
# membuat vocabulary baru berdasarkan fitur yang terseleksi

new_selected_feature = {}
for k, v in vec_TF_IDF.vocabulary_.items():
  if k in selected_feature:
    new_selected_feature[k] = v
new_selected_feature

{'promo': 2296,
 'beli': 323,
 'paket': 2089,
 'flash': 870,
 'gb': 942,
 'my': 1881,
 'app': 162,
 'extra': 841,
 'kuota': 1550,
 'lte': 1653,
 'telpon': 2878,
 'mnthr': 1832,
 'buru': 480,
 'cek': 521,
 'tselmemytsel': 3013,
 'sk': 2692,
 'rupiah': 2504,
 'ribu': 2453,
 'spesial': 2751,
 'pilih': 2176,
 'aktif': 66,
 'sd': 2558,
 'november': 1990,
 'langgan': 1578,
 'hormat': 1117,
 'sisa': 2685,
 'kb': 1411,
 'download': 752,
 'mytelkomsel': 1883,
 'apps': 167,
 'kuotabeli': 1551,
 'hubung': 1141,
 'skb': 2693,
 'ekstra': 804,
 'pulsa': 2333,
 'internet': 1221,
 'bulan': 466,
 'sjk': 2691,
 'augsept': 217,
 'detail': 665,
 'iring': 1243,
 'tarif': 2842,
 'panjang': 2102,
 'hits': 1106,
 'armada': 180,
 'curi': 600,
 'hati': 1070,
 'tekan': 2870,
 'okcall': 2042,
 'informasi': 1192,
 'eks': 801,
 'loh': 1640,
 'internetan': 1222,
 'pakai': 2087,
 'volume': 3129,
 'ultima': 3066,
 'mbhr': 1739,
 'harga': 1058,
 'tariflokasi': 2844,
 'tselmefl': 3011,
 'coboy': 568,
 'jr': 1330,
 'baru

In [46]:
len(new_selected_feature)

3000

In [47]:
pickle.dump(new_selected_feature, open('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/new_selected_feature_tf-idf.sav', 'wb'))

In [48]:
# menampilkan fitur-fitur yang sudah diseleksi

data_selected_feature = pd.DataFrame(X_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,account,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# MODELING

In [49]:
selected_x = X_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [50]:
# import library
import random
from sklearn.model_selection import train_test_split

# import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB

In [51]:
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [52]:
# menampilkan jumlah data training dan data testing
print('Banyaknya X_train : ', len(x_train))
print('Banyaknya y_train : ', len(y_train))
print('Banyaknya X_test : ', len(x_test))
print('Banyaknya y_test : ', len(y_test))

Banyaknya X_train :  914
Banyaknya y_train :  914
Banyaknya X_test :  229
Banyaknya y_test :  229


In [53]:
# proses training menggunakan naive bayes

text_algorithm = MultinomialNB()

In [54]:
model = text_algorithm.fit(x_train, y_train)

In [59]:
# membuat model prediksi

# data_input = ("promo beli paket flash gb my app extra kuota gb lte extra telpon mnthr buru cek tselmemytsel sk") # Data Promo
# data_input = ("nikah muda gagal nikah pacar eh putus alesannya sayang nyakitin pret jagain jodoh orang ah wkwk") # Data Normal
data_input = ("selamat nomor care pilih hadiah unit mobil toyota yaris pin menang andajf uinfo cek") # Data Fraud
data_input = text_preprocessing_process(data_input)

# load model(TF-IDF Vecotrizer)
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/new_selected_feature_tf-idf.sav', 'rb'))))
hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil == 0):
  s = "SMS Normal"
elif(hasil==1):
  s = "SMS Fraud"
else:
  s = "SMS Promo"
print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Fraud


## EVALUASI MODEL

In [61]:
# masukkan library yang dibutuhkan untuk proses testing

from sklearn.metrics import confusion_matrix, classification_report

predicted = model.predict(x_test)
CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       126
           1       0.91      0.89      0.90        66
           2       0.85      0.89      0.87        37

    accuracy                           0.92       229
   macro avg       0.90      0.91      0.91       229
weighted avg       0.92      0.92      0.92       229



In [62]:
# menyimpan model

pickle.dump(model, open('/content/drive/MyDrive/Colab Notebooks/Tugas Akhir Praktisi Text Mining/model_fraud.sav', 'wb'))