In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")



In [3]:
# Load the labelled SMS corpus; later cells read its 'teks' and 'label' columns.
data = pd.read_csv('dataset_sms_spam_v1.csv')
# Preview the first rows as a sanity check on the load.
data.head()

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2



Preprocessing


In [4]:
#casefolding
import re

def casefolding(text):
    """Lower-case ``text`` and strip digits, punctuation and outer whitespace.

    Digits and punctuation are deleted outright (not replaced by a space), so
    tokens that were separated only by punctuation end up fused together.
    """
    lowered = text.lower()
    # Delete every run of digits.
    digits_removed = re.sub(r'\d+', '', lowered)
    # Delete anything that is neither a word character nor whitespace.
    punctuation_removed = re.sub(r'[^\w\s]', '', digits_removed)
    return punctuation_removed.strip()

In [5]:
# Demo: apply case folding to one sample message and print it next to the raw text.
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)

print('Raw data\t :',raw_sample)
print('Case folding\t :',case_folding)


Raw data	 : 2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case folding	 : plg yth sisa kuota flash anda kb download mytelkomsel apps di httptselmetsel utk cek kuotabeli paket flash atau hub


In [6]:
# Lookup table for word normalisation: text_normalize matches tokens against its
# 'singkat' column and substitutes the value from its 'hasil' column.
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text, norm_table=None):
    """Replace abbreviations/slang in ``text`` via a lookup table, then lower-case.

    Args:
        text: Whitespace-separated string to normalise.
        norm_table: Optional DataFrame with a ``singkat`` column (token to
            match) and a ``hasil`` column (replacement). When omitted, the
            module-level ``key_norm`` table is used.

    Returns:
        The text with every token that exactly equals a ``singkat`` entry
        replaced by the corresponding ``hasil`` value, joined with single
        spaces and lower-cased. Tokens without a match are kept unchanged.
    """
    table = key_norm if norm_table is None else norm_table
    # Build a hash lookup once per call instead of scanning the whole table
    # twice for every token. Feeding the rows in reverse lets earlier rows
    # overwrite later ones, so the first matching row wins, exactly like the
    # former `.values[0]` lookup.
    lookup = dict(zip(reversed(table['singkat'].tolist()),
                      reversed(table['hasil'].tolist())))
    normalized = ' '.join(lookup.get(word, word) for word in text.split())
    return normalized.lower()

In [7]:
# Demo: normalise one raw message (no case folding applied first) and print
# it next to the original text.
raw_data = data['teks'].iloc[696]
word_normal = text_normalize(raw_data)

print('Raw data\t :',raw_data)
print('word normalize\t :',word_normal)

Raw data	 : Btw magicomnya yg sedang Gais, gaada yg gede
word normalize	 : btw magicomnya yang sedang gais, tidak ada yang besar


In [8]:
#filtering
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Indonesian stop-word list taken from the NLTK stopwords corpus.
stop_factory = stopwords.words('indonesian')
# Extra tokens to drop in addition to the NLTK list.
more_stopword = ['daring', 'online', 'pd', 'btw']

In [10]:
# Combined stop-word list consulted by remove_stop_words.
stopwords_ind = stop_factory + more_stopword

In [11]:
def remove_stop_words(text):
    """Drop every token of ``text`` that appears in ``stopwords_ind``.

    Args:
        text: Whitespace-separated string.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        no token is left).
    """
    # Hash the stop words once so each token costs an O(1) lookup instead of
    # a linear scan through the list.
    stop_set = set(stopwords_ind)
    return " ".join(word for word in text.split() if word not in stop_set)

In [12]:
# Demo: case folding followed by stop-word removal on one sample message.
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)

stop_removal = remove_stop_words(case_folding)

# Print this cell's own `raw_sample`; the cell used to print `raw_data`, a
# variable left over from an earlier cell, so it only worked by execution order.
print('Raw data\t\t :',raw_sample)
print('case folding\t\t :',case_folding)
print('stopword removal\t\t :',stop_removal)

Raw data		 : Btw magicomnya yg sedang Gais, gaada yg gede
case folding		 : btw magicomnya yg sedang gais gaada yg gede
stopword removal		 : magicomnya yg gais gaada yg gede


# Stemming


In [13]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create one Sastrawi stemmer instance that the stemming() helper reuses.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
    """Return ``text`` after passing it through the shared ``stemmer``."""
    return stemmer.stem(text)

In [14]:
# Demo: case folding, stop-word removal and stemming on one sample message.
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)

stop_removal = remove_stop_words(case_folding)
text_stem = stemming(stop_removal)

# Print this cell's own `raw_sample`; the cell used to print `raw_data`, a
# variable left over from an earlier cell, so it only worked by execution order.
print('Raw data\t\t :',raw_sample)
print('case folding\t\t :',case_folding)
print('stopword removal\t\t :',stop_removal)
print('text stemming\t\t :',text_stem)

Raw data		 : Btw magicomnya yg sedang Gais, gaada yg gede
case folding		 : btw magicomnya yg sedang gais gaada yg gede
stopword removal		 : magicomnya yg gais gaada yg gede
text stemming		 : magicomnya yg gais gaada yg gede


In [15]:
# Text preprocessing pipeline.
def data_process(text):
    """Clean one message by running every preprocessing stage in order.

    Stages: case folding, word normalisation, stop-word removal, stemming.
    Each stage receives the previous stage's output.
    """
    for stage in (casefolding, text_normalize, remove_stop_words, stemming):
        text = stage(text)
    return text

In [16]:
%%time
# Run the full preprocessing pipeline on every message and store the result in
# a new 'clean_teks' column (this adds a column to `data` in place).
data['clean_teks'] = data['teks'].apply(data_process)

CPU times: total: 1min 41s
Wall time: 5min 14s


In [17]:
data

Unnamed: 0,teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash gb my telkomsel app ext...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,gb rupiah ribu spesial pilih aktif promo sd no...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,gb rupiah ribu spesial pilih aktif buru skb
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo oke umumin grup kelas
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,nulis kerudung kirain warna jins
1140,Mba mau kirim 300 ya,0,mbak kirim ya
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


In [18]:
# Persist the frame, now including the clean_teks column, as a UTF-8 CSV file.
data.to_csv('clean_data.csv', encoding='utf-8')

Feature Extraction


In [19]:

# Model inputs: the cleaned message text (x) and the class label (y).
x = data['clean_teks']
y = data['label']

In [20]:
x

0       promo beli paket flash gb my telkomsel app ext...
1       gb rupiah ribu spesial pilih aktif promo sd no...
2       langgan hormat sisa kuota flash kb download my...
3       langgan hormat sisa kuota flash kb download my...
4             gb rupiah ribu spesial pilih aktif buru skb
                              ...                        
1138                           yooo oke umumin grup kelas
1139                     nulis kerudung kirain warna jins
1140                                        mbak kirim ya
1141        nama beaok bwrangkat pagimau cas atay tranfer
1142                                       nomor bri nama
Name: clean_teks, Length: 1143, dtype: object

In [21]:
y

0       2
1       2
2       2
3       2
4       2
       ..
1138    0
1139    0
1140    0
1141    0
1142    0
Name: label, Length: 1143, dtype: int64

In [22]:
# Used to persist the fitted vocabulary to disk.
import pickle

# TF-IDF feature extraction.
from sklearn.feature_extraction.text import TfidfVectorizer


# Unigram features only.
vec_tfidf = TfidfVectorizer(ngram_range=(1,1))

# Learn the vocabulary/IDF weights and vectorise the corpus in a single pass
# (equivalent to calling fit followed by transform on the same data).
x_tf_idf = vec_tfidf.fit_transform(x)

# The context manager guarantees the file is flushed and closed; the handle
# was previously opened inline and never closed.
# NOTE: only the term -> index mapping is written here, not the IDF weights.
with open("feature_tfidf.sav", "wb") as vocab_file:
    pickle.dump(vec_tfidf.vocabulary_, vocab_file)


In [23]:
# Show the fitted TF-IDF vocabulary (term -> column index).
vec_tfidf.vocabulary_

{'promo': 2342,
 'beli': 323,
 'paket': 2136,
 'flash': 873,
 'gb': 945,
 'my': 1929,
 'telkomsel': 2923,
 'app': 162,
 'extra': 843,
 'kuota': 1598,
 'lte': 1701,
 'telpon': 2926,
 'mnthr': 1880,
 'buru': 480,
 'cek': 521,
 'tselmemytsel': 3062,
 'sk': 2739,
 'rupiah': 2551,
 'ribu': 2500,
 'spesial': 2798,
 'pilih': 2222,
 'aktif': 66,
 'sd': 2605,
 'november': 2038,
 'langgan': 1626,
 'hormat': 1120,
 'sisa': 2732,
 'kb': 1446,
 'download': 754,
 'mytelkomsel': 1931,
 'apps': 167,
 'httptselmetsel': 1159,
 'kuotabeli': 1599,
 'hubung': 1172,
 'skb': 2740,
 'ekstra': 806,
 'pulsa': 2379,
 'rb': 2439,
 'internet': 1254,
 'bulan': 466,
 'sjk': 2738,
 'augsept': 217,
 'detail': 665,
 'iring': 1276,
 'tarif': 2889,
 'panjang': 2149,
 'hits': 1109,
 'armada': 180,
 'curi': 600,
 'hati': 1073,
 'tekan': 2917,
 'okcall': 2090,
 'informasi': 1224,
 'eks': 803,
 'loh': 1688,
 'internetan': 1255,
 'pakai': 2134,
 'volume': 3178,
 'ultima': 3115,
 'mbhr': 1787,
 'harga': 1061,
 'tariflokasi': 2

In [24]:
# Show the number of TF-IDF features.
print(len(vec_tfidf.get_feature_names_out()))

3446


In [25]:
# Show which features (terms) exist in the corpus.
print(vec_tfidf.get_feature_names_out())

['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [26]:
# Convert the TF-IDF matrix to a dense array and label each column with its
# term so the weights can be inspected as a table.
x1 = vec_tfidf.transform(x).toarray()
data_tabular_tfidf = pd.DataFrame(x1, columns=vec_tfidf.get_feature_names_out())
data_tabular_tfidf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
data_tabular_tfidf.iloc[10:20,60:70]

Unnamed: 0,akang,akangteteh,akbar,akreditasi,akses,aksi,aktif,aktifasi,aktivasi,aktivitas
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.14944,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.262305,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.244053,0.0,0.382416,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Inputs for feature selection.
# NOTE(review): despite the names, these hold the full data set; the actual
# train/test split reassigns x_train and y_train in a later cell.
x_train = np.array(data_tabular_tfidf)
y_train = np.array(y)


In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the 3000 best features according to the chi-square score function.
# NOTE(review): the selector is fit on every row, before the train/test split
# done in a later cell, so the held-out rows influence which features are kept.
chi2_features = SelectKBest(chi2, k= 3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# Report the feature count before and after the reduction.
print('original feature number', x_train.shape[1])
print('reduced feature number', x_kbest_features.shape[1])




original feature number 3446
reduced feature number 3000


In [30]:
# Tabulate the selector's score for each feature in a single 'nilai' column.
data1 = pd.DataFrame(chi2_features.scores_, columns=['nilai'])
data1

Unnamed: 0,nilai
0,0.843018
1,0.419698
2,1.558607
3,0.686416
4,0.759870
...,...
3441,1.126664
3442,0.503012
3443,0.686416
3444,2.918687


In [31]:
# Show each feature name next to its score.

feature = vec_tfidf.get_feature_names_out()

data1['Fitur'] = feature
data1

Unnamed: 0,nilai,Fitur
0,0.843018,aa
1,0.419698,aamiiiin
2,1.558607,aamiin
3,0.686416,ab
4,0.759870,abadi
...,...,...
3441,1.126664,zalora
3442,0.503012,zarkasi
3443,0.686416,zjt
3444,2.918687,zona


In [32]:
# Rank the features from highest to lowest score.
data1.sort_values(by='nilai', ascending=False)

Unnamed: 0,nilai,Fitur
2136,47.432958,paket
945,46.945223,gb
1598,44.003212,kuota
1034,43.567734,hadiah
2225,36.928316,pin
...,...,...
1567,0.044714,kopi
307,0.044468,bca
1743,0.031575,maksimal
3170,0.010568,via


In [33]:
# Boolean support mask from the selector; used below to pick the names of the
# features that were kept.
mask = chi2_features.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [34]:
# Keep the feature names whose position is flagged True in the support mask.
# A comprehension keeps the loop variables local; the former loop named its
# variable `bool`, which shadowed the builtin for the rest of the session.
new_feature = [name for keep, name in zip(mask, feature) if keep]
# Bind the alias once, after selection, so it exists even for empty input.
selected_feature = new_feature
selected_feature

['aa',
 'aamiiiin',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acaratks',
 'account',
 'ada',
 'adapromo',
 'adi',
 'adik',
 'adison',
 'admin',
 'administrasi',
 'adminlte',
 'ado',
 'adrian',
 'adu',
 'aduh',
 'advertising',
 'aea',
 'aesthetic',
 'afbe',
 'affc',
 'afr',
 'afrika',
 'agam',
 'agen',
 'agendain',
 'agenpulsa',
 'ags',
 'agst',
 'agsts',
 'agt',
 'agtskinfodlj',
 'agua',
 'agun',
 'agus',
 'agust',
 'agustuskunjungi',
 'ahaha',
 'ahub',
 'aigoo',
 'air',
 'aja',
 'ajaa',
 'ajaaa',
 'ajabri',
 'ajak',
 'ajeng',
 'akang',
 'akbar',
 'akreditasi',
 'akses',
 'aksi',
 'aktif',
 'aktifasi',
 'aktivasi',
 'aktivitas',
 'akucintaislam',
 'akumulasi',
 'akun',
 'akurasi',
 'akurat',
 'alaikum',
 'alaikumsaya',
 'alaiqum',
 'alam',
 'alamat',
 'alamsyah',
 'alesannya',
 'algoritma',
 'alhamdulillah',
 'alhuda',
 'ali',
 'aliando',
 'all',
 'allah',
 'allahaamiin',
 'alphard',
 'alquran',
 'alur',
 'aman',
 'amanda',
 'ambil',
 'amin',
 'ampuun',
 'an',
 'anabdu

In [35]:
# Build a new vocabulary restricted to the selected features.

# Set membership is O(1); testing against the list rescanned it for every
# vocabulary term.
selected_lookup = set(selected_feature)

# The stored values are the column indices of the original, unreduced TF-IDF
# vocabulary, not positions in the reduced matrix.
new_selected_feature = {
    term: index
    for term, index in vec_tfidf.vocabulary_.items()
    if term in selected_lookup
}

new_selected_feature

{'promo': 2342,
 'beli': 323,
 'paket': 2136,
 'flash': 873,
 'gb': 945,
 'my': 1929,
 'telkomsel': 2923,
 'app': 162,
 'extra': 843,
 'kuota': 1598,
 'lte': 1701,
 'telpon': 2926,
 'mnthr': 1880,
 'buru': 480,
 'cek': 521,
 'tselmemytsel': 3062,
 'sk': 2739,
 'rupiah': 2551,
 'ribu': 2500,
 'spesial': 2798,
 'pilih': 2222,
 'aktif': 66,
 'sd': 2605,
 'november': 2038,
 'langgan': 1626,
 'hormat': 1120,
 'sisa': 2732,
 'kb': 1446,
 'download': 754,
 'mytelkomsel': 1931,
 'apps': 167,
 'httptselmetsel': 1159,
 'kuotabeli': 1599,
 'hubung': 1172,
 'skb': 2740,
 'ekstra': 806,
 'pulsa': 2379,
 'rb': 2439,
 'internet': 1254,
 'bulan': 466,
 'sjk': 2738,
 'augsept': 217,
 'detail': 665,
 'iring': 1276,
 'tarif': 2889,
 'panjang': 2149,
 'hits': 1109,
 'armada': 180,
 'curi': 600,
 'hati': 1073,
 'tekan': 2917,
 'okcall': 2090,
 'informasi': 1224,
 'eks': 803,
 'loh': 1688,
 'internetan': 1255,
 'pakai': 2134,
 'volume': 3178,
 'ultima': 3115,
 'mbhr': 1787,
 'harga': 1061,
 'tariflokasi': 2

In [36]:
len(new_selected_feature)

3000

In [37]:
# Persist the reduced vocabulary; the context manager guarantees the file is
# flushed and closed (the handle was previously opened inline and never closed).
with open("new_selected_feature_tfidf.sav", "wb") as vocab_file:
    pickle.dump(new_selected_feature, vocab_file)

In [38]:
# Show the reduced feature matrix with the selected feature names as column labels.
data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,account,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Alias for the reduced feature matrix used as model input below.
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [40]:
import random
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import  MultinomialNB

In [41]:
# NOTE: x and y are rebound here; x previously held the cleaned text Series.
x = selected_x
y = data.label

# Hold out 20% of the rows for testing; random_state is fixed at 0.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [42]:
# Report the number of rows in each partition of the split.
print('banyaknya x training = ', len(x_train))
print('banyaknya x test = ', len(x_test))
print('banyaknya y training = ', len(y_train))
print('banyaknya y test = ', len(y_test))

banyaknya x training =  914
banyaknya x test =  229
banyaknya y training =  914
banyaknya y test =  229


In [43]:
# Multinomial Naive Bayes classifier created with its default arguments.
text_algorithm = MultinomialNB()

In [44]:
# Fit the classifier on the training partition.
model = text_algorithm.fit(x_train, y_train)

In [48]:
# Predict the class of one new message.
raw_message = ("uang kirim rek bank bni an sdr abdul rahman nomor rek sms kirim terima kasih")
data_input = data_process(raw_message)

# Vectorise with the TF-IDF vectoriser that was fitted on the corpus, so the
# message gets the same IDF weights the classifier was trained on. The cell
# used to fit a brand-new TfidfVectorizer on this single message, which
# recomputes IDF from one document and mismatches the training features.
message_tfidf = vec_tfidf.transform([data_input]).toarray()
# Reduce to the columns kept by the chi-square selector used for training.
message_selected = chi2_features.transform(message_tfidf)
# NOTE(review): to predict outside this notebook session, the fitted
# vectoriser and selector must be persisted as well; a vocabulary alone does
# not carry the IDF weights.

hasil = model.predict(message_selected)

# predict returns one label per input row; compare the single label rather
# than the whole array.
predicted_label = hasil[0]
if predicted_label == 0:
    s = "SMS Normal"
elif predicted_label == 1:
    s = "SMS Fraud"
else:
    s = "SMS Promo"

print("hasil prediksi : \n", s)

hasil prediksi : 
 SMS Fraud


In [49]:
# Evaluate the model on the test partition.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)

# NOTE(review): CM is computed but not displayed anywhere in this cell.
CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96       126
           1       0.94      0.89      0.91        66
           2       0.87      0.89      0.88        37

    accuracy                           0.93       229
   macro avg       0.92      0.92      0.92       229
weighted avg       0.93      0.93      0.93       229



In [50]:
# Persist the trained classifier.

# The context manager guarantees the file is flushed and closed; the handle
# was previously opened inline and never closed.
with open("model_fraud.sav", "wb") as model_file:
    pickle.dump(model, model_file)