In [72]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from itertools import islice
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
import pickle

# Set random seed
np.random.seed(0)

In [73]:
#load training data
tweet_df = pd.read_csv("data_train.csv")
tweet_df.columns

Index(['Id', 'Tweet', 'following', 'followers', 'actions', 'is_retweet',
       'location', 'Type'],
      dtype='object')

In [74]:
#Melihat lima data teratas
tweet_df.head()

Unnamed: 0,Id,Tweet,following,followers,actions,is_retweet,location,Type
0,10091,It's the everything else that's complicated. #...,0.0,11500.0,,0.0,Chicago,Quality
1,10172,Eren sent a glare towards Mikasa then nodded a...,0.0,0.0,,0.0,,Quality
2,7012,I posted a new photo to Facebook http://fb.me/...,0.0,0.0,,0.0,"Scotland, U.K",Quality
3,3697,#jan Idiot Chelsea Handler Diagnoses Trump Wit...,3319.0,611.0,294.0,0.0,FBBIGBANG&2NE1TH,Spam
4,10740,Pedophile Anthony Weiner is TERRIFIED of Getti...,4840.0,1724.0,1522.0,0.0,www.instagram.com/fender,Spam


In [75]:
#Melihat isi dan jumlah dari 'Type'
tweet_df['Type'].value_counts()

Quality    6153
Spam       5815
Name: Type, dtype: int64

In [76]:
#Normalisasi
tweet_df['Type'].value_counts(normalize=True)

Quality    0.514121
Spam       0.485879
Name: Type, dtype: float64

In [77]:
#Melihat Statistik
tweet_df.describe()

Unnamed: 0,Id,following,followers,actions,is_retweet
count,11968.0,11823.0,11952.0,9195.0,11967.0
mean,6292.27373,4787.11,366876.3,7314.563893,0.223949
std,3632.510153,31582.89,3973314.0,16468.215132,0.416906
min,1.0,0.0,0.0,0.0,0.0
25%,3150.75,0.0,0.0,10.0,0.0
50%,6289.5,51.0,841.5,1487.0,0.0
75%,9442.25,3635.0,13287.75,7265.5,0.0
max,12598.0,1600000.0,105000000.0,165865.0,1.0


In [78]:
#Melihat Statistik non-angka
tweet_df.describe(exclude=[np.number])

Unnamed: 0,Tweet,location,Type
count,11968,10317,11968
unique,11787,2893,2
top,[HAPPY BIRTHDAY TAEYANG]\noriginally posted by...,United States,Quality
freq,10,2521,6153


In [69]:
#Mengubah tipe data pada "Type" menjadi angka (0 dan 1)
#Type = {'Spam': 1,'Quality': 0}
#tweet_df.Type = [Type[item] for item in tweet_df.Type]

In [79]:
labels = tweet_df.Type
labels.head()

0    Quality
1    Quality
2    Quality
3       Spam
4       Spam
Name: Type, dtype: object

In [80]:
#Membagi data latih menjadi data latih dan data validasi
X_train, X_valid, y_train, y_valid = train_test_split(tweet_df["Tweet"], tweet_df["Type"], test_size=0.2)

In [81]:
print("Training Data: {}, Validation: {}".format(len(X_train), len(X_valid)))

Training Data: 9574, Validation: 2394


In [82]:
#vectorizing data (mengubah kata-kata menjadi angka yang dapat diinterpretasikan oleh model)
tweet_df_v = CountVectorizer(max_features=5000, binary=True, stop_words="english")

In [83]:
#Mengimplementasikan proses vektorisasi pada data latih
tweet_df_v.fit(X_train)
X_train_v = tweet_df_v.transform(X_train)
X_valid_v = tweet_df_v.transform(X_valid)

In [84]:
#Melihat hasil vektorisasi
tweet_df_v.vocabulary_

{'idol': 2199,
 'singer': 4020,
 'successful': 4263,
 'fan': 1632,
 'sones': 4092,
 'proud': 3462,
 'fandom': 1634,
 'ijwd1stwin': 2210,
 'video': 4735,
 'watch': 4809,
 'goals': 1903,
 'helped': 2061,
 'jonathan': 2406,
 'receive': 3586,
 'http': 2161,
 'youtu': 4983,
 'everybody': 1558,
 'follow': 1747,
 'instagram': 2286,
 'amazing': 290,
 'photos': 3269,
 'user': 4689,
 'twitter': 4627,
 'com': 985,
 'understand': 4650,
 'cause': 814,
 'goes': 1906,
 'talented': 4343,
 'oscarhasnocolor': 3148,
 've': 4716,
 'seen': 3910,
 'people': 3239,
 'facebook': 1607,
 'going': 1907,
 'leave': 2562,
 'strange': 4228,
 'fence': 1675,
 'way': 4818,
 'right': 3742,
 'wing': 4874,
 'loved': 2687,
 'time': 4493,
 'crap': 1130,
 'lost': 2676,
 'activity': 183,
 'failed': 1615,
 'latest': 2529,
 'yoga': 4975,
 'day': 1221,
 'directive': 1337,
 'govt': 1931,
 'faces': 1608,
 'charges': 863,
 'agenda': 234,
 'daily': 1191,
 'news': 3025,
 'bit': 579,
 'ly': 2704,
 'want': 4785,
 'turn': 4615,
 'car': 7

In [85]:
#Memasukan hasil vektorisasi kedalam list
list(islice(tweet_df_v.vocabulary_.items(), 20))

[('idol', 2199),
 ('singer', 4020),
 ('successful', 4263),
 ('fan', 1632),
 ('sones', 4092),
 ('proud', 3462),
 ('fandom', 1634),
 ('ijwd1stwin', 2210),
 ('video', 4735),
 ('watch', 4809),
 ('goals', 1903),
 ('helped', 2061),
 ('jonathan', 2406),
 ('receive', 3586),
 ('http', 2161),
 ('youtu', 4983),
 ('everybody', 1558),
 ('follow', 1747),
 ('instagram', 2286),
 ('amazing', 290)]

In [86]:
#K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(X_train_v, y_train)

KNeighborsClassifier(n_neighbors=3, weights='distance')

In [87]:
#Menghitung nilai validitas antar data latih
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train_v, y_train)))
print('Accuracy of K-NN classifier on validation set: {:.2f}'.format(knn.score(X_valid_v, y_valid)))

Accuracy of K-NN classifier on training set: 1.00
Accuracy of K-NN classifier on validation set: 0.81


In [88]:
tweet_df_test = pd.read_csv("data_test.csv")
tweet_df_test.head()

Unnamed: 0,Id,Tweet,following,followers,actions,is_retweet,location
0,8536,Obama Criminal Enterprise Collapsing https://...,10.0,4,1214.0,0,UP THRU DERE!
1,5214,I only learned to dream in sound #love,63.0,55,508.0,0,"Johannesburg, South Africa"
2,7437,Cause I ain't trying to out here thinking you ...,0.0,0,0.0,0,
3,10009,When will they get that it's about #Liberty ? ...,2310.0,2736,32188.0,1,"Shreveport, LA"
4,4672,GM UAW workers to receive profit-sharing up to...,696.0,176,713.0,1,EVERYWHERE


In [89]:
# Melihat laporan confusion matrix dan classification (precision, recall, F1-score)
ytest = np.array(y_valid)

In [90]:
# Predict and calculate accuracy
y_pred=knn.predict(X_valid_v)
score=accuracy_score(ytest,y_pred)
print(f'Akurasi: {round(score*100,2)}%')

Akurasi: 80.95%


In [92]:
confusion_matrix(ytest,y_pred, labels=['Quality','Spam'])

array([[1150,   71],
       [ 385,  788]], dtype=int64)

In [93]:
#Membuat vectorizer
with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(tweet_df_v, f)

In [94]:
#Membuat model
with open('model_spam.pickle', 'wb') as f:
    pickle.dump(knn, f)