# Spam Text Classifier

In [1]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Reading the data

In [2]:
text_data = pd.read_csv("data/spam.csv")
text_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
text_data.shape

(5572, 2)

## Data Preprocessing 
### Step 1 - Stemming

In [4]:
stemmer = PorterStemmer()
corpus = []

for i in range(len(text_data)):
    print("At sentence: {}".format(i))
    
    review = re.sub("[^a-zA-Z]", " ", text_data["Message"][i])
    review = review.lower()
    review = review.split()
    
    review = [stemmer.stem(word) for word in review if not word in set(stopwords.words("english"))]
    review = ' '.join(review)
    corpus.append(review)

At sentence: 0
At sentence: 1
At sentence: 2
At sentence: 3
At sentence: 4
At sentence: 5
At sentence: 6
At sentence: 7
At sentence: 8
At sentence: 9
At sentence: 10
At sentence: 11
At sentence: 12
At sentence: 13
At sentence: 14
At sentence: 15
At sentence: 16
At sentence: 17
At sentence: 18
At sentence: 19
At sentence: 20
At sentence: 21
At sentence: 22
At sentence: 23
At sentence: 24
At sentence: 25
At sentence: 26
At sentence: 27
At sentence: 28
At sentence: 29
At sentence: 30
At sentence: 31
At sentence: 32
At sentence: 33
At sentence: 34
At sentence: 35
At sentence: 36
At sentence: 37
At sentence: 38
At sentence: 39
At sentence: 40
At sentence: 41
At sentence: 42
At sentence: 43
At sentence: 44
At sentence: 45
At sentence: 46
At sentence: 47
At sentence: 48
At sentence: 49
At sentence: 50
At sentence: 51
At sentence: 52
At sentence: 53
At sentence: 54
At sentence: 55
At sentence: 56
At sentence: 57
At sentence: 58
At sentence: 59
At sentence: 60
At sentence: 61
At sentence: 62
At

At sentence: 512
At sentence: 513
At sentence: 514
At sentence: 515
At sentence: 516
At sentence: 517
At sentence: 518
At sentence: 519
At sentence: 520
At sentence: 521
At sentence: 522
At sentence: 523
At sentence: 524
At sentence: 525
At sentence: 526
At sentence: 527
At sentence: 528
At sentence: 529
At sentence: 530
At sentence: 531
At sentence: 532
At sentence: 533
At sentence: 534
At sentence: 535
At sentence: 536
At sentence: 537
At sentence: 538
At sentence: 539
At sentence: 540
At sentence: 541
At sentence: 542
At sentence: 543
At sentence: 544
At sentence: 545
At sentence: 546
At sentence: 547
At sentence: 548
At sentence: 549
At sentence: 550
At sentence: 551
At sentence: 552
At sentence: 553
At sentence: 554
At sentence: 555
At sentence: 556
At sentence: 557
At sentence: 558
At sentence: 559
At sentence: 560
At sentence: 561
At sentence: 562
At sentence: 563
At sentence: 564
At sentence: 565
At sentence: 566
At sentence: 567
At sentence: 568
At sentence: 569
At sentence: 5

At sentence: 1002
At sentence: 1003
At sentence: 1004
At sentence: 1005
At sentence: 1006
At sentence: 1007
At sentence: 1008
At sentence: 1009
At sentence: 1010
At sentence: 1011
At sentence: 1012
At sentence: 1013
At sentence: 1014
At sentence: 1015
At sentence: 1016
At sentence: 1017
At sentence: 1018
At sentence: 1019
At sentence: 1020
At sentence: 1021
At sentence: 1022
At sentence: 1023
At sentence: 1024
At sentence: 1025
At sentence: 1026
At sentence: 1027
At sentence: 1028
At sentence: 1029
At sentence: 1030
At sentence: 1031
At sentence: 1032
At sentence: 1033
At sentence: 1034
At sentence: 1035
At sentence: 1036
At sentence: 1037
At sentence: 1038
At sentence: 1039
At sentence: 1040
At sentence: 1041
At sentence: 1042
At sentence: 1043
At sentence: 1044
At sentence: 1045
At sentence: 1046
At sentence: 1047
At sentence: 1048
At sentence: 1049
At sentence: 1050
At sentence: 1051
At sentence: 1052
At sentence: 1053
At sentence: 1054
At sentence: 1055
At sentence: 1056
At sentenc

At sentence: 1463
At sentence: 1464
At sentence: 1465
At sentence: 1466
At sentence: 1467
At sentence: 1468
At sentence: 1469
At sentence: 1470
At sentence: 1471
At sentence: 1472
At sentence: 1473
At sentence: 1474
At sentence: 1475
At sentence: 1476
At sentence: 1477
At sentence: 1478
At sentence: 1479
At sentence: 1480
At sentence: 1481
At sentence: 1482
At sentence: 1483
At sentence: 1484
At sentence: 1485
At sentence: 1486
At sentence: 1487
At sentence: 1488
At sentence: 1489
At sentence: 1490
At sentence: 1491
At sentence: 1492
At sentence: 1493
At sentence: 1494
At sentence: 1495
At sentence: 1496
At sentence: 1497
At sentence: 1498
At sentence: 1499
At sentence: 1500
At sentence: 1501
At sentence: 1502
At sentence: 1503
At sentence: 1504
At sentence: 1505
At sentence: 1506
At sentence: 1507
At sentence: 1508
At sentence: 1509
At sentence: 1510
At sentence: 1511
At sentence: 1512
At sentence: 1513
At sentence: 1514
At sentence: 1515
At sentence: 1516
At sentence: 1517
At sentenc

At sentence: 1922
At sentence: 1923
At sentence: 1924
At sentence: 1925
At sentence: 1926
At sentence: 1927
At sentence: 1928
At sentence: 1929
At sentence: 1930
At sentence: 1931
At sentence: 1932
At sentence: 1933
At sentence: 1934
At sentence: 1935
At sentence: 1936
At sentence: 1937
At sentence: 1938
At sentence: 1939
At sentence: 1940
At sentence: 1941
At sentence: 1942
At sentence: 1943
At sentence: 1944
At sentence: 1945
At sentence: 1946
At sentence: 1947
At sentence: 1948
At sentence: 1949
At sentence: 1950
At sentence: 1951
At sentence: 1952
At sentence: 1953
At sentence: 1954
At sentence: 1955
At sentence: 1956
At sentence: 1957
At sentence: 1958
At sentence: 1959
At sentence: 1960
At sentence: 1961
At sentence: 1962
At sentence: 1963
At sentence: 1964
At sentence: 1965
At sentence: 1966
At sentence: 1967
At sentence: 1968
At sentence: 1969
At sentence: 1970
At sentence: 1971
At sentence: 1972
At sentence: 1973
At sentence: 1974
At sentence: 1975
At sentence: 1976
At sentenc

At sentence: 2393
At sentence: 2394
At sentence: 2395
At sentence: 2396
At sentence: 2397
At sentence: 2398
At sentence: 2399
At sentence: 2400
At sentence: 2401
At sentence: 2402
At sentence: 2403
At sentence: 2404
At sentence: 2405
At sentence: 2406
At sentence: 2407
At sentence: 2408
At sentence: 2409
At sentence: 2410
At sentence: 2411
At sentence: 2412
At sentence: 2413
At sentence: 2414
At sentence: 2415
At sentence: 2416
At sentence: 2417
At sentence: 2418
At sentence: 2419
At sentence: 2420
At sentence: 2421
At sentence: 2422
At sentence: 2423
At sentence: 2424
At sentence: 2425
At sentence: 2426
At sentence: 2427
At sentence: 2428
At sentence: 2429
At sentence: 2430
At sentence: 2431
At sentence: 2432
At sentence: 2433
At sentence: 2434
At sentence: 2435
At sentence: 2436
At sentence: 2437
At sentence: 2438
At sentence: 2439
At sentence: 2440
At sentence: 2441
At sentence: 2442
At sentence: 2443
At sentence: 2444
At sentence: 2445
At sentence: 2446
At sentence: 2447
At sentenc

At sentence: 2862
At sentence: 2863
At sentence: 2864
At sentence: 2865
At sentence: 2866
At sentence: 2867
At sentence: 2868
At sentence: 2869
At sentence: 2870
At sentence: 2871
At sentence: 2872
At sentence: 2873
At sentence: 2874
At sentence: 2875
At sentence: 2876
At sentence: 2877
At sentence: 2878
At sentence: 2879
At sentence: 2880
At sentence: 2881
At sentence: 2882
At sentence: 2883
At sentence: 2884
At sentence: 2885
At sentence: 2886
At sentence: 2887
At sentence: 2888
At sentence: 2889
At sentence: 2890
At sentence: 2891
At sentence: 2892
At sentence: 2893
At sentence: 2894
At sentence: 2895
At sentence: 2896
At sentence: 2897
At sentence: 2898
At sentence: 2899
At sentence: 2900
At sentence: 2901
At sentence: 2902
At sentence: 2903
At sentence: 2904
At sentence: 2905
At sentence: 2906
At sentence: 2907
At sentence: 2908
At sentence: 2909
At sentence: 2910
At sentence: 2911
At sentence: 2912
At sentence: 2913
At sentence: 2914
At sentence: 2915
At sentence: 2916
At sentenc

At sentence: 3337
At sentence: 3338
At sentence: 3339
At sentence: 3340
At sentence: 3341
At sentence: 3342
At sentence: 3343
At sentence: 3344
At sentence: 3345
At sentence: 3346
At sentence: 3347
At sentence: 3348
At sentence: 3349
At sentence: 3350
At sentence: 3351
At sentence: 3352
At sentence: 3353
At sentence: 3354
At sentence: 3355
At sentence: 3356
At sentence: 3357
At sentence: 3358
At sentence: 3359
At sentence: 3360
At sentence: 3361
At sentence: 3362
At sentence: 3363
At sentence: 3364
At sentence: 3365
At sentence: 3366
At sentence: 3367
At sentence: 3368
At sentence: 3369
At sentence: 3370
At sentence: 3371
At sentence: 3372
At sentence: 3373
At sentence: 3374
At sentence: 3375
At sentence: 3376
At sentence: 3377
At sentence: 3378
At sentence: 3379
At sentence: 3380
At sentence: 3381
At sentence: 3382
At sentence: 3383
At sentence: 3384
At sentence: 3385
At sentence: 3386
At sentence: 3387
At sentence: 3388
At sentence: 3389
At sentence: 3390
At sentence: 3391
At sentenc

At sentence: 3806
At sentence: 3807
At sentence: 3808
At sentence: 3809
At sentence: 3810
At sentence: 3811
At sentence: 3812
At sentence: 3813
At sentence: 3814
At sentence: 3815
At sentence: 3816
At sentence: 3817
At sentence: 3818
At sentence: 3819
At sentence: 3820
At sentence: 3821
At sentence: 3822
At sentence: 3823
At sentence: 3824
At sentence: 3825
At sentence: 3826
At sentence: 3827
At sentence: 3828
At sentence: 3829
At sentence: 3830
At sentence: 3831
At sentence: 3832
At sentence: 3833
At sentence: 3834
At sentence: 3835
At sentence: 3836
At sentence: 3837
At sentence: 3838
At sentence: 3839
At sentence: 3840
At sentence: 3841
At sentence: 3842
At sentence: 3843
At sentence: 3844
At sentence: 3845
At sentence: 3846
At sentence: 3847
At sentence: 3848
At sentence: 3849
At sentence: 3850
At sentence: 3851
At sentence: 3852
At sentence: 3853
At sentence: 3854
At sentence: 3855
At sentence: 3856
At sentence: 3857
At sentence: 3858
At sentence: 3859
At sentence: 3860
At sentenc

At sentence: 4282
At sentence: 4283
At sentence: 4284
At sentence: 4285
At sentence: 4286
At sentence: 4287
At sentence: 4288
At sentence: 4289
At sentence: 4290
At sentence: 4291
At sentence: 4292
At sentence: 4293
At sentence: 4294
At sentence: 4295
At sentence: 4296
At sentence: 4297
At sentence: 4298
At sentence: 4299
At sentence: 4300
At sentence: 4301
At sentence: 4302
At sentence: 4303
At sentence: 4304
At sentence: 4305
At sentence: 4306
At sentence: 4307
At sentence: 4308
At sentence: 4309
At sentence: 4310
At sentence: 4311
At sentence: 4312
At sentence: 4313
At sentence: 4314
At sentence: 4315
At sentence: 4316
At sentence: 4317
At sentence: 4318
At sentence: 4319
At sentence: 4320
At sentence: 4321
At sentence: 4322
At sentence: 4323
At sentence: 4324
At sentence: 4325
At sentence: 4326
At sentence: 4327
At sentence: 4328
At sentence: 4329
At sentence: 4330
At sentence: 4331
At sentence: 4332
At sentence: 4333
At sentence: 4334
At sentence: 4335
At sentence: 4336
At sentenc

At sentence: 4751
At sentence: 4752
At sentence: 4753
At sentence: 4754
At sentence: 4755
At sentence: 4756
At sentence: 4757
At sentence: 4758
At sentence: 4759
At sentence: 4760
At sentence: 4761
At sentence: 4762
At sentence: 4763
At sentence: 4764
At sentence: 4765
At sentence: 4766
At sentence: 4767
At sentence: 4768
At sentence: 4769
At sentence: 4770
At sentence: 4771
At sentence: 4772
At sentence: 4773
At sentence: 4774
At sentence: 4775
At sentence: 4776
At sentence: 4777
At sentence: 4778
At sentence: 4779
At sentence: 4780
At sentence: 4781
At sentence: 4782
At sentence: 4783
At sentence: 4784
At sentence: 4785
At sentence: 4786
At sentence: 4787
At sentence: 4788
At sentence: 4789
At sentence: 4790
At sentence: 4791
At sentence: 4792
At sentence: 4793
At sentence: 4794
At sentence: 4795
At sentence: 4796
At sentence: 4797
At sentence: 4798
At sentence: 4799
At sentence: 4800
At sentence: 4801
At sentence: 4802
At sentence: 4803
At sentence: 4804
At sentence: 4805
At sentenc

At sentence: 5218
At sentence: 5219
At sentence: 5220
At sentence: 5221
At sentence: 5222
At sentence: 5223
At sentence: 5224
At sentence: 5225
At sentence: 5226
At sentence: 5227
At sentence: 5228
At sentence: 5229
At sentence: 5230
At sentence: 5231
At sentence: 5232
At sentence: 5233
At sentence: 5234
At sentence: 5235
At sentence: 5236
At sentence: 5237
At sentence: 5238
At sentence: 5239
At sentence: 5240
At sentence: 5241
At sentence: 5242
At sentence: 5243
At sentence: 5244
At sentence: 5245
At sentence: 5246
At sentence: 5247
At sentence: 5248
At sentence: 5249
At sentence: 5250
At sentence: 5251
At sentence: 5252
At sentence: 5253
At sentence: 5254
At sentence: 5255
At sentence: 5256
At sentence: 5257
At sentence: 5258
At sentence: 5259
At sentence: 5260
At sentence: 5261
At sentence: 5262
At sentence: 5263
At sentence: 5264
At sentence: 5265
At sentence: 5266
At sentence: 5267
At sentence: 5268
At sentence: 5269
At sentence: 5270
At sentence: 5271
At sentence: 5272
At sentenc

In [5]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

### Step 2 - Data Preparation

In [7]:
# Dependent feature
y = pd.get_dummies(text_data["Category"])
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [8]:
# By dropping ham column, we can classify text based on spam or not spam
y = y.iloc[:, 1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [16]:
# Separating training and testing data
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(corpus).iloc[:, 0], y, 
                                                    test_size = 0.20, 
                                                    random_state = 42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

### Step 3 - TF-IDF for sentence vectors

In [17]:
# We limit the max_features to 2500 because we dont want words that are insignificant for prediction
# So we are selecting 2500 frequent words
vectorizer = CountVectorizer(max_features = 2500)

In [18]:
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

X_train.shape, X_test.shape

((4457, 2500), (1115, 2500))

## Model Training

In [19]:
model = MultinomialNB()

# Fitting the model
model.fit(X_train, y_train)

In [20]:
# Predictions
y_preds = model.predict(X_test)
y_preds

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [21]:
confusion_matrix(y_test, y_preds)

array([[956,  10],
       [  9, 140]], dtype=int64)

In [22]:
score = accuracy_score(y_test, y_preds)
score

0.9829596412556054