In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [6]:
np.random.seed(500)

In [11]:
df = pd.read_csv("./data/spam.csv", delimiter=',', header=None)

In [12]:
df = df.sample(frac=1)


In [15]:
df[1] = [word_tokenize(entry) for entry in df[1]]

In [16]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [17]:
for index,entry in enumerate(df[1]):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [18]:
df['text_final']

4171              ['S', 'first', 'time', 'dhoni', 'rock']
721     ['Really', 'dun', 'bluff', 'leh', 'U', 'sleep'...
1100    ['Stop', 'call', 'everyone', 'say', 'I', 'migh...
2952    ['Final', 'Chance', 'Claim', 'ur', 'worth', 'd...
4093          ['I', 'dont', 'thnk', 'wrong', 'call', 'u']
                              ...                        
2171                       ['I', 'take', 'exam', 'march']
2915                      ['Hey', 'angry', 'Reply', 'dr']
658                                             ['ERROR']
475     ['This', 'time', 'try', 'contact', 'U', 'Pound...
1975    ['No', 'I', 'boat', 'Still', 'mom', 'Check', '...
Name: text_final, Length: 5572, dtype: object

In [19]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['text_final'],df[0],test_size=0.2)

In [21]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [22]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

SVC(gamma='auto', kernel='linear')

In [23]:
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  87.17488789237669
