In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv('output.csv', encoding='latin1')
df

Unnamed: 0,Label,Text
0,ham,Subject: christmas tree farm pictures
1,ham,"Subject: vastar resources , inc ."
2,ham,"gary , production from the high island larger ..."
3,ham,"saturday at 2 : 00 p . m . at about 6 , 500 gr..."
4,ham,"10 , 000 gross for tomorrow . vastar owns 68 %..."
...,...,...
288268,spam,discounted software store
288269,spam,http : / / yielded . jetlow . com /
288270,spam,its never just a game when you ' re winning .
288271,spam,character is who you are when no one is looking .


In [3]:
df.rename(columns = {'Label': 'target', 'Text': 'text'}, inplace = True)
df

Unnamed: 0,target,text
0,ham,Subject: christmas tree farm pictures
1,ham,"Subject: vastar resources , inc ."
2,ham,"gary , production from the high island larger ..."
3,ham,"saturday at 2 : 00 p . m . at about 6 , 500 gr..."
4,ham,"10 , 000 gross for tomorrow . vastar owns 68 %..."
...,...,...
288268,spam,discounted software store
288269,spam,http : / / yielded . jetlow . com /
288270,spam,its never just a game when you ' re winning .
288271,spam,character is who you are when no one is looking .


In [4]:
#checking missing values
df.isnull().sum()

target    0
text      0
dtype: int64

In [5]:
#check duplicate values
df.duplicated().sum()

115488

In [6]:
#remove Duplicate
df = df.drop_duplicates(keep = 'first')

In [2]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Creating an instance of the Porter Stemmer
ps = PorterStemmer()

# Lowercase transformation and text preprocessing function
def transform_text(text):
    # Transform the text to lowercase
    text = text.lower()
    
    # Tokenization using NLTK
    text = nltk.word_tokenize(text)
    
    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
   
    text = y[:]
    y.clear()
    
    # Removing stop words and punctuation
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    
    text = y[:]
    y.clear()
    
    # Stemming using Porter Stemmer
    for i in text:
        y.append(ps.stem(i))
    
    # Join the processed tokens back into a single string
    return " ".join(y)

In [8]:
df['transformed_text'] = df['text'].apply(transform_text)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['transformed_text'] = df['text'].apply(transform_text)


Unnamed: 0,target,text,transformed_text
0,ham,Subject: christmas tree farm pictures,subject christma tree farm pictur
1,ham,"Subject: vastar resources , inc .",subject vastar resourc inc
2,ham,"gary , production from the high island larger ...",gari product high island larger block 1 2 commenc
3,ham,"saturday at 2 : 00 p . m . at about 6 , 500 gr...",saturday 2 00 p 6 500 gross carlo expect 9 500
4,ham,"10 , 000 gross for tomorrow . vastar owns 68 %...",10 000 gross tomorrow vastar own 68 gross product
...,...,...,...
288267,spam,Subject: microsoft autoroute 2005 dvd uk - $ 1...,subject microsoft autorout 2005 dvd uk 19 95
288268,spam,discounted software store,discount softwar store
288269,spam,http : / / yielded . jetlow . com /,http yield jetlow com
288270,spam,its never just a game when you ' re winning .,never game win


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features=3000)

In [10]:
X = tfid.fit_transform(df['transformed_text'])
y = df['target'].values

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

In [12]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [13]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()

In [14]:
svc.fit(X_train, y_train)
knc.fit(X_train, y_train)

In [15]:
from sklearn.metrics import accuracy_score, precision_score

y_predSVC = svc.predict(X_test)
y_predKNC = knc.predict(X_test)

In [16]:
accuracySVC = accuracy_score(y_test, y_predSVC)
print(f'Accuracy for SVC: {accuracySVC}')

Accuracy for SVC: 0.8035130364325607


In [17]:
accuracyKNC = accuracy_score(y_test, y_predKNC)
print(f'Accuracy for KNC: {accuracyKNC}')

Accuracy for KNC: 0.8022976531527621


In [18]:
inputtext = 'Claim your free iPhone X today! You ve been selected as one of our valued customers to receive this limited-time offer'
transformed_text = transform_text(inputtext)

In [19]:
vector_input = tfid.transform([transformed_text])

In [20]:
result = svc.predict(vector_input)[0]
result

'spam'

In [None]:
import pickle
pickle.dump(svc,open('svcmodel.pkl','wb'))
pickle.dump(tfid,open('vectorizer.pkl','wb'))