In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# word_tokenize() needs the Punkt tokenizer data in addition to the stopword
# list (recent NLTK releases look for it under the name 'punkt_tab').  Fetch
# everything up front so the notebook also runs on a machine that has no NLTK
# data installed yet; already-present packages are simply reported as current.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Preprocessing steps
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stopword list for ``language`` as a frozenset, loading it once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Normalise a raw message into a space-separated string of content words.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, tokenize with NLTK's ``word_tokenize``, drop English
    stopwords, and join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw message text.  Any non-string value (``None``, float ``NaN``
        from a missing CSV cell, numbers, ...) is treated as an empty message.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input or when a
        ``UnicodeDecodeError`` is raised during processing.
    """
    # Missing cells read by pandas arrive as float NaN (or None); calling
    # .lower() on them would raise AttributeError, so map them to ''.
    if not isinstance(text, str):
        return ''

    try:
        # Lowercase, then keep letters and whitespace only.  Digits are
        # already excluded by this pattern, so no separate \d pass is needed.
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

        # Tokenize and filter against the cached stopword set (the set is
        # loaded from disk once instead of on every call).
        stop_words = _get_stop_words('english')
        tokens = [word for word in word_tokenize(text) if word not in stop_words]

        return ' '.join(tokens)
    except UnicodeDecodeError:
        # Preserved from the original implementation; presumably guards
        # against decoding problems while NLTK reads its data files.
        # TODO confirm this is still needed.
        return ''


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Read the raw CSV, give the two columns of interest readable names, keep only
# those columns, and replace the message text with its preprocessed form.
new_column_names = {'v1': 'label', 'v2': 'text'}
required_columns = ['label', 'text']
data = (
    pd.read_csv('spam.csv')
    .rename(columns=new_column_names)
    .loc[:, required_columns]
    .assign(text=lambda frame: frame['text'].apply(preprocess_text))
)

data

Unnamed: 0,label,text
0,spam,get ripped abs days revolutionary fitness program
1,spam,limited time offer buy one get one free visit ...
2,spam,urgent account compromised please update passw...
3,ham,go jurong point crazy available bugis n great ...
4,ham,ok lar joking wif u oni
...,...,...
5570,spam,nd time tried contact u u pound prize claim ea...
5571,ham,b going esplanade fr home
5572,ham,pity mood soany suggestions
5573,ham,guy bitching acted like id interested buying s...


In [5]:
# Features are the cleaned message text; targets are the spam/ham labels.
X, y = data['text'], data['label']

# Hold out 20% of the messages for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the bag-of-words vocabulary from the training messages only and
# encode them as token counts.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

In [6]:
# Fit a support-vector classifier (scikit-learn defaults) on the count features.
classifier = SVC()
classifier.fit(X=X_train_vectorized, y=y_train)

In [7]:
# Encode the held-out messages with the vocabulary learned during training
# (transform only, no re-fitting), then predict a label for each of them.
X_test_vectorized = vectorizer.transform(X_test)
predictions = classifier.predict(X_test_vectorized)

In [8]:
# Share of held-out messages whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9766816143497757


In [9]:
import pickle 

# Persist the trained classifier.
with open('model.pkl', 'wb') as file:
    pickle.dump(classifier, file)

# The classifier only accepts vectors built from the fitted vocabulary, so the
# CountVectorizer has to be saved too; without it model.pkl cannot be used
# from a fresh session.
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

In [10]:

# Read the pickled classifier back from disk.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that were produced by a trusted source (here: this notebook).
with open('model.pkl', mode='rb') as file:
    model = pickle.load(file)

# Function to classify text
def classify_text(text):
    """Return the predicted label for one raw text message.

    The message is cleaned with ``preprocess_text``, encoded by the
    module-level ``vectorizer``, and passed to the module-level ``model``.
    The first (and only) element of the resulting prediction array is
    returned.
    """
    cleaned = preprocess_text(text)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Demo: run one sample message through the classifier and show both the
# input and the label it was assigned.
text = "Limited time offer! Buy one, get one free. Visit our website for more details."
prediction = classify_text(text)
print("Text:", text)
print("Prediction:", prediction)


Text: Limited time offer! Buy one, get one free. Visit our website for more details.
Prediction: spam
