In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can read files stored there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show the current working directory of the runtime.
!pwd

/content


In [None]:
# List the contents of the current working directory.
!ls


drive  sample_data  spam.csv


In [None]:
import pandas as pd
# Read the SMS dataset from the mounted Drive, keep only the two columns used in this
# notebook, and give them descriptive names (v1 -> label, v2 -> text).
df = (
    pd.read_csv('/content/drive/MyDrive/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
print("Data loaded! First 5 rows:", df.head(), sep="\n")

Data loaded! First 5 rows:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
!pip install nltk




In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stop-word corpus; stopwords.words() below reads from it.
nltk.download('stopwords')
# Single Porter stemmer instance, reused by clean_text for every token.
stemmer = PorterStemmer()
# English stop words held in a set, since clean_text does a membership test per token.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one SMS message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the result and split it on whitespace.
      3. Drop tokens found in the module-level ``stop_words`` set.
      4. Stem each remaining token with the module-level ``stemmer``.

    Args:
        text: The raw message as a string.

    Returns:
        The surviving stems joined by single spaces (an empty string if none survive).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    stems = []
    for token in letters_only.lower().split():
        # Stop words are filtered before stemming, so the check uses the raw token.
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

# Derive the cleaned version of every message and store it next to the original text.
df['clean_text'] = [clean_text(message) for message in df['text']]

print("\nText cleaned! Example:")
# Row label 0 is the first SMS in the frame.
print(df.loc[0, 'clean_text'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



Text cleaned! Example:
go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [None]:
# Preview the first rows, now including the derived clean_text column.
df.head()

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [None]:
# Print the first two messages before and after cleaning for a quick visual comparison.
for heading, column in (("Original:\n", 'text'), ("\nCleaned:\n", 'clean_text')):
    print(heading, df[column].head(2))

Original:
 0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
Name: text, dtype: object

Cleaned:
 0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
Name: clean_text, dtype: object


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer, capping the vocabulary at 5000 terms so the feature
# matrix stays a manageable size.
tfidf = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and weights from the cleaned messages and encode them.
# The sparse result is converted to a dense array because the following cells index X
# as a plain NumPy array.
# NOTE(review): the vectorizer is fitted on all messages before the train/test split,
# so test-set vocabulary statistics influence the features - confirm this is acceptable.
sparse_scores = tfidf.fit_transform(df['clean_text'])
X = sparse_scores.toarray()

# Rows are messages, columns are vocabulary terms.
print("Shape of X:", X.shape)

Shape of X: (5572, 5000)


In [None]:
# Show which vocabulary terms carry weight in the first message.
words = tfidf.get_feature_names_out()
first_message_scores = X[0]

# Only terms with a positive TF-IDF score are printed.
present = first_message_scores > 0
for word, score in zip(words[present], first_message_scores[present]):
    print(f"{word}: {score:.3f}")

avail: 0.282
buffet: 0.359
bugi: 0.318
cine: 0.318
crazi: 0.291
go: 0.150
got: 0.176
great: 0.208
jurong: 0.376
la: 0.305
point: 0.257
wat: 0.208
world: 0.252


In [None]:
# Encode the string labels as integers for scikit-learn: ham -> 0, spam -> 1.
y = df['label'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value missing from the dict. Fail here with a clear
# message instead of letting NaN targets reach the model-fitting step.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values (expected 'ham' or 'spam'): {unexpected}")

print("Shape of y:", y.shape)
print("First 5 labels:", y[:5].values)

Shape of y: (5572,)
First 5 labels: [0 0 1 0 0]


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so a Restart & Run All reproduces the same split (and
# therefore the same metrics). stratify=y keeps the ham/spam proportion the same in the
# train and test parts, which matters because spam is the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier (default hyperparameters) on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)
model  # last expression, so the notebook still displays the fitted estimator

In [None]:
from sklearn.metrics import accuracy_score, classification_report


# Predict once on the held-out split and reuse the predictions for both metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2%}")
# Per-class precision/recall/F1; class 0 is ham and class 1 is spam.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 96.05%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       963
           1       0.98      0.72      0.83       152

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [None]:
def predict_spam(sms):
    """Classify a single SMS message as "Spam" or "Ham".

    The message goes through the same pipeline used for training: ``clean_text``,
    then the already-fitted ``tfidf`` vectorizer, then the trained ``model``.

    Args:
        sms: The raw message text.

    Returns:
        "Spam" if the model predicts class 1, otherwise "Ham".
    """
    # transform() expects an iterable of documents, hence the one-element list.
    features = tfidf.transform([clean_text(sms)]).toarray()
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Spam"
    return "Ham"


# Smoke-test the classifier on one spam-like and one ordinary message.
for sample_sms in ("WINNER!! Claim your free prize now!", "Hey, can we meet tomorrow?"):
    print(predict_spam(sample_sms))

Spam
Ham
