In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/MyDrive/SPAM_SMS.csv'
df = pd.read_csv(file_path)

In [None]:
print("Data Head:")
print(df.head())

Data Head:
  masked_celphone_number                hashed_celphone_number  \
0          +63929****732  67a65d11-9add-3cfc-93e7-f6e5a7ef76e5   
1               SmartNas  ada1411e-7c6e-37d8-bff9-fcd70b4dde30   
2            AGRIBANK PH  04cbdae3-8826-3625-8f0a-e3a8ed51d3db   
3               SmartNas  ada1411e-7c6e-37d8-bff9-fcd70b4dde30   
4               SmartNas  ada1411e-7c6e-37d8-bff9-fcd70b4dde30   

                      date                                               text  \
0  2025-08-27 09:28:23.142  [FEX] Your parcel P6115DDCDD7BA couldn't reach...   
1  2025-08-19 08:57:21.957  Smart reminds you:Your Reward Points (6,309) e...   
2  2025-08-12 22:16:30.612  Your Landbank iAccess account is about to be d...   
3  2025-08-10 10:04:33.960  Smart reminds you:Your Reward Points (6,309) e...   
4  2025-08-09 05:25:40.334  Smart reminds you:Your Reward Points (6,309) e...   

     carrier  
0      Smart  
1  <unknown>  
2  <unknown>  
3  <unknown>  
4  <unknown>  


In [None]:
print("\nDataFrame Info:")
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   masked_celphone_number  1014 non-null   object
 1   hashed_celphone_number  1014 non-null   object
 2   date                    1014 non-null   object
 3   text                    1014 non-null   object
 4   carrier                 1014 non-null   object
dtypes: object(5)
memory usage: 39.7+ KB


In [None]:
print("\nDescriptive Statistics:")
print(df.describe())


Descriptive Statistics:
       masked_celphone_number                hashed_celphone_number  \
count                    1014                                  1014   
unique                    903                                   953   
top                       BDO  ec6149ca-baa1-3982-b6de-970a695330f5   
freq                       21                                    21   

                           date                        text carrier  
count                      1014                        1014    1014  
unique                      993                         938      10  
top     2025-06-03 10:31:04.251  <<Content not supported.>>     TNT  
freq                          2                          13     348  


In [None]:
print("\nMissing Values per Column:")
print(df.isnull().sum())


Missing Values per Column:
masked_celphone_number    0
hashed_celphone_number    0
date                      0
text                      0
carrier                   0
dtype: int64


In [None]:
df.drop(columns=['masked_celphone_number', 'hashed_celphone_number', 'date'], inplace=True)
print(df.head())

                                                text    carrier
0  [FEX] Your parcel P6115DDCDD7BA couldn't reach...      Smart
1  Smart reminds you:Your Reward Points (6,309) e...  <unknown>
2  Your Landbank iAccess account is about to be d...  <unknown>
3  Smart reminds you:Your Reward Points (6,309) e...  <unknown>
4  Smart reminds you:Your Reward Points (6,309) e...  <unknown>


In [None]:
df = pd.get_dummies(df, columns=['carrier'], drop_first=True)
print(df.head())

                                                text  carrier_DITO  \
0  [FEX] Your parcel P6115DDCDD7BA couldn't reach...         False   
1  Smart reminds you:Your Reward Points (6,309) e...         False   
2  Your Landbank iAccess account is about to be d...         False   
3  Smart reminds you:Your Reward Points (6,309) e...         False   
4  Smart reminds you:Your Reward Points (6,309) e...         False   

   carrier_Globe  carrier_Globe PostPaid  carrier_Globe/TM  carrier_Smart  \
0          False                   False             False           True   
1          False                   False             False          False   
2          False                   False             False          False   
3          False                   False             False          False   
4          False                   False             False          False   

   carrier_Smart/Sun  carrier_Smart/TNT  carrier_Sun  carrier_TNT  
0              False              False        F

In [None]:
# ===============================
# 🧹 2. Create a 'label' Column Automatically
# ===============================
# Define spam keywords (you can add more)
spam_keywords = ['win', 'free', 'prize', 'cash', 'reward', 'claim', 'promo', 'congratulations', 'loan', 'click']


def label_message(text):
    # Ensure text is treated as string; handle potential non-string types if any were present
    text = str(text)
    text = text.lower()
    return 'spam' if any(word in text for word in spam_keywords) else 'ham'

df['label'] = df['text'].apply(label_message)
print("\nLabel distribution:")
print(df['label'].value_counts())


Label distribution:
label
spam    600
ham     414
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [None]:
y_pred = model.predict(X_test_tfidf)
print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Accuracy: 0.8325123152709359

Classification Report:
               precision    recall  f1-score   support

         ham       0.86      0.66      0.75        77
        spam       0.82      0.94      0.87       126

    accuracy                           0.83       203
   macro avg       0.84      0.80      0.81       203
weighted avg       0.84      0.83      0.83       203



In [None]:
sample = ["Congratulations! You won a free GCash prize!"]
sample_tfidf = vectorizer.transform(sample)
print("\nSample Prediction:", model.predict(sample_tfidf)[0])


Sample Prediction: spam


In [None]:
import joblib
joblib.dump(model, 'spam_detector.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("\nModel and vectorizer saved.")


Model and vectorizer saved.


In [None]:
sample = ["click this link ->"]
sample_tfidf = vectorizer.transform(sample)
print("\nSample Prediction:", model.predict(sample_tfidf)[0])


Sample Prediction: spam


In [None]:
sample = ["hello this is mapua malayan colleges of mindanao"]
sample_tfidf = vectorizer.transform(sample)
print("\nSample Prediction:", model.predict(sample_tfidf)[0])


Sample Prediction: spam


In [None]:
df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
spam,600
ham,414


In [None]:
df[df['label'] == 'ham']


Unnamed: 0,text,carrier_DITO,carrier_Globe,carrier_Globe PostPaid,carrier_Globe/TM,carrier_Smart,carrier_Smart/Sun,carrier_Smart/TNT,carrier_Sun,carrier_TNT,label
0,[FEX] Your parcel P6115DDCDD7BA couldn't reach...,False,False,False,False,True,False,False,False,False,ham
8,Live DJ and awesome music waiting for you. Dro...,False,False,False,True,False,False,False,False,False,ham
13,Good day! \nCome and join us here at Vus Sky B...,False,False,False,True,False,False,False,False,False,ham
18,You’re invited to join us tonight at Vu's Sky ...,False,False,False,True,False,False,False,False,False,ham
20,Don't miss out!!\nBDO FIXED RATE ASEAN SUSTAIN...,False,False,False,False,False,False,False,False,False,ham
...,...,...,...,...,...,...,...,...,...,...,...
1009,Shangri-La Updates:\n\nRISEmakati\n\n-One-of-a...,False,False,False,False,True,False,False,False,False,ham
1010,Celebrate the season of love with a sweet stay...,False,False,True,False,False,False,False,False,False,ham
1011,New OFFICE space for sale in Ortigas Avenue ne...,False,False,False,False,True,False,False,False,False,ham
1012,New OFFICE space for sale in Ortigas Avenue ne...,False,False,False,False,True,False,False,False,False,ham


In [None]:
sample = ["[FEX] Your parcel P6115DDCDD7BA couldn't reach you. Please contact the branch at 9283170245 for any concerns."]
sample_tfidf = vectorizer.transform(sample)
print("\nSample Prediction:", model.predict(sample_tfidf)[0])


Sample Prediction: ham


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
spam,600
ham,414


In [None]:
print("Accuracy:", accuracy_score(y_test, model.predict(X_test_tfidf)))

Accuracy: 0.8325123152709359


In [None]:
df[df['label'] == 'ham']

Unnamed: 0,text,carrier_DITO,carrier_Globe,carrier_Globe PostPaid,carrier_Globe/TM,carrier_Smart,carrier_Smart/Sun,carrier_Smart/TNT,carrier_Sun,carrier_TNT,label
0,[FEX] Your parcel P6115DDCDD7BA couldn't reach...,False,False,False,False,True,False,False,False,False,ham
8,Live DJ and awesome music waiting for you. Dro...,False,False,False,True,False,False,False,False,False,ham
13,Good day! \nCome and join us here at Vus Sky B...,False,False,False,True,False,False,False,False,False,ham
18,You’re invited to join us tonight at Vu's Sky ...,False,False,False,True,False,False,False,False,False,ham
20,Don't miss out!!\nBDO FIXED RATE ASEAN SUSTAIN...,False,False,False,False,False,False,False,False,False,ham
...,...,...,...,...,...,...,...,...,...,...,...
1009,Shangri-La Updates:\n\nRISEmakati\n\n-One-of-a...,False,False,False,False,True,False,False,False,False,ham
1010,Celebrate the season of love with a sweet stay...,False,False,True,False,False,False,False,False,False,ham
1011,New OFFICE space for sale in Ortigas Avenue ne...,False,False,False,False,True,False,False,False,False,ham
1012,New OFFICE space for sale in Ortigas Avenue ne...,False,False,False,False,True,False,False,False,False,ham


In [None]:
sample = ["""Hi,

Your G-Cash account has been temporarily deactivated!
Please update your G-Cash as soon as possible.

Confirm your account here: kycupdategcash.online
Thank you for choosing GCash!"""]
sample_tfidf = vectorizer.transform(sample)
print("\nSample Prediction:", model.predict(sample_tfidf)[0])


Sample Prediction: spam


In [None]:
Youâ€™re invited to join us tonight at Vu's Sky Bar & Lounge, Marco Polo Hotel Ortigas Manila for a great night of drinks, music, and good vibes! ðŸ¸âœ¨
Weâ€™re open and ready to welcome youâ€”come and chill with us!

Thank you!

Kit Miculob
Vu's Receptionist

In [None]:
sample = ["""#BDOStopScam: IGNORE emails/SMS with links to "update" your account. These are scams! BDO will not send links, so DON'T CLICK links and DON'T SHARE your OTP!
"""]
sample_tfidf = vectorizer.transform(sample)
print("\nSample Prediction:", model.predict(sample_tfidf)[0])


Sample Prediction: spam


In [None]:
# Transform all texts in your dataset
X_tfidf = vectorizer.transform(df['text'])

# Predict labels (spam/ham)
df['predicted_label'] = model.predict(X_tfidf)


In [None]:
pd.set_option('display.max_rows', None)   # show all rows
pd.set_option('display.max_colwidth', None)  # show full text without cutting

df  # just type the dataframe name to display


Unnamed: 0,text,carrier_DITO,carrier_Globe,carrier_Globe PostPaid,carrier_Globe/TM,carrier_Smart,carrier_Smart/Sun,carrier_Smart/TNT,carrier_Sun,carrier_TNT,label,predicted_label
0,[FEX] Your parcel P6115DDCDD7BA couldn't reach you. Please contact the branch at 9283170245 for any concerns.,False,False,False,False,True,False,False,False,False,ham,ham
1,"Smart reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://smartk.bond/rewards",False,False,False,False,False,False,False,False,False,spam,spam
2,"Your Landbank iAccess account is about to be deducted PHP3,456.00 for AGRIBANK PH services.\n\nIf this is not you, cancel it immediately at https://lbpiaccess.com-ph.click/i-revoke",False,False,False,False,False,False,False,False,False,spam,spam
3,"Smart reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://smartst.click/rewards",False,False,False,False,False,False,False,False,False,spam,spam
4,"Smart reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://smartlt.help/rewards",False,False,False,False,False,False,False,False,False,spam,spam
5,"Smart reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://smart1.cyou/rewards",False,False,False,False,False,False,False,False,False,spam,spam
6,"Smart reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://smartan.help/rewards",False,False,False,False,False,False,False,False,False,spam,spam
7,"Globe reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://globeci.cyou/rewards",False,False,False,False,False,False,False,False,False,spam,spam
8,"Live DJ and awesome music waiting for you. Drop by and enjoy the night with us here at Vu's Sky Bar & Lounge, Marco Polo Hotel Ortigas Manila. \n\nThank you!\n\nKit Miculob\nVu's Receptionist",False,False,False,True,False,False,False,False,False,ham,ham
9,"Smart reminds you:Your Reward Points (6,309) expire today.Please redeem you gift soon: https://smartnr.qpon/rewards",False,False,False,False,False,False,False,False,False,spam,spam


In [None]:
df.to_csv('Predicted_SPAM_SMS.csv', index=False)

In [None]:
from google.colab import files
files.download('Predicted_SPAM_SMS.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print("Accuracy:", accuracy_score(y_test, model.predict(X_test_tfidf)))

Accuracy: 0.8325123152709359
