In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
#loading the data from csv file to a pandas dataframe
raw_mail_data = pd.read_csv('/content/spam_mail_data_1000.csv')

In [8]:
raw_mail_data.head()

Unnamed: 0,label,text
0,0,Let's catch up for lunch next week.
1,0,Can you help me with the assignment due tomorrow?
2,1,Urgent: Your account has been compromised. Cli...
3,0,"Hey, are we still on for the meeting tomorrow?"
4,1,Win big cash prizes! Limited time offer!


In [9]:
print(raw_mail_data)

     label                                               text
0        0                Let's catch up for lunch next week.
1        0  Can you help me with the assignment due tomorrow?
2        1  Urgent: Your account has been compromised. Cli...
3        0     Hey, are we still on for the meeting tomorrow?
4        1           Win big cash prizes! Limited time offer!
..     ...                                                ...
995      1                        Get rich quick! Join today!
996      1    Don't miss out on this opportunity to earn big.
997      0            Looking forward to our trip next month!
998      0          Thanks for your help on the presentation.
999      0          Thanks for your help on the presentation.

[1000 rows x 2 columns]




```python
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)), '')
```

### 🔍 Here's what it's doing:
It replaces **any `NaN` (null/missing values)** in the DataFrame `raw_mail_data` with an **empty string `''`**, and saves the result into a new DataFrame `mail_data`.

### 🔧 Breakdown:
- `pd.notnull(raw_mail_data)` returns a DataFrame of the same shape with `True` where values are **not null** and `False` where values are **null**.
- `DataFrame.where(condition, other)` keeps the value **where the condition is True**, and replaces it with **`other`** (in this case, `''`) where it's False.

### ✅ Use Case:
This is useful when you're working with **text data**, like email content, and you want to clean it up before applying NLP techniques—many models don't handle `NaN` values well.

---

Let me know if you're planning to build a spam classifier now—I can walk you through TF-IDF vectorization, model training (like Naive Bayes), and evaluation too!

In [11]:
#replace the null values with null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [12]:
mail_data.head()

Unnamed: 0,label,text
0,0,Let's catch up for lunch next week.
1,0,Can you help me with the assignment due tomorrow?
2,1,Urgent: Your account has been compromised. Cli...
3,0,"Hey, are we still on for the meeting tomorrow?"
4,1,Win big cash prizes! Limited time offer!


In [14]:
mail_data.shape

(1000, 2)

#spam --> 1
#genuine mail --> 0

In [16]:
#separating the data as text and label
X = mail_data['text']
Y = mail_data['label']

In [17]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size = 0.2,random_state=3)

In [18]:
print(X.shape, X_train.shape, X_test.shape)

(1000,) (800,) (200,)


##feature extracton

🔍 What this does:
TF-IDF (Term Frequency–Inverse Document Frequency) gives weight to important words and downplays common ones.

stop_words='english' removes common words like "is", "the", etc.

min_df=1 includes all words that appear in at least one document.

In [23]:
#transform the text data to feature vectors that can be used as input to the logistic regression
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)


X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#convert y_train and Y_test values as integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [25]:
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3352 stored elements and shape (800, 75)>
  Coords	Values
  (0, 73)	0.38799412589421883
  (0, 6)	0.3110680787697231
  (0, 8)	0.38799412589421883
  (0, 51)	0.38799412589421883
  (0, 38)	0.38799412589421883
  (0, 64)	0.38799412589421883
  (0, 47)	0.38799412589421883
  (1, 69)	0.4472135954999579
  (1, 1)	0.4472135954999579
  (1, 12)	0.4472135954999579
  (1, 11)	0.4472135954999579
  (1, 27)	0.4472135954999579
  (2, 63)	0.6038754980263829
  (2, 31)	0.5202583644371096
  (2, 50)	0.6038754980263829
  (3, 63)	0.6038754980263829
  (3, 31)	0.5202583644371096
  (3, 50)	0.6038754980263829
  (4, 20)	0.3801764309964777
  (4, 45)	0.46245699295036624
  (4, 33)	0.46245699295036624
  (4, 24)	0.46245699295036624
  (4, 56)	0.46245699295036624
  (5, 71)	0.4284844234055891
  (5, 61)	0.5216643553406027
  :	:
  (792, 35)	0.5773502691896257
  (793, 32)	0.6214839337019458
  (793, 43)	0.6214839337019458
  (793, 66)	0.47698578626696087
  (794, 5)	0.5051

#training the model

In [26]:
model = LogisticRegression()

In [28]:
#training the logistic model with gtraining data
model.fit(X_train_features,Y_train)

evaluating the train model

In [29]:
#prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [31]:
print('accuracy on training data -->' ,accuracy_on_training_data)

accuracy on training data --> 1.0


In [33]:
#prediction_on_test_data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [34]:
print('accuracy on test data -->', accuracy_on_test_data)

accuracy on test data --> 1.0


##building predictive system

In [39]:
input_mail = ["Get rich quick! Join today!"]


#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making prediction
prediction = model.predict(input_data_features)
print(prediction)


if prediction[0]==1 :
  print('spam mail')

else :
  print('genuine mail')

[1]
spam mail
