In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Spam/ham e-mail classifier: bag-of-words counts fed to multinomial Naive Bayes.

# Load the dataset. NOTE(review): the path looks like a Google Colab
# location -- adjust it when running elsewhere.
data = pd.read_csv('/content/sample_data/spam.csv')

# Normalise the header names. This assumes the CSV has exactly four columns
# in this order; pandas raises ValueError if the column count differs.
data.columns = ['index', 'label', 'text', 'label_num']

# Rebuild the numeric target from the string label only when the column is not
# already an integer type. is_integer_dtype() accepts every integer width,
# whereas comparing the dtype against the string 'int' depends on the
# platform's default integer size.
if not pd.api.types.is_integer_dtype(data['label_num']):
    data['label_num'] = data['label'].map({'ham': 0, 'spam': 1})
    # Series.map() yields NaN for any label missing from the mapping; fail
    # here with a clear message instead of deep inside the estimator.
    if data['label_num'].isna().any():
        raise ValueError(
            "Column 'label' contains values other than 'ham' and 'spam'"
        )
print("", data['label_num'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label_num'],
    test_size=0.3,
    random_state=42,
)
print("",X_train)
print("",y_test)

# Turn each message into token counts. The vocabulary is learned from the
# training split only (fit_transform) and then reused unchanged for the test
# split (transform), so no information from the test set leaks into training.
#   max_features=200 -> keep only the 200 most frequent terms
#   min_df=2         -> drop terms that occur in fewer than 2 documents
#   max_df=0.8       -> drop terms that occur in more than 80% of documents
vectorizer = CountVectorizer(max_features=200, min_df=2, max_df=0.8)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print("X_train_vec",X_train_vec)
print("X_test_vec",X_test_vec)

# Fit the Naive Bayes classifier on the training counts.
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predict labels for the held-out messages.
y_pred = model.predict(X_test_vec)

# Report overall accuracy and per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

 0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64
 1023    Subject: re : tenaska\r\ni see the demand fee ...
4586    Subject: strong buy alert : monthly newsletter...
2955    Subject: performance feedback\r\neach of you h...
2495    Subject: hr performance objectives binders\r\n...
3353    Subject: fw : [ fwd : fw : drawing by a school...
                              ...                        
4426    Subject: re : ena sales on hpl\r\nlast that i ...
466     Subject: tenaska iv\r\nbob :\r\ni understand f...
3092    Subject: broom , bristles up , flew\r\nbe diff...
3772    Subject: calpine daily gas nomination ( weeken...
860     Subject: re : meter 1459 , 6 / 00\r\nyep , you...
Name: text, Length: 3619, dtype: object
 1566    0
1988    1
1235    0
2868    0
4903    0
       ..
5135    0
2298    0
1519    0
1740    1
1700    0
Name: label_num, Length: 1552, dtype: int64
X_train_vec