In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

file_path = "D:\\spam\\spam.csv" 
df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'], encoding='ISO-8859-1')

print("First 5 rows of the dataset:")
print(df.head()) 
print("\nDataset Info:")
print(df.info())  

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['message'])
y = df['label'].apply(lambda x: 1 if x == 'spam' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
print("\nNaive Bayes Strength/Limitations:")
print("Strength: Naive Bayes works well for high-dimensional data, such as text classification tasks, "
      "and is computationally efficient. It's effective in tasks like spam detection.")
print("Limitation: Naive Bayes assumes independence of features, which might not always hold in text data. "
      "It doesn't consider feature dependencies, which can limit its performance in complex text classification tasks.")


First 5 rows of the dataset:
                                               label  message
0                                           v1,v2,,,      NaN
1  ham,"Go until jurong point, crazy.. Available ...      NaN
2               ham,Ok lar... Joking wif u oni...,,,      NaN
3  spam,Free entry in 2 a wkly comp to win FA Cup...      NaN
4  ham,U dun say so early hor... U c already then...      NaN

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5575 entries, 0 to 5574
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   label    5575 non-null   object 
 1   message  0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 87.2+ KB
None


AttributeError: 'float' object has no attribute 'lower'