In [5]:
import pandas as pd

In [6]:
raw_mail_data = pd.read_csv("../dataset/mail_data.csv")

In [7]:
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
raw_mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [9]:
df = raw_mail_data.where((pd.notnull(raw_mail_data)), '')

In [10]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
df.shape

(5572, 2)

In [8]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df.loc[df['Category'] == 'spam', 'Category',] = 0
df.loc[df['Category'] == 'non-spam', 'Category',] = 1
df.loc[df['Category'] == 'ham', 'Category',] = 1

In [10]:
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
X = df['Message']
Y = df['Category']

In [12]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [13]:
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
y_train

1978    0
3989    1
3935    1
4078    1
4086    0
       ..
3772    1
5191    1
5226    1
5390    1
860     1
Name: Category, Length: 4457, dtype: object

In [175]:
y_train.shape

(4457,)

In [31]:
X_test.shape

(209715,)

In [15]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english")
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [16]:
y_train = y_train.astype("int")
y_test = y_test.astype("int")

In [72]:
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5144148 stored elements and shape (838860, 79)>
  Coords	Values
  (0, 71)	0.4202756699064727
  (0, 3)	0.3418213077374314
  (0, 59)	0.4202756699064727
  (0, 69)	0.4202756699064727
  (0, 72)	0.4202756699064727
  (0, 30)	0.4202756699064727
  (1, 27)	0.3738254726361271
  (1, 32)	0.3738254726361271
  (1, 33)	0.2629561583829669
  (1, 74)	0.3738254726361271
  (1, 10)	0.3738254726361271
  (1, 46)	0.3738254726361271
  (1, 64)	0.3738254726361271
  (1, 75)	0.3039426834433246
  (2, 33)	0.3426462058989496
  (2, 54)	0.48742064158499954
  (2, 18)	0.48742064158499954
  (2, 12)	0.39633484536959446
  (2, 11)	0.30500865331530924
  (2, 42)	0.3966411799667689
  (3, 75)	0.3417225571074325
  (3, 19)	0.420291730578644
  (3, 24)	0.420291730578644
  (3, 56)	0.420291730578644
  (3, 48)	0.420291730578644
  :	:
  (838856, 14)	0.37654260785971744
  (838856, 77)	0.37654260785971744
  (838856, 0)	0.37654260785971744
  (838856, 26)	0.37654260785971744
  (83

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

model1 = LogisticRegression()
model2 = MultinomialNB()


In [18]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Loại chuẩn hóa
    'C': [0.01, 0.1, 1, 10],                       # Hệ số điều chuẩn
    'solver': ['liblinear', 'saga'],               # Thuật toán tối ưu
    'max_iter': [100, 200]                         # Số vòng lặp tối đa
}
grid_search = GridSearchCV(
    estimator=model1,
    param_grid=param_grid,
    cv=5,                  # Số lần chia cross-validation
    scoring='accuracy',    # Tiêu chí đánh giá
    verbose=1,             # Hiển thị tiến trình
    n_jobs=-1              # Sử dụng đa luồng
)
grid_search.fit(X_train_features, y_train)


Fitting 5 folds for each of 64 candidates, totalling 320 fits


160 fits failed out of a total of 320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Duy Lam\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Duy Lam\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Duy Lam\Ap

In [19]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.9717287473640239


In [22]:
model = LogisticRegression(penalty='l2', C=10, max_iter=100, solver='liblinear')
model.fit(X_train_features, y_train)
model1.fit(X_train_features, y_train)
model2.fit(X_train_features, y_train)

In [33]:
prediction_test_data = model1.predict(X_test_features)
accuracy_test_data = accuracy_score(y_test, prediction_test_data)

#bayes
prediction_test_data2 = model2.predict(X_train_features)
accuracy_test_data2 = accuracy_score(y_train, prediction_test_data2)

prediction_test_data3 = model.predict(X_test_features)
accuracy_test_data3 = accuracy_score(y_test, prediction_test_data3)


In [34]:
print("Accuarcy on test data1: ", accuracy_test_data)
print("Accuarcy on test data2: ", accuracy_test_data2)
print("Accuarcy on test data3: ", accuracy_test_data3)

Accuarcy on test data1:  0.967713004484305
Accuarcy on test data2:  0.9813776082566749
Accuarcy on test data3:  0.9874439461883409


In [27]:
input_user_mail = [
    "REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode"]

input_data_features = feature_extraction.transform(input_user_mail)

prediction = model1.predict(input_data_features)

if prediction[0] == 1:
    print("This is a ham mail")
else:
    print("This is a spam mail")

This is a spam mail


In [29]:
import pickle

pickle.dump(model1, open("logistic_regression.pkl", "wb"))
pickle.dump(feature_extraction, open("feature_extraction.pkl", "wb"))