### Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset from spam.csv (path is relative to the working directory).
raw_data = pd.read_csv("spam.csv")

In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
# Count missing values per column.
raw_data.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# Encode the target column as integers: spam -> 0, ham -> 1.
#
# The whole column is rebuilt with one element-wise map rather than writing
# ints into a string column through .loc, which leaves an object column
# holding a mix of str and int. When every label is mapped, pandas infers an
# integer dtype for the result.
# Values that are not in the mapping (including labels that are already 0/1)
# pass through unchanged, so re-running this cell is a no-op.
label_codes = {'spam': 0, 'ham': 1}
raw_data['Category'] = raw_data['Category'].map(lambda label: label_codes.get(label, label))

In [6]:
# Preview the first rows to eyeball the encoded Category column next to the messages.
raw_data.head(20)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


In [7]:
# Split messages (features) and categories (labels) into train and test sets:
# 40% of the rows are held out for testing, with a fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_data['Message'],
    raw_data['Category'],
    test_size=0.4,
    random_state=3,
)

In [8]:
# Number of training labels.
Y_train.shape

(3343,)

In [9]:
# Number of held-out test labels.
Y_test.shape

(2229,)

In [10]:
# Turn raw message text into TF-IDF feature vectors.
#   min_df=10            - drop terms that appear in fewer than 10 training messages
#   stop_words="english" - drop scikit-learn's built-in English stop words
#   lowercase=True       - lowercase text before tokenising; this must be a real
#                          bool, because scikit-learn validates parameter types
#                          and an int such as 1 is not accepted as a boolean
feature_extraction = TfidfVectorizer(min_df=10, stop_words="english", lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer for the test messages so no test-set statistics
# leak into the features.
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)



In [76]:
# Cast both label Series to an integer dtype before they are handed to the
# classifier and the accuracy metric.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [77]:
# Dump the training feature matrix; the output lists one "(row, column)  value"
# entry per stored TF-IDF weight.
print(X_train_feature)

  (0, 467)	0.5874783690622778
  (0, 131)	0.809239869188318
  (1, 77)	0.23719473543344657
  (1, 497)	0.29144569047904656
  (1, 249)	0.18698287905940753
  (1, 248)	0.2437179602985452
  (1, 201)	0.521207236727189
  (1, 136)	0.29887887966435067
  (1, 406)	0.2701067484175382
  (1, 470)	0.5156139382825599
  (1, 58)	0.2542362455124458
  (2, 228)	0.24421436676111594
  (2, 474)	0.34097604903224193
  (2, 404)	0.7517680836158547
  (2, 422)	0.3962879422314371
  (2, 306)	0.31921042018134016
  (3, 262)	0.29016338785793916
  (3, 305)	0.34673819194906935
  (3, 210)	0.3124602835794475
  (3, 74)	0.41941467727272236
  (3, 47)	0.3423703028401669
  (3, 86)	0.3768778192815281
  (3, 239)	0.3547441498076447
  (3, 404)	0.3700540602511294
  (4, 429)	0.2890897799422309
  :	:
  (3339, 434)	0.34318569636022306
  (3339, 132)	0.2730903718771488
  (3339, 476)	0.3257679920476533
  (3339, 481)	0.2625207541420182
  (3339, 94)	0.4959694216675095
  (3339, 201)	0.34318569636022306
  (3340, 184)	0.7742892704389363
  (3340, 

In [55]:
# Logistic-regression classifier with the library's default settings, fitted on
# the TF-IDF training features and their integer labels.
model = LogisticRegression()

model.fit(X=X_train_feature, y=Y_train)

In [78]:
# Accuracy on the data the model was fitted on. This is a sanity check rather
# than an estimate of how well the model generalises.
prediction = model.predict(X_train_feature)
accuracy = accuracy_score(y_true=Y_train, y_pred=prediction)

print("Model Accuracy :", accuracy)

Model Accuracy : 0.9727789410708944


In [80]:
# Accuracy on the held-out test features, which the model did not see during fitting.
prediction_test = model.predict(X_test_feature)
accuracy_test = accuracy_score(y_true=Y_test, y_pred=prediction_test)

print("Test Model Accuracy :", accuracy_test)

Test Model Accuracy : 0.9676985195154778


In [90]:
# Classify a single message typed in by the user.
# The raw text gets its own name so that `user_input` only ever holds the
# feature matrix instead of switching from str to sparse matrix mid-cell.
mail_text = input("Your Mail : ")

# transform() expects an iterable of documents, hence the one-element list.
# The vectorizer fitted on the training messages is reused so the columns line
# up with what the model was trained on.
user_input = feature_extraction.transform([mail_text])

result = model.predict(user_input)

# Labels were encoded as ham -> 1 and spam -> 0.
if result[0] == 1:
    print("Your mail is ham mail.")
else:
    print("Alert!! Chances are your mail is spam.")

Your Mail : 'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
Alert!! Changes to your mail is spam.
