In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the raw mail dataset; the absolute path looks environment-specific
# (e.g. a hosted-notebook upload location) — adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')
# Preview the first five rows
df.head(n=5)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (rows, columns) of the loaded frame
df.shape

(5572, 2)

In [None]:
# Number of missing values in each column
df.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Summary statistics per column (count / unique / top / freq for text columns)
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
# Keep every non-null cell as is and substitute an empty string for nulls;
# the result is a new frame, `df` itself is left untouched.
data = df.where(df.notna(), '')

In [None]:
# Show the first 5 rows of the cleaned frame (`data`)
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Label encoding


In [None]:
# Encode the target in place: spam -> 0, ham -> 1
spam_rows = data['Category'] == 'spam'
ham_rows = data['Category'] == 'ham'
data.loc[spam_rows, 'Category'] = 0
data.loc[ham_rows, 'Category'] = 1
# Preview the encoded frame
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Separate the data into the message text (X) and its encoded label (Y)
X, Y = data['Message'], data['Category']

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=3
)
# Report the size of each part, in the order: train X, train Y, test X, test Y
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

(4457,)
(4457,)
(1115,)
(1115,)


 **feature extraction**

In [None]:
# Turn the raw text into TF-IDF vectors that logistic regression can consume
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training text only, then reuse it for the test text
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the labels to integers
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)



In [None]:
# Logistic-regression classifier created with the library's default settings
model=LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features and integer labels
model.fit(X=X_train_features, y=Y_train)

**evaluating the trained model**

In [None]:
# Score the fitted model on the same data it was trained on
predicting_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=predicting_on_training_data,
)
print(accuracy_on_training_data)

0.9676912721561588


In [None]:
# Score the fitted model on the held-out test split
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print(accuracy_on_test_data)


0.9668161434977578


building a predictive system

In [None]:
# Classify a single message typed in by the user.
input_mail = input("enter the mail")
# transform() takes a collection of documents, so the one message is wrapped
# in a list; it reuses the vocabulary fitted on the training text.
input_mail_features = feature_extraction.transform([input_mail])
# predict() returns an array holding one label per input row (one row here).
prediction = model.predict(input_mail_features)
# Test the single scalar label instead of relying on the truthiness of a
# whole array, which only works while the array has exactly one element.
# Labels were encoded earlier as 1 = ham, 0 = spam.
if prediction[0] == 1:
  print("your mail is ham mail\n")
else:
  print("spam mail\n")

enter the mail"Win a lottery now! Click here!"
your mail is ham mail

