In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Data collection and pre-processing: read the raw mail CSV into a DataFrame.
# NOTE(review): this is an absolute path (looks like a Colab working directory) - adjust when running elsewhere.
mail_csv_path = '/content/mail_data.csv'
mail_data = pd.read_csv(mail_csv_path)

In [None]:
# Preview the first five rows of the raw data.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Preview the last five rows of the raw data.
mail_data.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [None]:
# (number of rows, number of columns) of the raw data.
mail_data.shape

(5572, 2)

In [None]:
# Count the missing values in each column.
mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [None]:
# Keep every non-null cell as it is and substitute an empty string for any null cell.
not_missing = mail_data.notna()
mail_dataset = mail_data.where(not_missing, '')

In [None]:
# Preview the first five rows after the null replacement.
mail_dataset.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Preview the last five rows after the null replacement.
mail_dataset.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [None]:
# Label encoding: 0 for spam mail and 1 for ham mail.
# A one-line way to do this (left commented out):
# mail_dataset.replace({'Category':{'spam':0,'ham':1}},inplace=True)

In [None]:
# Preview the first five rows of mail_dataset.
mail_dataset.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Label encoding: spam -> 0, ham -> 1.
# A single dictionary lookup replaces the two masked assignments, so the
# column comes out as a real integer column instead of an object column that
# merely holds Python ints. The 0/1 identity entries keep the cell safe to
# re-run after the column has already been encoded.
label_codes = {'spam': 0, 'ham': 1, 0: 0, 1: 1}
encoded_category = mail_dataset['Category'].map(label_codes)

# Fail loudly on any label outside the mapping (map() turns those into NaN)
# rather than letting a stray value travel further down the notebook.
if encoded_category.isna().any():
    unknown = mail_dataset.loc[encoded_category.isna(), 'Category'].unique()
    raise ValueError(f'unexpected Category labels: {list(unknown)!r}')

mail_dataset['Category'] = encoded_category.astype('int64')

In [None]:
# Split the frame into the model input (message text) and the target (category label).
X, Y = mail_dataset['Message'], mail_dataset['Category']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)

# Report the size of each partition, one per line.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [None]:
# The messages are free text, so they have to be turned into numbers before a
# model can use them. TfidfVectorizer builds a vocabulary from the text and
# gives each word in each message a TF-IDF weight rather than a raw count.

feature_extraction = TfidfVectorizer(
    lowercase=True,        # convert all characters to lowercase before tokenizing
    stop_words='english',  # ignore common English words (is, am, are, of, ...) that carry little signal
    min_df=1,              # minimum document frequency: keep words that occur in at least 1 message
)

In [None]:
# Fit the vectorizer on the training messages only and transform them, then
# reuse the already-fitted vectorizer on the test messages (it is not re-fitted).
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
# Show the labels of the held-out test split.
print(Y_test)

5086    1
2120    1
2318    1
2917    1
1352    1
       ..
884     1
3821    1
1066    1
208     1
1378    0
Name: Category, Length: 1115, dtype: int64


In [None]:
# The labels may already be integers, but depending on how they were encoded they
# can also come through as object dtype, so cast both splits to int before training.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Print the training feature matrix; entries are shown as (row, column) followed by the weight.
print(X_train_features)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

In [None]:
# Show the labels of the training split.
print(Y_train)

3890    1
5553    1
4366    1
3968    0
3771    1
       ..
3335    1
1099    1
2514    0
3606    1
2575    0
Name: Category, Length: 4457, dtype: int64


In [None]:
# Build and train the model.
# The training messages come with known labels (Y_train), so this is supervised learning.
model = LogisticRegression()
model.fit(X_train_features,Y_train)


In [None]:
# Accuracy on the training data.
# accuracy_score expects the true labels first and the predictions second;
# keeping that order matters as soon as the metric is swapped for an
# asymmetric one such as precision or recall.
train_prediction = model.predict(X_train_features)
train_accuracy_score = accuracy_score(Y_train, train_prediction)
print('train accuracy : ',train_accuracy_score)


train accuracy :  0.9683643706529056


In [None]:
# Accuracy on the held-out test data.
# accuracy_score expects the true labels first and the predictions second;
# keeping that order matters as soon as the metric is swapped for an
# asymmetric one such as precision or recall.
test_prediction = model.predict(X_test_features)
test_accuracy_score = accuracy_score(Y_test, test_prediction)
print('test accuracy : ', test_accuracy_score)


test accuracy :  0.9524663677130045


In [None]:
# Building a predictive system: classify a single hand-written message.
input_data = ["Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES"]

# Turn the text into a feature vector with the already-fitted vectorizer, then classify it.
input_data_new = feature_extraction.transform(input_data)
prediction = model.predict(input_data_new)
print(prediction)

# Label 0 is reported as spam; any other label is reported as ham.
predicted_label = 'Spam' if prediction[0] == 0 else 'ham'
print(predicted_label)

print('The correct category is : Spam')

[1]
ham
The correct category is : Spam


In [None]:
# NOTE(review): the sample message in the previous cell was predicted as ham while
# that cell states the correct category is spam, so this claim is not backed by
# that example - consider reporting spam precision/recall alongside accuracy.
print("model works well ")

model works well 


In [None]:
# NOTE(review): this line is unrelated to the analysis; consider removing it.
print('20+ LPA')

20+ LPA
