## Import Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Data Collection and Analysis

In [75]:
# Load the raw messages from the CSV file into a DataFrame.
mail_data = pd.read_csv("mail_data.csv")

In [76]:
# Preview the first rows to check the column layout.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [77]:
# Dimensions of the raw frame as (rows, columns).
mail_data.shape

(5572, 2)

In [78]:
# Count missing values in each column.
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [79]:
# Summarise column dtypes, non-null counts and memory usage.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [80]:
# Count how many messages fall into each category.
mail_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

## Data Preprocessing

### Label Encoding

Encode the categorical `Category` column numerically: spam mail becomes 0 and ham mail becomes 1.

In [82]:
# Encode the labels numerically (spam -> 0, ham -> 1). The result is assigned
# back to the name instead of using inplace=True, so the frame is rebound
# explicitly rather than mutated behind the scenes. Re-running this cell is
# harmless: once the strings are replaced there is nothing left to match.
mail_data = mail_data.replace({'Category': {'spam': 0, 'ham': 1}})

In [83]:
# Display the frame to confirm that Category now holds the 0/1 codes.
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


### Handling the Unbalanced Data
The data above is unbalanced: about 87% of the messages are ham (4825 of 5572) and only about 13% are spam (747 of 5572). This imbalance can bias the model towards the majority class and hurt its performance.
Under-sampling: build a sample dataset containing an equal number of spam and ham messages.<br>
    Number of Spam = 747

Randomly choose 747 messages from the 4825 in the ham category so that the spam and ham sets are the same size for further analysis.

In [84]:
# Split the frame by encoded label: 0 is spam, 1 is ham.
spam = mail_data.loc[mail_data['Category'] == 0]
ham = mail_data.loc[mail_data['Category'] == 1]

In [85]:
# Show the size of each class, one per line (spam first, then ham).
print(spam.shape, ham.shape, sep="\n")

(747, 2)
(4825, 2)


In [86]:
# Print the ham subset followed by the spam subset.
print(ham, spam, sep="\n")

      Category                                            Message
0            1  Go until jurong point, crazy.. Available only ...
1            1                      Ok lar... Joking wif u oni...
3            1  U dun say so early hor... U c already then say...
4            1  Nah I don't think he goes to usf, he lives aro...
6            1  Even my brother is not like to speak with me. ...
...        ...                                                ...
5565         1                                       Huh y lei...
5568         1               Will ü b going to esplanade fr home?
5569         1  Pity, * was in mood for that. So...any other s...
5570         1  The guy did some bitching but I acted like i'd...
5571         1                         Rofl. Its true to its name

[4825 rows x 2 columns]
      Category                                            Message
2            0  Free entry in 2 a wkly comp to win FA Cup fina...
5            0  FreeMsg Hey there darling it's been

In [87]:
# Under-sample the ham messages down to the number of spam messages.
# The sample size is derived from `spam` rather than hard-coded, so it stays
# correct if the dataset changes, and the fixed random_state makes the draw
# reproducible across kernel restarts.
ham_sample = ham.sample(n=len(spam), random_state=2)

In [88]:
# Stack the sampled ham rows on top of all spam rows (row-wise concatenation).
new_mail_data = pd.concat([ham_sample, spam])

In [89]:
# Check the per-class counts in the under-sampled frame.
new_mail_data['Category'].value_counts()

1    747
0    747
Name: Category, dtype: int64

In [90]:
# Dimensions of the under-sampled frame as (rows, columns).
new_mail_data.shape

(1494, 2)

### Separating the target and features

In [91]:
# Features are the message texts; the target is the encoded category.
X, Y = new_mail_data['Message'], new_mail_data['Category']

In [92]:
# Print the feature series followed by the target series.
print(X, Y, sep="\n")

742     Do well :)all will for little time. Thing of g...
4933                Match started.india  &lt;#&gt;  for 2
1184               Am i that much bad to avoid like this?
1192    Come to my home for one last time i wont do an...
4239         Lol wtf random. Btw is that your lunch break
                              ...                        
5537    Want explicit SEX in 30 secs? Ring 02073162414...
5540    ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547    Had your contract mobile 11 Mnths? Latest Moto...
5566    REMINDER FROM O2: To get 2.50 pounds free call...
5567    This is the 2nd time we have tried 2 contact u...
Name: Message, Length: 1494, dtype: object
742     1
4933    1
1184    1
1192    1
4239    1
       ..
5537    0
5540    0
5547    0
5566    0
5567    0
Name: Category, Length: 1494, dtype: int64


## Model Building

### Split the data into training and test set

In [93]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [94]:
# Report the sizes of the full, training and test feature series on one line.
print(*(part.shape for part in (X, X_train, X_test)))

(1494,) (1195,) (299,)


### Feature Extraction

In [95]:
# Transform the raw message text into TF-IDF feature vectors that can be used
# as input to the Logistic Regression model.
feature_vector = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to encode the test messages.
X_train_features = feature_vector.fit_transform(X_train)
X_test_features = feature_vector.transform(X_test)

# Cast both label series to integer dtype.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [96]:
# Print the training feature matrix; each entry is shown as
# (row, column) followed by its weight.
print(X_train_features)

  (0, 3846)	0.20946742144859595
  (0, 309)	0.16539436563655968
  (0, 1680)	0.276466089705327
  (0, 3427)	0.26130466724078033
  (0, 294)	0.14806014378319832
  (0, 1221)	0.25054746680923484
  (0, 3564)	0.11841349573880157
  (0, 634)	0.26130466724078033
  (0, 3422)	0.1324987959158566
  (0, 3843)	0.23538604434468813
  (0, 2041)	0.13330297032308397
  (0, 3716)	0.276466089705327
  (0, 3758)	0.26130466724078033
  (0, 1711)	0.20112348855755302
  (0, 2653)	0.16262327078438882
  (0, 3293)	0.25054746680923484
  (0, 2666)	0.4189348428971919
  (0, 1896)	0.276466089705327
  (1, 3482)	0.3436068340658076
  (1, 733)	0.5421974866483221
  (1, 2152)	0.5421974866483221
  (1, 3375)	0.5421974866483221
  (2, 3743)	0.2973536588139147
  (2, 976)	0.2810467603893678
  (2, 1320)	0.21372317786286213
  :	:
  (1192, 3714)	0.3193881414289321
  (1193, 3783)	0.2992599180715869
  (1193, 2800)	0.2992599180715869
  (1193, 1071)	0.2992599180715869
  (1193, 892)	0.2992599180715869
  (1193, 772)	0.28284848023693254
  (1193, 3

### Model Training

#### Logistic Regression

In [97]:
# Instantiate a logistic-regression classifier with default hyperparameters.
model = LogisticRegression()

In [98]:
# Train the Logistic Regression model on the TF-IDF features and integer
# labels of the training split.
model.fit(X_train_features, Y_train)

### Model Evaluation

In [99]:
# Evaluate the model on the data it was trained on.
trained_predict = model.predict(X_train_features)
trained_accuracy = accuracy_score(y_true=Y_train, y_pred=trained_predict)
print("Accuarcy Score of Trained Data: ", trained_accuracy)

Accuarcy Score of Trained Data:  0.9841004184100418


In [100]:
# Evaluate the model on the held-out test split.
test_predict = model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=test_predict)
print("Accuarcy Score of Test Data: ", test_accuracy)

Accuarcy Score of Test Data:  0.9565217391304348


## Build a Predictive System

In [101]:
# Alternative sample message (uncomment to try a different input):
#input_mail = ["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]
input_mail = ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"]

# Encode the message with the already-fitted TF-IDF vectorizer.
input_feature = feature_vector.transform(input_mail)

# Predict the label for the encoded message.
prediction = model.predict(input_feature)

print(prediction)
# Label 0 denotes spam; any other label is reported as not spam.
print("Spam Mail" if prediction[0] == 0 else "Not a Spam mail")

[0]
Spam Mail
