In [None]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Preprocessing

In [None]:
# Read the mail dataset from its CSV file into a DataFrame and preview the first rows
mail_csv_path = '/content/mail_data.csv'
raw_mail_data = pd.read_csv(mail_csv_path)
raw_mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Preview the last five rows of the raw data
raw_mail_data.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [None]:
# Swap every missing (null) entry for an empty string; non-null entries are kept as they are
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, '')

In [None]:
# Preview the first five rows after null replacement
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Preview the last five rows after null replacement
mail_data.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [None]:
# Report the dataset size as (rows, columns)
n_rows, n_cols = mail_data.shape
(n_rows, n_cols)

(5572, 2)

In [None]:
# Encode the target column in place: spam -> 0, ham -> 1
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_spam, 'Category'] = 0
mail_data.loc[is_ham, 'Category'] = 1
# Display the frame with the encoded Category column
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [None]:
 # Separating the data as label and texts
# x holds the message text (model input), y holds the encoded Category (target)
x, y = mail_data['Message'], mail_data['Category']

In [None]:
# Preview the first five messages
x.head(n=5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [None]:
# Preview the first five encoded labels
y.head(n=5)

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(x, y, test_size=0.2, random_state=3)
x_train, x_test, y_train, y_test = split_parts


In [None]:
# Print the shapes of the four splits on one line, in train/test order for x and then y
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(4457,) (1115,) (4457,) (1115,)


In [None]:
# Feature extraction: transform the text data to TF-IDF feature vectors which
# can be used for logistic regression.
#   min_df=1              keep every term that occurs in at least one training message
#   stop_words='english'  drop scikit-learn's built-in English stop-word list
#   lowercase=True        must be the boolean True; the string 'True' only worked
#                         because it is truthy, and scikit-learn releases that
#                         validate constructor parameters reject a non-bool here
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages.
fx_train = feature_extraction.fit_transform(x_train)
fx_test = feature_extraction.transform(x_test)

In [None]:
# Cast both label splits to integers (the Category column is displayed with object dtype)
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [None]:
# Create the logistic regression classifier with its default settings.
# This cell only instantiates the model; it is trained when fit() is called on it.
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features of the training messages
model.fit(X=fx_train, y=y_train)

LogisticRegression()

In [None]:
# Score the model on the same data it was trained on
prediction_train = model.predict(fx_train)
print(prediction_train)
accuracy_train = accuracy_score(y_true=y_train, y_pred=prediction_train)
# Last expression: the training accuracy is shown as the cell output
accuracy_train

[1 1 1 ... 1 1 0]


0.9670181736594121

In [None]:
# Score the model on the held-out test split
prediction_test = model.predict(fx_test)
print(prediction_test)
accuracy_test = accuracy_score(y_true=y_test, y_pred=prediction_test)
# Last expression: the test accuracy is shown as the cell output
accuracy_test

[0 1 1 ... 1 1 1]


0.9659192825112107

In [None]:
# Predictive system: classify one new message with the trained model.
# The message is stored as `input_mail` (not `input`) so the built-in input()
# function is not shadowed for the rest of the kernel session.
# It is wrapped in a one-element list because the vectorizer is given a
# collection of messages, not a bare string.
input_mail = ["Hello, everyone! Hope the summer's been fun for you, full of sleep and/or great opportunities ('cause why do these two things always need to be mutually exclusive?). For the coming semester, our club had a couple of things in mind to help you along the placement prep and career discovery/investigation process. But before we reveal any specifics about the event itself, we want to know from you what fields you find most interesting, as a possible career path or just something you want to begin to explore. "]

# Convert the text to feature vectors with the already-fitted vectorizer
input_data_features = feature_extraction.transform(input_mail)

# Make the prediction and show the raw predicted label array
prediction = model.predict(input_data_features)
print(prediction)

# Labels were encoded as spam -> 0 and ham -> 1
if prediction[0] == 1:
  print('Ham mail')
else:
  print('Spam mail')

[1]
Ham mail
