In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data collection and preprocessing


In [2]:
# Load the labelled messages from the CSV file into a DataFrame.
csv_path = '/content/mail_data.csv'
mail_data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the loaded frame.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Count the missing values in each column (axis=0 sums down the rows).
mail_data.isnull().sum(axis=0)

Category    0
Message     0
dtype: int64

In [5]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The result is assigned back to the column instead of using inplace=True,
# so the update is explicit; re-running the cell is harmless because values
# that are already 0/1 match neither key.
mail_data['Category'] = mail_data['Category'].replace({'ham': 0, 'spam': 1})

In [6]:
# Display the frame again to check the Category column after label encoding.
mail_data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [7]:
# Safety net for rows that still carry text labels. The mapping must match the
# encoding applied earlier in this notebook (ham -> 0, spam -> 1), which is
# also what the prediction cell relies on when it treats 0 as "not spam".
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 1

# Train test split

In [8]:
# Features are the raw message text; targets are the category labels.
X, Y = mail_data['Message'], mail_data['Category']

In [9]:
# Hold out 20% of the messages for testing. Stratifying on Y keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
    stratify=Y,
)

In [10]:
# TF-IDF vectorizer: lowercase the text, drop English stop words, and keep
# every term that appears in at least one document (min_df=1).
# `lowercase` must be a real boolean: the string 'True' only worked by being
# truthy, and recent scikit-learn releases validate parameter types at fit
# time and reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [11]:
# Fit the vectorizer on the training messages only and transform them in the
# same step.
X_Train_features = feature_extraction.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test messages (transform only,
# no refit), so nothing is learned from the test split.
X_Test_features = feature_extraction.transform(X_test)



# Cast both label series to integers before they are handed to the classifiers.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

# LGBM

Test accuracy: 97.40%

In [13]:
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Train a LightGBM classifier with its default hyperparameters on the TF-IDF
# features of the training messages.
modelc = lgbm.LGBMClassifier()
modelc.fit(X_Train_features, Y_train)

# Predict labels for the held-out messages and measure the fraction that
# match the true labels.
predictionc = modelc.predict(X_Test_features)
accuracy_predictionc = accuracy_score(Y_test, predictionc)
print(accuracy_predictionc)

0.9739910313901345


# Logistic Regression

In [14]:
# Second model: logistic regression created with its default constructor arguments.
model = LogisticRegression()

In [15]:
# fit() returns the estimator itself, so training and test-set prediction can
# be chained into a single expression.
prediction = model.fit(X_Train_features, Y_train).predict(X_Test_features)

# Share of held-out messages whose predicted label equals the true label.
prediction_accuracy = accuracy_score(Y_test, prediction)
print(prediction_accuracy)

0.9641255605381166


# Predictive System

In [17]:
# Classify a single raw message with the trained LightGBM classifier.
input_mail = ["Nah I don't think he goes to usf, he lives around here though"]

# Reuse the vectorizer fitted on the training split; it must not be refit here.
input_data_features = feature_extraction.transform(input_mail)

predict = modelc.predict(input_data_features)
# predict holds one label per input message, so read the single element
# explicitly instead of comparing the whole array to 0.
# Label encoding used by this notebook: 0 = ham (not spam), 1 = spam.
if predict[0] == 0:
  print('Not a spam email')
else:
  print('Spam email')
# accuracy_predictionc is computed with accuracy_score on the test split, so it
# is reported as accuracy (it is not a precision score). It describes the
# model as a whole, not this individual prediction.
print('Accuracy is: % .2f ' % (accuracy_predictionc * 100))


Not a spam email
Precision is:  97.40 
