In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
train_df = pd.read_csv('train_data.txt', sep=":::", header=None, engine='python')
test_df = pd.read_csv('test_data_solution.txt', sep=":::", header=None, engine='python')
train_df.columns=['id','title','genre','description']
test_df.columns=['id','title','genre','description']

# Data Preprocessing

In [5]:
train_df.head()

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54214 non-null  int64 
 1   title        54214 non-null  object
 2   genre        54214 non-null  object
 3   description  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [7]:
train_df.isnull().sum()

id             0
title          0
genre          0
description    0
dtype: int64

In [9]:
train_df.drop(["id", "title"], axis=1)

Unnamed: 0,genre,description
0,drama,Listening in to a conversation between his do...
1,thriller,A brother and sister with a past incestuous r...
2,adult,As the bus empties the students for their fie...
3,drama,To help their unemployed father make ends mee...
4,drama,The film's title refers not only to the un-re...
...,...,...
54209,comedy,This short-lived NBC live sitcom centered on ...
54210,horror,The NEXT Generation of EXPLOITATION. The sist...
54211,documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,comedy,Walter and Vivian live in the country and hav...


In [12]:
train_df.loc[df['v1']=='spam', 'v1',]=0
df.loc[df['v1']=='ham', 'v1',]=1

In [24]:
X=df['v2']
y=df['v1']

# Split dataset into training and testing dataset

In [25]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=3)

Use TF-IDF for feature extraction

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
feature_extraction=TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_feat=feature_extraction.fit_transform(X_train)
X_test_feat=feature_extraction.transform(X_test)
 
y_train=y_train.astype('int')
y_test=y_test.astype('int')

# Logistic Regression

In [29]:
model = LogisticRegression()

In [30]:
model.fit(X_train_feat, y_train)

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
pred=model.predict(X_train_feat)
acc=accuracy_score(y_train, pred)

In [37]:
acc

0.9661207089970832

In [38]:
pred_test=model.predict(X_test_feat)
acc_test=accuracy_score(y_test, pred_test)

In [39]:
acc_test

0.9623318385650225

In [49]:
sms=["Congratulations! You've been selected as our lucky winner!You've just won an all-expenses-paid luxury vacation to a tropical paradise. Enjoy a week in a 5-star resort, with gourmet dining, spa treatments, and endless sunshine. But wait, there's more!Not only that, you've also won $10,000 in cash! This is your chance to live the life you've always dreamed of.To claim your prize, simply click the link below and provide your personal information, including your credit card details. Hurry, this offer won't last long!Don't miss out on this incredible opportunity. Act now and enjoy the rewards!"]
data_feat=feature_extraction.transform(sms)
prediction=model.predict(data_feat)
prediction

array([0])

In [50]:
if(prediction[0]==1):
    print("Not Spam")
else:
    print("Spam")

Spam


# Final Result

In [52]:
example = [input()]
data_feat = feature_extraction.transform(example)
prediction = model.predict(data_feat)
if prediction[0] == 1:
    print("This is a spam sms.")
else:
    print("This is not a spam sms.")

Congratulations! You've been selected as the lucky winner of our exclusive vacation giveaway! You and a guest have won an all-expenses-paid trip to a luxurious tropical paradise.
This is a spam sms.
