# NLP using TF-IDF

## Import relevant packages 

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the tab-separated SMS dataset into a DataFrame.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
# Number of rows (messages) in the dataset.
df.shape[0]

5572

In [4]:
# Show the distinct class labels, then how many messages carry each one.
label_column = df['label']
print(label_column.unique())
print(label_column.value_counts())

['ham' 'spam']
ham     4825
spam     747
Name: label, dtype: int64


In [5]:
# Count the missing values in each column.
df.isna().sum()
# Every column reports 0, so there are no missing values in the data.

label      0
message    0
length     0
punct      0
dtype: int64

## Split the data into training and test sets

In [6]:
# Features are the raw message text; the target is the ham/spam label.
X, y = df['message'], df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

X_train shape:  (3900,)
X_test shape:  (1672,)


## Pipeline of TF-IDF vectorizer and linear SVC

In [7]:
# Chain TF-IDF vectorization with a linear SVM classifier so the raw message
# text can be passed straight to fit() and predict().
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both steps on the training messages; the fitted pipeline is displayed.
model.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [8]:
# Predict a label for every held-out test message.
predictions = model.predict(X_test)

In [9]:
# Confusion matrix: rows are the true labels, columns are the predicted labels.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1445    3]
 [  10  214]]


In [10]:
# Per-class precision, recall, F1-score and support on the test set.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [11]:
# Overall accuracy on the test set, reported as a percentage.
# Scale to a percentage first and round last: rounding the fraction and then
# multiplying by 100 can surface float artifacts (e.g. 0.57 * 100 -> 56.99999999999999).
# The built-in round() also works whether accuracy_score returns a NumPy scalar
# or a plain Python float, unlike the .round() method, which only NumPy scalars have.
accuracy = metrics.accuracy_score(y_test, predictions)
print('The accuracy of the model: ', round(accuracy * 100, 2), '%')

The accuracy of the model:  99.22 %
