# Spam or ham sms classification

Import necessary libraries

In [None]:
import pandas as pd
import string
import re
import nltk
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Import dataset and create a dataframe

In [None]:
sms = pd.read_csv("/content/spam.csv", encoding = 'latin-1')
sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Drop unnecessary columns and rename the remaining columns of dataframe

In [None]:
sms = sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
sms.columns = ["label", "text"]
sms.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Check the shape of the dataframe

In [None]:
sms.shape 

(5572, 2)

Check the value counts of the labels ham and spam

In [None]:
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

Check for missing values

In [None]:
sms.isnull().sum()

label    0
text     0
dtype: int64

Create variable 'stopwords' and save all the stopwords in it

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

Create a method to clean the text in sms for further processing

In [None]:
def clean_text(text):

  # removing punctuations
  text = "".join([char for char in text if char not in string.punctuation])

  # tokenizing the text and changing the case to lower case
  tokens = re.split('\W+', text.lower())

  # removing the tokens which are stopwords
  text = "".join([word for word in tokens if word not in stopwords])
  return text

In [None]:
tfidf = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf.fit_transform(sms['text'])
print(X_tfidf.shape)
print(tfidf.get_feature_names)

(5572, 52)
<bound method CountVectorizer.get_feature_names of TfidfVectorizer(analyzer=<function clean_text at 0x7fcd0d3099d8>, binary=False,
                decode_error='strict', dtype=<class 'numpy.float64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=1.0,
                max_features=None, min_df=1, ngram_range=(1, 1), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)>


In [None]:
X_tfidf

<5572x53 sparse matrix of type '<class 'numpy.float64'>'
	with 97170 stored elements in Compressed Sparse Row format>

In [None]:
X_features = X_tfidf.toarray()

In [None]:
X_features

array([[0.56964792, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.52641743, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.45827385, 0.31408021, 0.31178676, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.30723179, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.48233694, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.37353314, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
rfc = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X_features, sms['label'], test_size = 0.2)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [None]:
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
print('Precision: {}' .format(round(precision, 3)))
print('Recall: {}' .format(round(recall, 3)))

Precision: 0.975
Recall: 0.897


In [None]:
print('Accuracy score: {}' .format(accuracy_score(y_test, y_pred)))

Accuracy score: 0.9802690582959641
