In [None]:
import pandas as pd

# Read the tab-separated SMS corpus. Column names are supplied explicitly,
# so the first line of the file is treated as data rather than a header.
column_names = ['label', 'message']
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=column_names,
)
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Keep separate handles on the message text and on its class label.
text, label = data['message'], data['label']

In [None]:
# Display the label column selected from `data`.
label

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [None]:
# Number of words: whitespace-delimited tokens per message.
# str.split() with no separator collapses runs of whitespace and ignores
# leading/trailing blanks, so repeated spaces no longer add empty "words"
# (which split(" ") did) and the count uses the same tokenisation as the
# other token-based features in this notebook.
data['word_count'] = data['message'].astype(str).str.split().str.len()
data[['message','word_count']].head()

Unnamed: 0,message,word_count
0,"Go until jurong point, crazy.. Available only ...",20
1,Ok lar... Joking wif u oni...,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,U dun say so early hor... U c already then say...,11
4,"Nah I don't think he goes to usf, he lives aro...",13


In [None]:
# Character count per message; every character is counted, spaces included.
message_lengths = data['message'].str.len()
data['char_count'] = message_lengths
data[['message', 'char_count']].head()

Unnamed: 0,message,char_count
0,"Go until jurong point, crazy.. Available only ...",111
1,Ok lar... Joking wif u oni...,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,U dun say so early hor... U c already then say...,49
4,"Nah I don't think he goes to usf, he lives aro...",61


In [None]:
#Average Word Length
def avg_word(sentence):
  """Return the mean length, in characters, of the words in `sentence`.

  Args:
    sentence: text to measure; it is tokenised with ``str.split()``, i.e.
      on runs of whitespace.

  Returns:
    Total number of characters across all words divided by the number of
    words, or 0.0 when the sentence contains no words (empty or
    whitespace-only input), which would otherwise divide by zero.
  """
  words = sentence.split()
  if not words:
    # No tokens at all: avoid ZeroDivisionError and report a neutral value.
    return 0.0
  return sum(len(word) for word in words) / len(words)

# avg_word takes a single message, so it is handed to apply() directly.
data['avg_word'] = data['message'].apply(avg_word)
data[['message', 'avg_word']].head()

Unnamed: 0,message,avg_word
0,"Go until jurong point, crazy.. Available only ...",4.6
1,Ok lar... Joking wif u oni...,4.0
2,Free entry in 2 a wkly comp to win FA Cup fina...,4.571429
3,U dun say so early hor... U c already then say...,3.545455
4,"Nah I don't think he goes to usf, he lives aro...",3.769231


In [None]:
#Number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy of the stop list for constant-time membership tests; `stop`
# itself is left as the list returned by NLTK.
stop_set = set(stop)

# Tokens are compared exactly as written (no lower-casing or punctuation
# stripping), so only tokens spelled identically to a stop-list entry count.
data['stopwords'] = data['message'].apply(
    lambda msg: sum(token in stop_set for token in msg.split())
)
data[['message','stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,message,stopwords
0,"Go until jurong point, crazy.. Available only ...",4
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,5
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",5


In [None]:
# Number of whitespace-separated tokens that begin with '#'
data['hastags'] = data['message'].apply(
    lambda msg: sum(1 for token in msg.split() if token.startswith('#'))
)
data[['message', 'hastags']].head()

Unnamed: 0,message,hastags
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Number of tokens made up entirely of digits (str.isdigit on the whole token)
data['numerics'] = data['message'].apply(
    lambda msg: sum(map(str.isdigit, msg.split()))
)
data[['message', 'numerics']].head()

Unnamed: 0,message,numerics
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Number of tokens for which str.isupper() is true
def _count_upper_tokens(message):
    """Count the whitespace-separated tokens of `message` that are upper-case."""
    upper_tokens = [token for token in message.split() if token.isupper()]
    return len(upper_tokens)

data['upper'] = data['message'].apply(_count_upper_tokens)
data[['message', 'upper']].head()

Unnamed: 0,message,upper
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",1


In [None]:
# Coarse part-of-speech families, each mapped to the list of tagger tags
# that are counted as belonging to it.
pos_family = {
    family: tags.split()
    for family, tags in (
        ('noun', 'NN NNS NNP NNPS'),
        ('pron', 'PRP PRP$ WP WP$'),
        ('verb', 'VB VBD VBG VBN VBP VBZ'),
        ('adj', 'JJ JJR JJS'),
        ('adv', 'RB RBR RBS WRB'),
    )
}

# Count how many words of a sentence carry a part-of-speech tag from a given family
from textblob import TextBlob, Word, Blobber
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def check_pos_tag(x, flag):
    """Count the tokens of `x` whose POS tag belongs to the family `flag`.

    Args:
        x: text to tag; it is passed to ``TextBlob``.
        flag: key of ``pos_family`` selecting which tags are counted.

    Returns:
        Number of (word, tag) pairs from ``TextBlob(x).tags`` whose tag is
        listed under ``pos_family[flag]``. If tagging fails, a warning is
        emitted and 0 is returned so one bad message does not abort the run.

    Raises:
        KeyError: if `flag` is not a key of ``pos_family``. This is looked up
            outside the best-effort section so a mistyped family name is
            reported instead of silently yielding zero counts.
    """
    import warnings

    family_tags = pos_family[flag]
    try:
        tagged = TextBlob(x).tags
    except Exception as exc:
        # Narrower than a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate, and the failure is surfaced rather than hidden.
        warnings.warn("POS tagging failed, counting 0: %r" % (exc,))
        return 0
    return sum(1 for _, tag in tagged if tag in family_tags)

# One count column per POS family, created in the order noun, verb, adj,
# adv, pron; the family name is forwarded as check_pos_tag's second argument.
for count_column, family_name in [
    ('noun_count', 'noun'),
    ('verb_count', 'verb'),
    ('adj_count', 'adj'),
    ('adv_count', 'adv'),
    ('pron_count', 'pron'),
]:
    data[count_column] = data['message'].apply(check_pos_tag, args=(family_name,))

pos_preview_columns = ['message', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']
data[pos_preview_columns].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,message,noun_count,verb_count,adj_count,adv_count,pron_count
0,"Go until jurong point, crazy.. Available only ...",9,1,3,3,0
1,Ok lar... Joking wif u oni...,4,1,1,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,14,3,4,0,0
3,U dun say so early hor... U c already then say...,3,3,2,3,0
4,"Nah I don't think he goes to usf, he lives aro...",1,5,0,3,3


In [None]:
# Preview the message text, every engineered feature, and the label together.
summary_columns = [
    'message',
    'word_count', 'char_count', 'avg_word', 'stopwords',
    'hastags', 'numerics', 'upper',
    'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count',
    'label',
]
data[summary_columns].head()

Unnamed: 0,message,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count,label
0,"Go until jurong point, crazy.. Available only ...",20,111,4.6,4,0,0,0,9,1,3,3,0,ham
1,Ok lar... Joking wif u oni...,6,29,4.0,0,0,0,0,4,1,1,0,0,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,28,155,4.571429,5,0,2,2,14,3,4,0,0,spam
3,U dun say so early hor... U c already then say...,11,49,3.545455,2,0,0,2,3,3,2,3,0,ham
4,"Nah I don't think he goes to usf, he lives aro...",13,61,3.769231,5,0,0,1,1,5,0,3,3,ham


In [None]:
# Numeric feature columns only (message text and label are left out).
feature_columns = [
    'word_count', 'char_count', 'avg_word', 'stopwords',
    'hastags', 'numerics', 'upper',
    'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count',
]
features = data[feature_columns]

In [None]:
import numpy as np

# Encode each class name as its position in classes_list
# ("ham" -> 0, "spam" -> 1), then expose the codes as a NumPy array.
classes_list = ["ham", "spam"]
label_index = data['label'].map(classes_list.index)
label = label_index.to_numpy()
label

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
import numpy as np

# Array form of the feature frame, for the estimator cells below.
features_array = features.to_numpy()


In [None]:
# A cell only displays its final expression, so the shape is printed
# explicitly instead of being evaluated and thrown away.
print(features_array.shape)
features.head()

Unnamed: 0,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count
0,20,111,4.6,4,0,0,0,9,1,3,3,0
1,6,29,4.0,0,0,0,0,4,1,1,0,0
2,28,155,4.571429,5,0,2,2,14,3,4,0,0
3,11,49,3.545455,2,0,0,2,3,3,2,3,0
4,13,61,3.769231,5,0,0,1,1,5,0,3,3


In [None]:
# Imports and plot configuration for the clustering cells that follow.
from subprocess import check_output
import seaborn as sns
import matplotlib.pyplot as plt
# Seaborn theme: "white" style, with seaborn's colour shorthand codes enabled.
sns.set(style="white", color_codes=True)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as pltt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
#from sklearn.cluster import divi
from pylab import rcParams
rcParams['figure.figsize'] = 9, 8  # default figure size (width, height)

In [None]:
k = 2
# K-means starts from randomly chosen centroids, so a fixed random_state is
# needed for the cluster assignment (and every score derived from it) to be
# repeatable after a kernel restart. n_init is pinned because its default
# differs between scikit-learn releases.
# NOTE(review): the features are clustered unscaled, so large-valued columns
# such as char_count weigh most in the distance computation; consider
# standardising them first.
model = KMeans(n_clusters=k, n_init=10, random_state=42)
# fit_predict fits the model and returns the cluster index of every row.
clusassign = model.fit_predict(features_array)

In [None]:
# A cell only displays its final expression; return both types as one tuple
# so neither is silently discarded.
type(clusassign), type(label)

numpy.ndarray

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from scipy.optimize import linear_sum_assignment

# K-means numbers its clusters arbitrarily, so cluster 0 is not guaranteed to
# be "ham" (class 0); scoring the raw ids can flip the result between runs.
# Align clusters to classes first: build the class-by-cluster contingency
# table and pick the one-to-one pairing that maximises the matched count.
class_ids = np.union1d(label, clusassign)
contingency = metrics.confusion_matrix(label, clusassign, labels=class_ids)
class_pos, cluster_pos = linear_sum_assignment(contingency, maximize=True)
cluster_to_class = dict(zip(class_ids[cluster_pos], class_ids[class_pos]))
clus_pred = np.array([cluster_to_class[cluster_id] for cluster_id in clusassign])

print("k-means")
print("Accuracy score =", accuracy_score(label, clus_pred))
print(metrics.confusion_matrix(label, clus_pred))

print(metrics.classification_report(label, clus_pred))


k-means
Accuracy score = 0.7873295046661881
[[3715 1110]
 [  75  672]]
              precision    recall  f1-score   support

           0       0.98      0.77      0.86      4825
           1       0.38      0.90      0.53       747

    accuracy                           0.79      5572
   macro avg       0.68      0.83      0.70      5572
weighted avg       0.90      0.79      0.82      5572

