## **NLP: Document Classification**

**Submitted by:** Euclides Rodriguez 

**Course:** CUNY DATA 620

**Data Source:** https://archive.ics.uci.edu/dataset/94/spambase

In [60]:
import nltk
from nltk.probability import FreqDist
import random
import pandas as pd 

In [61]:
#Import Data 

df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['Class', 'Text'])

In [62]:
df.head(10)

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [63]:
# Convert text to lowercase
df['Text'] = df['Text'].str.lower()

# Tokenize text column
df['Text'] = df['Text'].str.split()


In [64]:
documents = list(zip( df['Text'],df['Class']))

In [65]:
documents[0]

(['go',
  'until',
  'jurong',
  'point,',
  'crazy..',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet...',
  'cine',
  'there',
  'got',
  'amore',
  'wat...'],
 'ham')

In [66]:
random.shuffle(documents)

In [67]:
# Flatten all tokens into a single list
all_tokens = [token for tokens in df['Text'] for token in tokens]

# Compute frequency distribution
fdist = FreqDist(all_tokens)

print(fdist.most_common(10))

[('to', 2237), ('i', 2217), ('you', 1921), ('a', 1433), ('the', 1329), ('u', 998), ('and', 968), ('is', 869), ('in', 859), ('my', 755)]


In [68]:
len(all_tokens)

86909

In [73]:
# Get top 2000 most frequent words
top_2000 = fdist.most_common(2000)

In [76]:
# Extract the top 2000 words into a list  
word_features = [word for word, freq in top_2000]

In [82]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
    

In [93]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[1671:], featuresets[:3900]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [94]:
len(featuresets)

5572

In [95]:
round(nltk.classify.accuracy(classifier, test_set),4)

0.9849

In [96]:
 classifier.show_most_informative_features(10)

Most Informative Features
        contains(latest) = True             spam : ham    =    118.5 : 1.0
         contains(video) = True             spam : ham    =     96.9 : 1.0
          contains(now!) = True             spam : ham    =     95.1 : 1.0
            contains(16) = True             spam : ham    =     92.6 : 1.0
       contains(network) = True             spam : ham    =     79.7 : 1.0
         contains(await) = True             spam : ham    =     75.4 : 1.0
          contains(draw) = True             spam : ham    =     71.1 : 1.0
       contains(service) = True             spam : ham    =     65.6 : 1.0
           contains(txt) = True             spam : ham    =     63.5 : 1.0
        contains(pounds) = True             spam : ham    =     62.5 : 1.0
