In [1]:
import pandas as pd

# Load the tab-separated SMS file; column names are supplied explicitly via `names`.
messages = pd.read_csv(
    '../08-bag_of_words/sms_spam_collection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [2]:
# Show the loaded DataFrame as the cell output.
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance, reused for every token in the preprocessing loop below.
# NOTE(review): presumably needs the NLTK 'wordnet' corpus installed locally — confirm on a fresh environment.
lemma = WordNetLemmatizer()

In [5]:
# Build the cleaned corpus: one lower-cased, lemmatized, stop-word-free string per SMS message.
# The stop-word list is materialised once as a set; evaluating stopwords.words('english')
# inside the filter would repeat the call for every token and test membership against a list.
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate the column values directly so the loop does not rely on the frame
# having a default 0..n-1 index.
for message in messages['message']:
    # Replace every character that is not an ASCII letter with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', message)
    tokens = letters_only.lower().split()
    # Lemmatize the tokens that are not stop words
    kept = [lemma.lemmatize(token) for token in tokens if token not in english_stopwords]
    # Join the tokens back into a single string and collect it
    corpus.append(' '.join(kept))

In [6]:
## import model from sklearn
## TF-IDF gives higher importance to rare words and less to common ones
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Fit a TF-IDF vectorizer capped at 100 features on the corpus,
# then convert the fitted result to a dense array for inspection.
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(corpus)
X = tfidf_matrix.toarray()

In [10]:
# import Numpy to see TF-IDF values
import numpy as np

# Configure NumPy array printing: 30 edge items per axis, a very wide line limit,
# and floats rendered with 3 significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={'float': lambda value: "%.3g" % value},
)

In [12]:
# Display the dense TF-IDF matrix using the print options configured earlier.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.434, 0, 0, 0.461, 0.544, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.456, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.473, 0, 0, 0, 0, 0, 0, 0, 0.492, 0, 0, 0, 0, 0, 0, 0, 0.571, 0, 0, 0, 0, 0, 0],
       [0.465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.485, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.574, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### TF-IDF with N-Gram

In [18]:
# Refit TF-IDF on bigrams and trigrams (ngram_range=(2, 3)), again capped at 100 features.
# This rebinds both `tfidf` and `X` from the unigram run.
tfidf = TfidfVectorizer(max_features=100, ngram_range=(2, 3))
tfidf_matrix = tfidf.fit_transform(corpus)
X = tfidf_matrix.toarray()

In [19]:
# Inspect the fitted vocabulary: a dict mapping each n-gram to its integer feature index.
tfidf.vocabulary_

{'free entry': 31,
 'claim call': 18,
 'call claim': 4,
 'free call': 30,
 'chance win': 17,
 'txt word': 90,
 'let know': 53,
 'please call': 68,
 'lt gt': 59,
 'want go': 97,
 'like lt': 54,
 'like lt gt': 55,
 'sorry call': 82,
 'call later': 12,
 'sorry call later': 83,
 'ur awarded': 91,
 'hi hi': 45,
 'call customer': 5,
 'customer service': 23,
 'cash prize': 16,
 'call customer service': 6,
 'po box': 70,
 'trying contact': 88,
 'draw show': 27,
 'show prize': 80,
 'prize guaranteed': 74,
 'guaranteed call': 40,
 'valid hr': 95,
 'draw show prize': 28,
 'show prize guaranteed': 81,
 'prize guaranteed call': 75,
 'selected receive': 77,
 'private account': 71,
 'account statement': 0,
 'call identifier': 7,
 'identifier code': 47,
 'code expires': 22,
 'private account statement': 72,
 'account statement show': 1,
 'call identifier code': 8,
 'identifier code expires': 48,
 'urgent mobile': 94,
 'call landline': 11,
 'wat time': 98,
 'ur mob': 93,
 'gud ni': 42,
 'new year': 64,

In [17]:
# Display X again.
# NOTE(review): this cell's execution count (17) is lower than the n-gram fit's (18),
# so the saved output may predate that refit — restart and run all to confirm.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0