# Vectorizing Raw Data: Count Vectorization

### Count vectorization 

Creates a document-term matrix where the entry of each cell will be a count of the number of times that word occurred in that document.

### Read in text

In [1]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']



### Create function to remove punctuation, tokenize, remove stopwords, and stem

In [2]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Apply CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data["body_text"])
print(X_counts)

  (0, 3134)	1
  (0, 2790)	2
  (0, 436)	1
  (0, 7816)	1
  (0, 2120)	1
  (0, 7782)	1
  (0, 2909)	2
  (0, 2288)	1
  (0, 3011)	1
  (0, 7168)	1
  (0, 456)	1
  (0, 4640)	1
  (0, 443)	1
  (0, 7027)	1
  (0, 879)	1
  (0, 5917)	1
  (0, 5829)	1
  (0, 7350)	1
  (0, 5876)	1
  (0, 1228)	1
  (0, 73)	1
  (1, 4931)	1
  (1, 2586)	1
  (1, 7095)	1
  (1, 3332)	1
  :	:
  (5563, 3320)	1
  (5563, 8101)	1
  (5563, 3123)	1
  (5563, 2818)	1
  (5564, 6830)	1
  (5564, 4833)	1
  (5564, 5528)	1
  (5564, 6528)	1
  (5565, 3134)	1
  (5565, 4369)	1
  (5565, 7693)	1
  (5565, 5015)	1
  (5565, 7473)	1
  (5565, 6550)	1
  (5565, 1776)	1
  (5565, 2748)	1
  (5565, 3239)	1
  (5565, 3462)	1
  (5565, 3801)	1
  (5565, 3916)	1
  (5565, 997)	1
  (5565, 1564)	1
  (5566, 4937)	1
  (5566, 7306)	1
  (5566, 6070)	1


### Apply CountVectorizer to smaller sample

In [8]:
data_sample = data[0:20]

### Vectorizers output sparse matrices

_**Sparse Matrix**: A matrix in which most entries are 0. In the interest of efficient storage, a sparse matrix will be stored by only storing the locations of the non-zero elements._