# Working with Datasets using Python
Using default datasets, datasets created in code, or datasets loaded from a file

## Exercise 2 - Classification by representing Text as Numerical Data

In [62]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
# Two alternative set-ups are kept side by side in this exercise.
# SOLUTION 1: lower k explicitly (paired with the four-sentence training set)
# knn = KNeighborsClassifier(n_neighbors=2)
# SOLUTION 2 (active): keep the default k (paired with the five-sentence training set)
# NOTE(review): the fitted repr further down reports n_neighbors=5 and
# weights='uniform'; with only five training sentences every prediction is
# therefore a plain majority vote over the whole training set.
knn = KNeighborsClassifier()

In [63]:
# Example text for model training (tweet-style messages).
# Solution 1 uses only the first four sentences:
# simple_train = ['Where is my cat?', 'Im ready for summer', 'Who wants dinner?', 'Where is the game?']
# Solution 2 (active) adds a fifth sentence:
simple_train = [
    'Where is my cat?',
    'Im ready for summer',
    'Who wants dinner?',
    'Where is the game?',
    'The cake is a lie',
]


In [64]:
# Import CountVectorizer and create one with its default parameters; the
# effective defaults are echoed in the repr shown when it is fitted.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [65]:
# Learn the vocabulary of the training sentences. fit() updates `vect` in
# place; the value displayed for this cell is the vectorizer's own repr.
vect.fit(simple_train)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [66]:
# Examine the fitted vocabulary. Terms are lower-cased and one-character
# words such as 'a' are absent (lowercase=True and the two-or-more-character
# token_pattern in the vectorizer repr), leaving the 15 terms listed below.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); confirm against the
# installed version before re-running this notebook.
vect.get_feature_names()

[u'cake',
 u'cat',
 u'dinner',
 u'for',
 u'game',
 u'im',
 u'is',
 u'lie',
 u'my',
 u'ready',
 u'summer',
 u'the',
 u'wants',
 u'where',
 u'who']

In [67]:
# Encode the training sentences as a document-term matrix: one row per
# sentence, one column per vocabulary term, each cell holding a word count.
simple_train_dtm = vect.transform(simple_train)
# Bare name on the last line so the notebook displays the matrix summary.
simple_train_dtm

<5x15 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [68]:
# Convert the sparse matrix to a dense array so that every count, zeros
# included, is visible (5 sentences x 15 terms here).
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0]])

In [69]:
# Show the document-term matrix with each column labelled by its vocabulary
# term, so each row (sentence) can be read against the words it contains.
# NOTE(review): get_feature_names() is unavailable from scikit-learn 1.2
# onwards (replaced by get_feature_names_out()); confirm the installed version.
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cake,cat,dinner,for,game,im,is,lie,my,ready,summer,the,wants,where,who
0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0
1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0
4,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0


In [70]:
# Check the type of the document-term matrix: transform() returned a SciPy
# sparse matrix rather than a dense array.
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [71]:
# Examine the sparse matrix contents. print() lists only the stored entries,
# one per line, as (row, column) followed by the count; zero cells are omitted.
print(simple_train_dtm)

  (0, 1)	1
  (0, 6)	1
  (0, 8)	1
  (0, 13)	1
  (1, 3)	1
  (1, 5)	1
  (1, 9)	1
  (1, 10)	1
  (2, 2)	1
  (2, 12)	1
  (2, 14)	1
  (3, 4)	1
  (3, 6)	1
  (3, 11)	1
  (3, 13)	1
  (4, 0)	1
  (4, 6)	1
  (4, 7)	1
  (4, 11)	1


In [72]:
# A single unseen sentence used to exercise the trained model
simple_test = ['where is up?']

In [73]:
# Encode the test sentence with the vocabulary already learned from the
# training data (transform only, no re-fit). 'up' is not in that vocabulary,
# so only the 'is' and 'where' columns end up non-zero.
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]])

In [74]:
# Show the test document-term matrix with each column labelled by its
# vocabulary term.
# NOTE(review): get_feature_names() is unavailable from scikit-learn 1.2
# onwards (replaced by get_feature_names_out()); confirm the installed version.
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cake,cat,dinner,for,game,im,is,lie,my,ready,summer,the,wants,where,who
0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [75]:
# Train the classifier on the document-term matrix (fit() works in place).
# Labels line up with simple_train: 1 = question; 0 = not question.
# Solution 1 (four sentences):
# mytarget = [1, 0, 1, 1]
# Solution 2 (active, five sentences):
mytarget = [
    1,  # 'Where is my cat?'
    0,  # 'Im ready for summer'
    1,  # 'Who wants dinner?'
    1,  # 'Where is the game?'
    0,  # 'The cake is a lie'
]
knn.fit(simple_train_dtm, mytarget)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [76]:
# Predict the label of the test sentence (1 = question; 0 = not question).
# The document-term matrix built from simple_test is passed directly instead
# of a hand-typed row. A literal vector has to be re-typed whenever the
# vocabulary changes (13 columns for Solution 1, 15 for Solution 2) and can
# silently fall out of step with it; simple_test_dtm holds exactly the values
# the Solution 2 literal spelled out, so the prediction is unchanged.
# The same call serves both Solution 1 and Solution 2.
knn.predict(simple_test_dtm)

array([1])