# Working with Datasets using Python
Using default datasets, datasets from code or loading from a file

## Exercise 1: Classification using Numerical Data

In [33]:
# load the default iris dataset which comes preloaded with Anaconda 
from sklearn.datasets import load_iris
iris = load_iris() # default is a numpy() array

In [34]:
# Output the shape of the Iris dataset 
print(iris.data.shape) # feature matrix (features are also predictors, inputs or attributes)
print(iris.target.shape) # response vector (response is also target, label or output)

(150, 4)
(150,)


In [35]:
# Load the iris dataset of 150 records into a DataFrame 
# DataFrame is similar to an Excel spreadsheet or database table
import pandas as pd
pd.DataFrame(iris.data, columns=iris.feature_names).head(5)
# Columns represent features (also known as predictors, inputs or attributes)
# Rows represent observations (also known as samples, instances or records)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [36]:
# Display the response vectors (also known as targets, labels or outputs)
print(iris.target)
# This is a classification dataset where responses represent 3 classes (0, 1, 2)
# This is not a regression problem (values would be plotted on an x and y axis)
# To build a model, features must be numeric because ML models do math & can't deal with raw text
# and every observation must have same features in the same order

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [40]:
# Scikit-learn modeling pattern includes 4 steps:
# 1) Import
# 2) Instatiate
# 3) Fit
# 4) Predict
# Here are the first 3 steps:

# import the class
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model (with the default parameters)
knn = KNeighborsClassifier()

# fit the model with data (occurs in-place)
knn.fit(iris.data, iris.target)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [41]:
# predict the response for a new observation
# must have the same features as the training observation, both in number and in meaning
knn.predict([[3, 5, 4, 2]])

array([1])

## Exercise 2 - Classification by representing Text as Numerical Data

In [67]:
# example text for model training (Tweets messages)
simple_train = ['Where is my cat?', 'Im ready for summer', 'Who wants dinner?', 'Where is the game?']

In [68]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [69]:
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [70]:
# examine the fitted vocabulary
vect.get_feature_names()

['cat',
 'dinner',
 'for',
 'game',
 'im',
 'is',
 'my',
 'ready',
 'summer',
 'the',
 'wants',
 'where',
 'who']

In [71]:
# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<4x13 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [72]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

array([[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [73]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cat,dinner,for,game,im,is,my,ready,summer,the,wants,where,who
0,1,0,0,0,0,1,1,0,0,0,0,1,0
1,0,0,1,0,1,0,0,1,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,1,0,1,0,0,0,1,0,1,0


In [74]:
# check the type of the document-term matrix
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [75]:
# examine the sparse matrix contents
print(simple_train_dtm)

  (0, 0)	1
  (0, 5)	1
  (0, 6)	1
  (0, 11)	1
  (1, 2)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (2, 1)	1
  (2, 10)	1
  (2, 12)	1
  (3, 3)	1
  (3, 5)	1
  (3, 9)	1
  (3, 11)	1


In [76]:
# example text for model testing
simple_test = ["where is up?"]

In [77]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [78]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cat,dinner,for,game,im,is,my,ready,summer,the,wants,where,who
0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [89]:


# fit the model with data (occurs in-place)
myarray = [1, 0, 1, 1]
knn.fit(simple_train_dtm, myarray)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [90]:
knn.predict([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]) 

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 5