# Guessing the number: linear regression

## Using more variables

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import scale
housing = fetch_california_housing()
X = scale(housing.data)
y = housing.target

In [3]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression(normalize=True)
regression.fit(X, y)

LinearRegression()

In [5]:
print(regression.score(X, y))

0.606232685199805


In [6]:
print([a + ':' + str(round(b, 2)) for a, b in zip(
    housing.feature_names, regression.coef_,)])

['MedInc:0.83', 'HouseAge:0.12', 'AveRooms:-0.27', 'AveBedrms:0.31', 'Population:-0.0', 'AveOccup:-0.04', 'Latitude:-0.9', 'Longitude:-0.87']


In [8]:
print (housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

## Understanding limitations and potential problems

# Moving to Logistic Regression

## Applying logistic regression

In [21]:
from sklearn.datasets import load_iris
import numpy as np
np.set_printoptions(suppress=True)
iris = load_iris()
X, y = load_iris(return_X_y=True)

In [22]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(max_iter=200)
logistic.fit(X, y)
single_row_pred = logistic.predict(
iris.data[-1, :].reshape(1, -1))
single_row_pred_proba = logistic.predict_proba(
iris.data[-1, :].reshape(1, -1))
print ('Predicted class %s, real class %s'
       % (single_row_pred, iris.target[-1]))
rounded_single_row_pred_proba = np.round(single_row_pred_proba, 4)
print ('Probabilities for each class from 0 to 2: %s'
       % rounded_single_row_pred_proba)


Predicted class [2], real class 2
Probabilities for each class from 0 to 2: [[0.0005 0.2348 0.7647]]


## Considering when classes are more than two

In [23]:
from sklearn.datasets import load_digits
digits = load_digits()
train = range(0, 1700)
test = range(1700, len(digits.data))
X = digits.data[train]
y = digits.target[train]
tX = digits.data[test]
ty = digits.target[test]

In [28]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
OVR = OneVsRestClassifier(LogisticRegression(max_iter=2000)).fit(X, y)
OVO = OneVsOneClassifier(LogisticRegression(max_iter=2000)).fit(X, y)
print('One vs rest accuracy: %.3f' % OVR.score(tX, ty))
print('One vs one accuracy: %.3f' % OVO.score(tX, ty))

One vs rest accuracy: 0.959
One vs one accuracy: 0.979


# Making Things as Simple as Naïve Bayes

## Predicting text classifications

In [30]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(
    subset='train', 
    remove=('headers', 'footers','quotes'))
newsgroups_test = fetch_20newsgroups(
    subset='test', 
    remove=('headers', 'footers','quotes'))

In [32]:
print('number of posts in training: %i'
% len(newsgroups_train.data))
D={word:True for post in newsgroups_train.data
for word in post.split(' ')}
print('number of distinct words in training: %i'
% len(D))
print('number of posts in test: %i'
% len(newsgroups_test.data))

number of posts in training: 11314
number of distinct words in training: 300972
number of posts in test: 7532


In [33]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)

In [34]:
import sklearn.feature_extraction.text as txt
multinomial = txt.HashingVectorizer(stop_words='english',
binary=False, norm=None)
binary = txt.HashingVectorizer(stop_words='english',
binary=True, norm=None)


In [37]:
import numpy as np
target = newsgroups_train.target
target_test = newsgroups_test.target
multi_X = np.abs(
multinomial.transform(newsgroups_train.data))
multi_Xt = np.abs(
multinomial.transform(newsgroups_test.data))
bin_X = binary.transform(newsgroups_train.data)
bin_Xt = binary.transform(newsgroups_test.data)
Multinomial.fit(multi_X, target)
Bernoulli.fit(bin_X, target)
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
for name, model, data in [('BernoulliNB', Bernoulli, bin_Xt),
    ('MultinomialNB', Multinomial, multi_Xt)]:
    accuracy = accuracy_score(y_true=target_test,
                              y_pred=model.predict(data))
    print ('Accuracy for %s: %.3f' % (name, accuracy))

Accuracy for BernoulliNB: 0.570
Accuracy for MultinomialNB: 0.651


# Exploring Lazy Learning with K-nearest Neighbors

## Predicting after observing neighbors

In [44]:
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
digits = load_digits()
train = range(0, 1700)
test = range(1700, len(digits.data))
pca = PCA(n_components = 25)
pca.fit(digits.data[train])
X = pca.transform(digits.data[train])
y = digits.target[train]
tX = pca.transform(digits.data[test])
ty = digits.target[test]

In [45]:
print('Accuracy: %.3f' % kNN.score(tX,ty) )
print('Prediction: %s Actual: %s'
% (kNN.predict(tX[-15:,:]),ty[-15:]))


Accuracy: 0.907
Prediction: [2 2 5 7 9 5 4 8 1 4 9 0 8 9 8] Actual: [2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]


## Choosing wisely your k parameter

In [43]:
for k in [1, 5, 10, 50, 100, 200]:
    kNN = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    print('for k = %3i accuracy is %.3f' 
          % (k, kNN.score(tX, ty)))

for k =   1 accuracy is 0.979
for k =   5 accuracy is 0.990
for k =  10 accuracy is 0.969
for k =  50 accuracy is 0.959
for k = 100 accuracy is 0.959
for k = 200 accuracy is 0.907
