# Guessing the number: linear regression

## Using more variables

In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

def load_california_housing_data():
    """Fetch the California housing dataset as pandas objects.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame built from the
        fetched dataset's ``data`` with its ``feature_names`` as column
        labels, and ``y`` is a Series built from the dataset's ``target``
        and named ``"target"``.
    """
    housing = fetch_california_housing()
    features = pd.DataFrame(housing.data, columns=housing.feature_names)
    target = pd.Series(housing.target, name="target")
    return features, target

# Load the housing feature table (X) and target series (y) for the
# regression cells that follow.
X, y = load_california_housing_data()

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-stage pipeline: standardize every feature first, then fit an
# ordinary linear regression on the scaled values. The step names
# ('scaler', 'model') are how the stages are looked up afterwards.
regression = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

regression.fit(X, y)

In [3]:
# Score the fitted pipeline on the same X and y it was trained on, so
# this is an in-sample figure rather than a held-out estimate.
score = regression.score(X, y)
print(f"{score:.3f}")

0.606


In [4]:
# Pair each input column with the weight learned by the pipeline's
# 'model' step. The scaler runs before the model, so these weights apply
# to the standardized features.
for feature, coefficient in zip(
        X.columns, regression.named_steps['model'].coef_):
    print(f"{feature:12}: {coefficient:>7.3f}")

MedInc      :   0.830
HouseAge    :   0.119
AveRooms    :  -0.266
AveBedrms   :   0.306
Population  :  -0.005
AveOccup    :  -0.039
Latitude    :  -0.900
Longitude   :  -0.871


# Moving to Logistic Regression

## Applying logistic regression

In [5]:
import pandas as pd

def load_palmer_penguins(only_numeric=True, 
                         no_missing=True, 
                         multiclass=True):
    """Download the Palmer penguins CSV and return features and target.

    Args:
        only_numeric: When true, return only the four numeric measurement
            columns; otherwise also include ``island`` and ``sex``.
        no_missing: When true, drop every row that has a missing value in
            any column of the CSV before anything else is computed.
        multiclass: When true, encode species as Adelie=1, Gentoo=2,
            Chinstrap=3. When false, encode Adelie=1 and both other
            species as 0 (a binary "is Adelie" target).

    Returns:
        tuple: ``(features, target)`` where ``features`` is a DataFrame
        with the selected columns and ``target`` is the integer-coded
        ``species`` Series sharing the same index.
    """
    url = ("https://raw.githubusercontent.com/"
           "allisonhorst/palmerpenguins/main/"
           "inst/extdata/penguins.csv")
    numeric_features = ["bill_length_mm", 
                        "bill_depth_mm", 
                        "flipper_length_mm", 
                        "body_mass_g"]
    categorical_features = ["island", "sex"]
    data = pd.read_csv(url)
    if no_missing:
        # Applied to the full table, so a row missing only a categorical
        # value is dropped even when only numeric columns are returned.
        data = data.dropna()
    if multiclass:
        species_codes = {'Adelie': 1, 'Gentoo': 2, 'Chinstrap': 3}
    else:
        species_codes = {'Adelie': 1, 'Gentoo': 0, 'Chinstrap': 0}
    # map() builds the integer codes directly. replace() on a string
    # column depends on pandas silently downcasting object -> int, which
    # is deprecated and would leave an object-dtype target.
    # A species name outside the mapping becomes NaN.
    target = data.species.map(species_codes)
    if only_numeric:
        selected = numeric_features
    else:
        selected = numeric_features + categorical_features
    return data[selected], target
    
# Binary setup: numeric measurements only, rows with missing values
# dropped, and multiclass=False so the target is 1 for Adelie and 0 for
# the other two species.
X, y = load_palmer_penguins(only_numeric=True, 
                            no_missing=True, 
                            multiclass=False)

In [6]:
from sklearn.linear_model import LogisticRegression

# Binary classifier: standardize the features, then fit a logistic
# regression on the scaled values.
logistic = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())])

# Train on every row except the last, which is held out as a single
# unseen example.
logistic.fit(X.iloc[:-1], y.iloc[:-1])

# Double brackets keep the held-out row as a one-row DataFrame, the 2-D
# shape that predict() and predict_proba() expect.
excluded_row = X.iloc[[-1]]
pred = logistic.predict(excluded_row)
proba = logistic.predict_proba(excluded_row)
print(f"Predicted class {pred[0]}, real class " +
      f"{y.iloc[-1]}")
# predict_proba returns one column per class, ordered as in
# logistic.classes_. Column 0 is therefore the probability of class 0
# only, which is the wrong number whenever class 1 is predicted. The row
# maximum is the probability of whichever class predict() chose.
print(f"with probability {proba[0].max():.3f}")

Predicted class 0, real class 0
with probability 0.987


## Considering the case when there are more classes

In [7]:
from sklearn.model_selection import train_test_split

# Reload the penguins with the three-class target (multiclass=True codes
# the species as 1, 2 and 3), replacing the binary X and y from above.
X, y = load_palmer_penguins(only_numeric=True, 
                            no_missing=True, 
                            multiclass=True)
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [8]:
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.multiclass import OneVsOneClassifier
# Wrap the binary logistic pipeline in each multiclass strategy and
# train both on the same training split.
ovr = OneVsRestClassifier(logistic)
ovr.fit(X_train, y_train)
ovo = OneVsOneClassifier(logistic)
ovo.fit(X_train, y_train)

# Held-out accuracy of each strategy.
print('One vs rest accuracy: %.3f' % ovr.score(X_test, y_test))
print('One vs one accuracy: %.3f' % ovo.score(X_test, y_test))

One vs rest accuracy: 0.973
One vs one accuracy: 0.982


# Making Things as Simple as Naïve Bayes

## Predicting text classifications

In [9]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text \
    import CountVectorizer
import sklearn.feature_extraction.text as txt
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

# Fetch the train and test subsets with identical settings, asking the
# loader to remove headers, footers and quotes from every post. The
# generator yields the 'train' subset first, then 'test'.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split,
                       remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test'))

In [10]:
# One vectorizer per representation, both dropping English stop words:
# binary=False keeps raw term counts, binary=True records only whether a
# term is present.
multinomial_vectorizer = CountVectorizer(stop_words='english',
                                         binary=False)
binary_vectorizer = CountVectorizer(stop_words='english',
                                    binary=True)

# Both Naive Bayes variants use the same alpha value.
multinomial_nb = MultinomialNB(alpha=0.01)
bernoulli_nb = BernoulliNB(alpha=0.01)

In [11]:
train_targets = newsgroups_train.target
test_targets = newsgroups_test.target

# Learn each vocabulary on the training posts only, then reuse it
# unchanged on the test posts so both splits share the same columns.
# CountVectorizer produces term counts, which are never negative, so no
# absolute-value pass is needed on its output. Wrapping it in np.abs only
# made an extra copy of each sparse matrix.
multinomial_X = multinomial_vectorizer.fit_transform(
    newsgroups_train.data)
multinomial_Xt = multinomial_vectorizer.transform(
    newsgroups_test.data)
binary_X = binary_vectorizer.fit_transform(
    newsgroups_train.data)
binary_Xt = binary_vectorizer.transform(
    newsgroups_test.data)

In [12]:
# Train each Naive Bayes variant on the representation built for it.
bernoulli_nb.fit(binary_X, train_targets)
multinomial_nb.fit(multinomial_X, train_targets)

# Score each model on the test posts, using its own test matrix.
for name, model, data in (
        ('BernoulliNB', bernoulli_nb, binary_Xt),
        ('MultinomialNB', multinomial_nb, multinomial_Xt)):
    accuracy = accuracy_score(test_targets, model.predict(data))
    print(f"Accuracy for {name}: {accuracy:.3f}")

Accuracy for BernoulliNB: 0.567
Accuracy for MultinomialNB: 0.653


In [13]:
print(f'training posts: {len(newsgroups_train.data)}')
# Collect every distinct token obtained by splitting the training posts
# on single spaces. The dict keys act as the set of tokens; the True
# values are placeholders.
D = dict.fromkeys(
    (word
     for post in newsgroups_train.data
     for word in post.split(' ')),
    True)
print(f'training words: {len(D)}')
print(f'test posts: {len(newsgroups_test.data)}')

training posts: 11314
training words: 300972
test posts: 7532


# Learning Lazily with Nearest Neighbors

## Predicting after observing neighbors

In [14]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the digits dataset and hold out 33% of the samples for testing;
# the fixed random_state makes the split repeatable. This rebinds the
# X_train/X_test/y_train/y_test names used by the penguin cells.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, 
    test_size=0.33, random_state=42)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
# Classify by a vote among the 5 nearest training samples. p is the
# Minkowski power parameter; per the scikit-learn docs, p=2 corresponds
# to Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=5, p=2)
knn.fit(X_train, y_train)

In [16]:
# Accuracy on the held-out digits, followed by the predicted and true
# labels for the last 15 test samples so they can be compared by eye.
print('Accuracy: %.3f' % knn.score(X_test, y_test) )
print(f"Prediction: {knn.predict(X_test[-15:,:])}")
print(f"Actual:     {y_test[-15:]}")

Accuracy: 0.993
Prediction: [2 1 1 2 2 4 8 7 5 8 8 9 4 9 0]
Actual:     [2 1 1 2 2 4 8 7 5 8 8 9 4 9 0]


## Choosing your k parameter wisely

In [17]:
# Refit the classifier with progressively larger neighborhoods and
# report the held-out accuracy for each k.
for k in (1, 3, 5, 7, 10, 50, 100):
    kNN = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    test_score = kNN.score(X_test, y_test)
    print(f"k= {k:3} \t accuracy= {test_score:.3f}")

k=   1 	 accuracy= 0.985
k=   3 	 accuracy= 0.990
k=   5 	 accuracy= 0.993
k=   7 	 accuracy= 0.990
k=  10 	 accuracy= 0.983
k=  50 	 accuracy= 0.929
k= 100 	 accuracy= 0.899
