## Supervised Learning - Regression

In regression, the target *y* is a real number (a continuous value).

In [21]:
from sklearn import linear_model
'''
+--------------+
| frev         |
+--------------+
| x1 | x2 | y  |
+----+----+----+
| 2  | 3  | 4  |
+----+----+----+
| 8  | 5  | 9  |
+----+----+----+
| 6  | 4  | 7  |
+----+----+----+
| 10 | 2  | 7  |
+----+----+----+
| 4  | 2  | 4  |
+----+----+----+
| 16 | 7  | 15 |
+----+----+----+
'''
# Observed rows from the table above, one (x1, x2, y) tuple per row.
observations = [
    (2, 3, 4),
    (8, 5, 9),
    (6, 4, 7),
    (10, 2, 7),
    (4, 2, 4),
    (16, 7, 15),
]
X = [[x1, x2] for x1, x2, _ in observations]  # feature rows [x1, x2]
Y = [y for _, _, y in observations]           # matching target values

# Fit/train a linear regression on the observed data. fit() returns the
# estimator itself, so `model` and `regression_model` name the same object.
model = linear_model.LinearRegression()
regression_model = model.fit(X, Y)

# The score (R^2 for LinearRegression) is computed on the very rows used for
# fitting, so it describes fit quality on the training data only.
print('Model score:', regression_model.score(X, Y))

# Use the fitted/trained model to predict y for any given x1, x2.
def predict(x1, x2):
    """Return the fitted model's prediction for one (x1, x2) pair, rounded to 2 decimals.

    Relies on the module-level ``regression_model`` having been fitted already.
    """
    single_row = [[x1, x2]]  # predict() is called with a 2-D batch holding one sample
    estimate = regression_model.predict(single_row)[0]
    return round(estimate, 2)

predict(8, 3)

Model score: 1.0


7.0

## Supervised Learning - Classification
In classification, the target *y* is a class/category (a discrete label).

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
# Import from the public package path: the private module
# sklearn.linear_model.logistic was deprecated in scikit-learn 0.22 and
# removed in 0.24, while this path works on old and new versions alike.
from sklearn.linear_model import LogisticRegression
import numpy as np

# Build a labelled dataset of documents from three template strings.
doc0, doc1, doc2 = "hello world", "foo bar", "lottery prize winner"

# Training set: 100 copies of doc0, then 100 copies of doc1, then 4 copies of doc2.
docs_train = [
    template
    for template, copies in ((doc0, 100), (doc1, 100), (doc2, 4))
    for _ in range(copies)
]

# Test set: two-word documents mixing words from the templates above.
docs_test = [
    "lottery winner",
    "hello foo",
    "hello bar",
    "lottery prize",
    "world foo",
    "prize winner",
]

# Convert documents to feature vectors (per-document word counts).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs_train).toarray()

# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), exists from 1.0 onwards. Use whichever this
# install provides, and convert to a plain list so the printout is the same.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(f'Features: {features}')

# Label every training document: 1.0 for copies of doc2 (the spam template),
# 0.0 for everything else. Deriving the labels from docs_train keeps them
# aligned with the documents instead of repeating the 100/100/4 counts and a
# hard-coded reshape(204,), which would break as soon as the dataset changes.
Y_train = np.array([1.0 if doc == doc2 else 0.0 for doc in docs_train])
X_test = vectorizer.transform(docs_test).toarray()

# Choose the model: logistic regression with its default settings.
classifier = LogisticRegression()

# Train the model on the vectorised training documents. fit() returns the
# estimator itself, so `classifier_model` is simply another name for `classifier`.
classifier.fit(X_train, Y_train)
classifier_model = classifier


# Test the model: predict whether each document in the test data is spam.
# NOTE(review): the saved output below is all zeros, even for 'lottery winner'.
# Training uses only 4 spam rows against 200 non-spam rows, so this looks like
# class imbalance -- confirm whether that is the intended lesson; if not,
# LogisticRegression(class_weight='balanced') may be worth trying.
predicted_labels = classifier_model.predict(X_test)
predictions = [predicted_labels]  # kept as a one-element list wrapping the array
print(f'Spam classification (1 means document is spam):\n====\n{predictions}\n====\n')

Features: ['bar', 'foo', 'hello', 'lottery', 'prize', 'winner', 'world']
Spam classification (1 means document is spam):
====
[array([0., 0., 0., 0., 0., 0.])]
====

