# Classification with scikit-learn
More details at http://scikit-learn.org

## Very simple classification example

In [1]:
from sklearn import datasets
import sklearn.metrics as metrics

### Dataset
This dataset consists of the petal and sepal lengths and widths of 3 different types of irises (Setosa, Versicolour, and Virginica), stored in a 150x4 numpy.ndarray.

The rows are the samples and the columns are: Sepal Length, Sepal Width, Petal Length and Petal Width.

In [2]:
# Load the iris data set that ships with scikit-learn, then separate the
# feature matrix from the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [None]:
# Peek at the first ten rows of the feature matrix.
X[:10]

In [None]:
# Display the target (class label) array loaded from the iris data set.
y

In [None]:
# Shape of the label array.
y.shape

### Gaussian Naive Bayes

In [6]:
from sklearn.naive_bayes import GaussianNB
# Create the Gaussian Naive Bayes classifier; no arguments are passed, so the
# library defaults apply.
gnb = GaussianNB()

In [7]:
# Train on the complete iris data (no held-out split is made in this section).
model = gnb.fit(X, y)

In [None]:
# Score the fitted model on the same samples it was trained on.
y_pred = model.predict(X)
n_wrong = (y != y_pred).sum()
print("Mislabeled points: %d out of %d" % (n_wrong, X.shape[0]))
print("Accuracy: ", metrics.accuracy_score(y, y_pred))
# Precision, recall and F1 are macro-averaged over the classes.
for label, score_fn in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1-measure: ", metrics.f1_score),
):
    print(label, score_fn(y, y_pred, average="macro"))

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
# Allow up to 200 solver iterations for this fit.
lr = LogisticRegression(max_iter=200)
# Train on the complete iris data; `model` replaces the previously fitted classifier.
model=lr.fit(X, y)

In [None]:
# Predict on the training samples and report the usual classification metrics.
y_pred = model.predict(X)
n_wrong = (y != y_pred).sum()
print("Mislabeled points: %d out of %d" % (n_wrong, X.shape[0]))
print("Accuracy: ", metrics.accuracy_score(y, y_pred))
# The remaining scores share the same call signature (macro average).
for label, score_fn in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1-measure: ", metrics.f1_score),
):
    print(label, score_fn(y, y_pred, average="macro"))

### Support Vector Machine (SVM)

In [None]:
from sklearn import svm
# Support vector classifier created without arguments (library defaults).
clf = svm.SVC()
# Train on the complete iris data; `model` replaces the previously fitted classifier.
model = clf.fit(X, y)

In [None]:
# Evaluate the SVM on the samples it was fitted on.
y_pred = model.predict(X)
n_wrong = (y != y_pred).sum()
print("Mislabeled points: %d out of %d" % (n_wrong, X.shape[0]))
print("Accuracy: ", metrics.accuracy_score(y, y_pred))
# Macro-averaged precision / recall / F1.
for label, score_fn in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1-measure: ", metrics.f1_score),
):
    print(label, score_fn(y, y_pred, average="macro"))

## Text classification

In [14]:
import csv
import collections
import numpy as np

### Loading the dataset

In [None]:
# Label names; the position in this tuple is the numeric class id (ham=0, spam=1).
classes = ('ham', 'spam')
textos = []  # raw message texts
y = []       # numeric label for each message, aligned with `textos`

# The file is read as tab-separated text whose first row supplies the column
# names ("class" and "text").
# NOTE(review): the corpus is assumed to be raw tab-separated text rather than
# CSV-quoted data, so quoting is disabled. With the csv default, a message that
# starts with an unbalanced double quote opens a "quoted field" and swallows the
# following lines, silently merging several messages into one.
# newline="" lets the csv module handle line endings itself, as its docs advise.
with open("../data/SMSSpamCollection.txt", encoding="utf-8", newline="") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        # Anything that is not labelled 'spam' is treated as ham (0).
        y.append(1 if row["class"] == classes[1] else 0)
        textos.append(row["text"])

len(textos)

### Document Representation

In [None]:
# Show the first ten raw messages.
textos[:10]

In [17]:
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

# One bag of words per message: a Counter mapping each token to the number of
# times it occurs in that message.
docs = [collections.Counter(tokenizer.tokenize(t)) for t in textos]

In [None]:
# Inspect the token counts of the second message.
docs[1]

In [19]:
voc_length = 3000

# Corpus-wide statistics:
#   tf - total number of occurrences of each token
#   df - number of documents in which each token appears
tf = collections.Counter()
df = collections.Counter()
for d in docs:
    tf.update(d)         # add this document's counts
    df.update(d.keys())  # each distinct token counts once per document

# Inverse document frequency, kept only for tokens seen more than twice overall.
n_docs = len(docs)
idfs = {w: np.log(n_docs / df[w]) for w, total in tf.items() if total > 2}

# The vocabulary is the voc_length tokens with the highest idf.
ranked = sorted(idfs, key=idfs.get, reverse=True)
voc = ranked[:voc_length]

In [None]:
# Print the whole selected vocabulary, followed by its size
# (at most voc_length entries).
print(voc)
print(len(voc))

In [21]:
# Map every vocabulary token to its column position, using alphabetical order.
indice = {w: i for i, w in enumerate(sorted(voc))}

In [22]:
# Turn every document into a dense count vector with one slot per vocabulary
# token; tokens outside the vocabulary are ignored.
docrep = []
for d in docs:
    valores = np.zeros(len(voc))
    for w, count in d.items():
        column = indice.get(w)
        if column is not None:
            valores[column] = count
    docrep.append(valores)

In [None]:
# Show the count vectors of the first ten documents.
docrep[:10]

### Dataset definition

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 20% of the documents for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    docrep,
    y,
    test_size=0.2,
    random_state=42,
)

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics

# Fit on the training split, then score on the held-out test split.
nb = MultinomialNB()
model = nb.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

n_wrong = (y_test != y_test_pred).sum()
print("Mislabeled points: %d out of %d" % (n_wrong, len(y_test)))
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
# Precision, recall and F1 are macro-averaged over the two classes.
for label, score_fn in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1-measure: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_pred, average="macro"))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

# Fit on the training split, then score on the held-out test split.
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

n_wrong = (y_test != y_test_pred).sum()
print("Mislabeled points: %d out of %d" % (n_wrong, len(y_test)))
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
# Precision, recall and F1 are macro-averaged over the two classes.
for label, score_fn in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1-measure: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_pred, average="macro"))

### SVM

In [None]:
import sklearn.metrics as metrics
from sklearn.svm import LinearSVC

# Linear SVM capped at 500 iterations; fit on the training split and score on
# the held-out test split.
svmc = LinearSVC(max_iter=500)
model = svmc.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

n_wrong = (y_test != y_test_pred).sum()
print("Mislabeled points: %d out of %d" % (n_wrong, len(y_test)))
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
# Precision, recall and F1 are macro-averaged over the two classes.
for label, score_fn in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1-measure: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_pred, average="macro"))

## Using NLTK
Please be aware that NLTK is usually slower than scikit-learn.

### A very simple example

In [38]:
import nltk
from nltk.classify import maxent

In [None]:
# Toy training set of (featureset, label) pairs: examples labelled 'y' have
# feature values close to 1, examples labelled 'x' have values close to 5.
train = [
 ({'a': 1, 'b': 1, 'c': 1}, 'y'),
 ({'a': 5, 'b': 5, 'c': 5}, 'x'),
 ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'),
 ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'),
 ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'),
 ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x')
]
# Unlabelled featuresets to classify once the model is trained.
test = [
  {'a': 1, 'b': 0.8, 'c': 1.2},
  {'a': 5.2, 'b': 5.1, 'c': 5}
]
# Train a maximum-entropy classifier, capped at 10 training iterations.
classifier = maxent.MaxentClassifier.train(train, max_iter=10)

In [None]:
# Accuracy measured on the training examples themselves.
nltk.classify.accuracy(classifier,train)

In [None]:
# Predict a label for each of the unlabelled test featuresets.
classifier.classify_many(test)

### NLTK for text classification

This assumes that the documents have already been processed (see the Document Representation steps above).

In [42]:
import nltk
from nltk.classify import naivebayes
from nltk.classify import maxent
import random

In [None]:
# Build NLTK-style (featureset, label) pairs. Each featureset maps the tokens
# of one document to their counts, keeping only tokens that belong to the
# selected vocabulary (the keys of `indice`); the label is the class name.
newdocrep = [
    ({w: count for w, count in d.items() if w in indice}, classes[c])
    for d, c in zip(docs, y)
]

# Shuffle with a fixed seed so that the positional 80/20 train/test split taken
# from this list is the same on every run. A dedicated Random instance is used
# so the global random state is left untouched.
random.Random(42).shuffle(newdocrep)
len(newdocrep)

#### Naive Bayes

In [None]:
# Train on the first 80% of the shuffled examples and score on the remainder.
split_at = int(len(newdocrep) * .8)
nbc = naivebayes.NaiveBayesClassifier.train(newdocrep[:split_at])
nltk.classify.accuracy(nbc, newdocrep[split_at:])

#### Maximum Entropy (Logistic Regression)

In [None]:
# Train on the first 80% of the shuffled examples (10 iterations, verbose
# trace) and score on the remainder.
split_at = int(len(newdocrep) * .8)
mec = maxent.MaxentClassifier.train(
    newdocrep[:split_at],
    max_iter=10,
    trace=3,
)
nltk.classify.accuracy(mec, newdocrep[split_at:])

## TextBlob

TextBlob also lets you train a classifier (optional).

Additional references: [Tutorial: Building a Text Classification System](https://textblob.readthedocs.io/en/dev/classifiers.html#loading-data-and-creating-a-classifier)

In [None]:
%pip install textblob

In [49]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [50]:
# Tiny sentiment training set of (sentence, label) pairs:
# five labelled 'pos' and five labelled 'neg'.
train = [
     ('I love this sandwich.', 'pos'),
     ('this is an amazing place!', 'pos'),
     ('I feel very good about these beers.', 'pos'),
     ('this is my best work.', 'pos'),
     ("what an awesome view", 'pos'),
     ('I do not like this restaurant', 'neg'),
     ('I am tired of this stuff.', 'neg'),
     ("I can't deal with this", 'neg'),
     ('he is my sworn enemy!', 'neg'),
     ('my boss is horrible.', 'neg')
 ]

In [51]:
# Build TextBlob's Naive Bayes classifier from the labelled sentences.
cl = NaiveBayesClassifier(train)

In [None]:
# Classify a sentence that is not part of the training set.
cl.classify("This is an amazing library!")