In [2]:
! pip install creme

Collecting creme
  Downloading creme-0.6.1.tar.gz (524 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.7/524.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mmh3==2.5.1 (from creme)
  Downloading mmh3-2.5.1.tar.gz (9.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: creme, mmh3
  Building wheel for creme (setup.py) ... [?25l[?25hdone
  Created wheel for creme: filename=creme-0.6.1-cp310-cp310-linux_x86_64.whl size=1233932 sha256=2f27515a1fe48320ff48badbade6a689e5c1cc46a7969a7bd0d37e745114db70
  Stored in directory: /root/.cache/pip/wheels/55/7a/6c/8156f131ab47128de819b7c50ecd442a7cca65b65ccc7559bb
  Building wheel for mmh3 (setup.py) ... [?25l[?25hdone
  Created wheel for mmh3: filename=mmh3-2.5.1-cp310-cp310-linux_x86_64.whl size=32613 sha256=722aedfc9479d3438457cbe79f55a5a7c0c9cb05deeb33d6e013a6ead673b430
  Stored in directory: /root/.cach

In [19]:
import math
import creme
from creme import compose
from creme import feature_extraction
from creme import naive_bayes


In [20]:
# Creme will only take Train, Test Combined data into Tuples Unlike Sklearn.
docs = [('Chinese Beijing Chinese', 'yes'), ('Chinese Chinese Shanghai', 'yes'), ('Chinese Macao', 'yes'), ('Tokyo Japan Chinese', 'no')]

In [21]:
model = compose.Pipeline(
    ('tokenize', feature_extraction.BagOfWords(lowercase=False)),
    ('nb', naive_bayes.MultinomialNB(alpha=1))
)

In [22]:
## This is just an example :
corpus = [
      'This is the first document.',
      'This document is the second document.',
      'And this is the third one.',
      'Is this the first document?',
]
bow = creme.feature_extraction.BagOfWords()
for sentence in corpus:
  print(bow.transform_one(sentence))



Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1})
Counter({'document': 2, 'this': 1, 'is': 1, 'the': 1, 'second': 1})
Counter({'and': 1, 'this': 1, 'is': 1, 'the': 1, 'third': 1, 'one': 1})
Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})


In [23]:
# Training all document one by one
%%time
for sentence, label in docs:
  model = model.fit_one(sentence, label)

CPU times: user 233 µs, sys: 0 ns, total: 233 µs
Wall time: 241 µs


In [24]:
new_unseen_text = 'Tokyo india'
model.predict_one(new_unseen_text)

'no'

In [25]:
# Training on a new text data and New category(label)
model.fit_one('India USA', 'may be')

Pipeline (
  BagOfWords (
    on=None
    strip_accents=True
    lowercase=False
    preprocessor=None
    tokenizer=<built-in method findall of re.Pattern object at 0x79446aa44580>
    ngram_range=(1, 1)
  ),
  MultinomialNB (
    alpha=1
  )
)

In [26]:
model.predict_one("India Germany")

'may be'

In [48]:
# Creme With Naive Bayes & logistic Regression on Email Spam Detection(Huge Data):
import pandas as pd

messages = pd.read_csv('./SMSSpamCollection', sep='\t', names=["label", "message"])
messages.shape

(5572, 2)

In [49]:
from sklearn.model_selection import train_test_split

message_train, message_test = train_test_split(messages)

In [50]:
message_train

Unnamed: 0,label,message
5083,ham,Aiya we discuss later lar... Pick ü up at 4 is...
3707,ham,Reading gud habit.. Nan bari hudgi yorge patai...
6,ham,Even my brother is not like to speak with me. ...
3381,ham,Just finished. Missing you plenty
5249,ham,"K I'm leaving soon, be there a little after 9"
...,...,...
4274,ham,Kind of. Just missed train cos of asthma attac...
4523,ham,DO U WANT 2 MEET UP 2MORRO
3322,ham,Yo im right by yo work
304,ham,He is a womdarfull actor


In [51]:
# Convert Dataframe to List of Tuples :
message_train = message_train.to_records(index=False)
message_test = message_test.to_records(index=False)



In [52]:
message_train

rec.array([('ham', 'Aiya we discuss later lar... Pick ü up at 4 is it?'),
           ('ham', 'Reading gud habit.. Nan bari hudgi yorge pataistha ertini kano:-)'),
           ('ham', 'Even my brother is not like to speak with me. They treat me like aids patent.'),
           ..., ('ham', 'Yo im right by yo work'),
           ('ham', 'He is a womdarfull actor'),
           ('ham', 'Spending new years with my brother and his family. Lets plan to meet next week. Are you ready to be spoiled? :)')],
          dtype=[('label', 'O'), ('message', 'O')])

In [53]:
# Creating the pipeline
# 1st function is creating the TFIDF
# 2nd function is the naive bayes predictor

model = compose.Pipeline(
    ('tokenize', feature_extraction.TFIDF(lowercase=False)),
    ('nb', naive_bayes.MultinomialNB(alpha=1))

)

In [54]:
from creme import metrics
metric = metrics.Accuracy()
# Training the model row by row :

for label, sentence in message_train:
  model = model.fit_one(sentence, label)
  y_pred = model.predict_one(sentence)
  metric = metric.update(label, y_pred)
  print(metric)


Accuracy: 100.00%
Accuracy: 100.00%
Accuracy: 100.00%
Accuracy: 100.00%
Accuracy: 100.00%
Accuracy: 100.00%
Accuracy: 85.71%
Accuracy: 87.50%
Accuracy: 88.89%
Accuracy: 90.00%
Accuracy: 90.91%
Accuracy: 91.67%
Accuracy: 92.31%
Accuracy: 92.86%
Accuracy: 93.33%
Accuracy: 93.75%
Accuracy: 94.12%
Accuracy: 94.44%
Accuracy: 94.74%
Accuracy: 95.00%
Accuracy: 95.24%
Accuracy: 95.45%
Accuracy: 95.65%
Accuracy: 95.83%
Accuracy: 96.00%
Accuracy: 96.15%
Accuracy: 96.30%
Accuracy: 96.43%
Accuracy: 96.55%
Accuracy: 96.67%
Accuracy: 96.77%
Accuracy: 96.87%
Accuracy: 96.97%
Accuracy: 97.06%
Accuracy: 97.14%
Accuracy: 94.44%
Accuracy: 94.59%
Accuracy: 94.74%
Accuracy: 94.87%
Accuracy: 95.00%
Accuracy: 95.12%
Accuracy: 95.24%
Accuracy: 95.35%
Accuracy: 95.45%
Accuracy: 93.33%
Accuracy: 93.48%
Accuracy: 93.62%
Accuracy: 93.75%
Accuracy: 93.88%
Accuracy: 94.00%
Accuracy: 94.12%
Accuracy: 94.23%
Accuracy: 94.34%
Accuracy: 94.44%
Accuracy: 94.55%
Accuracy: 94.64%
Accuracy: 94.74%
Accuracy: 94.83%
Accuracy

In [55]:
# Training data Accuracy :

metric

Accuracy: 95.88%

In [56]:
# Test data accuracy :
test_metric = metrics.Accuracy()
# Training the model row by row :

for label, sentence in message_test:
  # model = model.fit_one(sentence, label)
  y_pred = model.predict_one(sentence)
  test_metric = metric.update(label, y_pred)


In [57]:
test_metric

Accuracy: 96.05%

In [64]:
# Train on new Category and Email Text Data :

model.fit_one("United States", "neutral")
model.fit_one("Everyone is neutral", "ham")


Pipeline (
  TFIDF (
    normalize=True
    on=None
    strip_accents=True
    lowercase=False
    preprocessor=None
    tokenizer=<built-in method findall of re.Pattern object at 0x79446aa44580>
    ngram_range=(1, 1)
  ),
  MultinomialNB (
    alpha=1
  )
)

In [61]:
# model.predict_one("This guy is neutral")
model.predict_one("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's")

'spam'

In [70]:
model.predict_one("Free entry")

'spam'

In [71]:
# Applying Logistic Regression:
from creme import datasets
from creme import linear_model
from creme import preprocessing

X_y = datasets.Phishing()

model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression()
)
metric = metrics.Accuracy()

for x, y in X_y:
  y_pred = model.predict_one(x)
  metric = metric.update(y, y_pred)
  model = model.fit_one(x, y)

metric


Accuracy: 89.20%

In [73]:
import pandas as pd

phishing_df = pd.DataFrame(X_y)
print(phishing_df.head())

                                                   0      1
0  {'empty_server_form_handler': 0.0, 'popup_wind...   True
1  {'empty_server_form_handler': 1.0, 'popup_wind...   True
2  {'empty_server_form_handler': 0.0, 'popup_wind...   True
3  {'empty_server_form_handler': 0.0, 'popup_wind...   True
4  {'empty_server_form_handler': 1.0, 'popup_wind...  False


In [74]:
print(X_y)

Phishing dataset

              Task  Binary classification                                                 
 Number of samples  1,250                                                                 
Number of features  9                                                                     
            Sparse  False                                                                 
              Path  /usr/local/lib/python3.10/dist-packages/creme/datasets/phishing.csv.gz
