In [2]:
import pandas as pd
import numpy as np

import mlflow
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Newsgroup categories to load from the 20 Newsgroups corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
# Load the training split for the selected categories; the fixed
# random_state keeps the shuffled document order the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)


In [5]:
# One row per post: the raw message text and its integer category label.
df = pd.DataFrame({
    'text': twenty_train['data'],
    'target': twenty_train['target'],
})
df

Unnamed: 0,text,target
0,From: sd345@city.ac.uk (Michael Collier)\nSubj...,1
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\...,1
2,From: djohnson@cs.ucsd.edu (Darin Johnson)\nSu...,3
3,From: s0612596@let.rug.nl (M.M. Zwart)\nSubjec...,3
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3
...,...,...
2252,From: roos@Operoni.Helsinki.FI (Christophe Roo...,2
2253,From: mhollowa@ic.sunysb.edu (Michael Holloway...,2
2254,From: sasghm@theseus.unx.sas.com (Gary Merrill...,2
2255,From: Dan Wallach <dwallach@cs.berkeley.edu>\n...,2


In [6]:
# Cap the vocabulary at 100 terms and drop English stop words, then learn
# the vocabulary/IDF weights and vectorize the corpus in one pass.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=100,
)
X = vectorizer.fit_transform(df['text'])

In [7]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations
# so the cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Wrap the sparse TF-IDF matrix in a DataFrame with one column per vocabulary term.
features = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names)
features

Unnamed: 0,10,1993,apr,article,atheists,available,believe,better,bible,book,...,used,ve,version,want,way,work,world,writes,wrong,years
0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.221158,0.000000,0.000000,0.000000,0.0,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.096475,0.000000,0.0,0.000000,...,0.000000,0.294818,0.00000,0.414974,0.000000,0.331031,0.000000,0.000000,0.0,0.218393
3,0.215832,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.401030,0.417452,0.240445,0.0,0.000000,0.185886,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.108546,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2252,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2253,0.000000,0.112014,0.000000,0.000000,0.0,0.000000,0.000000,0.250581,0.0,0.000000,...,0.115179,0.000000,0.00000,0.000000,0.000000,0.237538,0.100850,0.000000,0.0,0.117535
2254,0.000000,0.000000,0.000000,0.000000,0.0,0.154892,0.000000,0.000000,0.0,0.291468,...,0.000000,0.116993,0.00000,0.000000,0.000000,0.262726,0.111544,0.000000,0.0,0.129998
2255,0.034714,0.092489,0.000000,0.055454,0.0,0.231264,0.000000,0.103451,0.0,0.036265,...,0.063402,0.116452,0.18612,0.092201,0.135640,0.196134,0.027757,0.000000,0.0,0.000000


In [8]:
# Ten terms with the highest mean TF-IDF weight across all documents.
(
    features
    .mean()
    .sort_values(ascending=False)
    .iloc[:10]
    .index
)

Index(['edu', 'com', 'subject', 'god', 'lines', 'organization', 'writes',
       'article', 'university', 'people'],
      dtype='object')

In [29]:
class CustomVectorizer(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model returning pairwise cosine distances between texts.

    A TF-IDF vectorizer is fitted once at construction time. ``predict`` then
    vectorizes two collections of documents with that fitted vocabulary and
    returns the cosine distance between every document of the first
    collection and every document of the second.
    """

    def __init__(self, data, max_features=100, stop_words='english'):
        """Fit the TF-IDF vectorizer on a training corpus.

        Parameters
        ----------
        data : iterable of str
            Raw documents used to learn the vocabulary and IDF weights.
        max_features : int or None, default 100
            Forwarded to ``TfidfVectorizer(max_features=...)``.
        stop_words : str, list or None, default 'english'
            Forwarded to ``TfidfVectorizer(stop_words=...)``.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words=stop_words,
        )
        self.vectorizer.fit(data)

    def predict(self, context, model_input, params=None):
        """Compute the cosine distance matrix between two document collections.

        Parameters
        ----------
        context
            MLflow model context; not used by this model.
        model_input
            Indexable object where ``model_input[0]`` and ``model_input[1]``
            are each an iterable of raw text documents.
        params : optional
            Not used. Accepted so the method can also be called with an extra
            inference-parameters argument (NOTE(review): intended for newer
            MLflow releases that pass ``params`` -- confirm against the
            installed version).

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(model_input[0]), len(model_input[1]))``
            holding ``1 - cosine_similarity``, clipped to ``[0, 2]``.
        """
        x1 = self.vectorizer.transform(model_input[0])
        x2 = self.vectorizer.transform(model_input[1])
        distances = 1 - cosine_similarity(x1, x2)
        # Floating-point rounding can push the similarity of (near-)identical
        # documents marginally above 1, which would yield a tiny negative
        # "distance"; clip to the valid cosine-distance range.
        return np.clip(distances, 0.0, 2.0)

In [30]:
cv = CustomVectorizer(df['text'])

# Seed both draws so the distance matrix is reproducible after a kernel
# restart; distinct seeds keep the two samples from being the same five posts.
left_docs = df['text'].sample(5, random_state=42)
right_docs = df['text'].sample(5, random_state=43)

# The context argument is not used by CustomVectorizer.predict, so a
# placeholder is passed when calling the model directly.
cv.predict('', [left_docs, right_docs])

array([[0.65985347, 0.59437366, 0.66930478, 0.95581443, 0.55521848],
       [0.75158592, 0.80396119, 0.96648561, 0.80002433, 0.8343296 ],
       [0.60159295, 0.61199393, 0.97492495, 0.95011594, 0.81072287],
       [0.98306216, 0.58782841, 0.92600413, 0.86903999, 0.72681283],
       [0.82588023, 0.63984274, 0.92105105, 0.96934926, 0.84427793]])

In [11]:
# `create_experiment` raises if an experiment with this name already exists,
# which breaks every re-run of the notebook. Look the experiment up first and
# only create it when it is missing.
experiment_name = 'custom_tfidf'
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(experiment_name)
else:
    exp_id = existing_experiment.experiment_id
exp_id

'1'

In [22]:
# Open a run inside the experiment selected above and show its handle.
active_run = mlflow.start_run(experiment_id=exp_id)
active_run

<ActiveRun: >

In [None]:
# Serialize the fitted custom model to a local directory in pyfunc format.
mlflow.pyfunc.save_model(
    python_model=cv,
    path='custom_tfidf_model',
)

In [27]:
# Log the fitted custom model as an artifact of the currently active run.
mlflow.pyfunc.log_model(
    python_model=cv,
    artifact_path='custom_tfidf_model',
)