In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Tokenizer regex for source-code text. One capturing group with three alternatives:
#   1. an identifier-like word: a letter or underscore followed by word characters;
#   2. a run of one or more operator symbols
#      (! # $ % & * + : - . / < = > ? @ \ ^ _ | ~);
#   3. a single space, tab, parenthesis, comma, semicolon, brace, bracket,
#      backtick or quote character.
token_pattern = r"""(\b[A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]`"'])"""

In [65]:
# Load the dataset and keep only its first 1000 rows.
data = pd.read_parquet("datasets/final/dataset_v2.parquet").head(1000)

# Shuffled, seeded split of the "code" column (inputs) and "key" column (labels):
# 10% of the rows land in the *_test pair, the remainder in the *_val pair.
X_val, X_test, y_val, y_test = train_test_split(
    data["code"], data["key"], test_size=0.10, random_state=137, shuffle=True
)

In [66]:
# TF-IDF vectorizer using the custom token regex, limited to a 5-term vocabulary.
vectorizer = TfidfVectorizer(
    max_features=5,
    token_pattern=token_pattern,
)
# Learn vocabulary and IDF weights from the X_val split; left as the last
# expression so the notebook displays the fitted estimator.
vectorizer.fit(X_val)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False,
                token_pattern='(\\b[A-Za-z_]\\w*\\b|[!\\#\\$%\\&\\*\\+:\\-\\./<=>\\?@\\\\\\^_\\|\\~]+|[ '
                              '\\t\\(\\),;\\{\\}\\[\\]`"\'])',
                tokenizer=None, use_idf=True, vocabulary=None)

In [67]:
# Snapshot of the fitted vectorizer's state, read by `my_tfidf_transform`.

# The original read `len(texts)`, but `texts` is never defined in the notebook
# (NameError on a fresh kernel). NOTE(review): assumed to mean the number of
# documents the vectorizer was fitted on — confirm.
documents_n     = len(X_val)
idfs            = vectorizer.idf_
# term -> column index mapping learned during fit
features_map    = vectorizer.vocabulary_
# vocabulary terms ordered by their column index
features        = sorted(features_map, key=features_map.get)
features_n      = len(features)
lowercase       = vectorizer.lowercase

def my_tfidf_transform(texts):
    """Hand-rolled TF-IDF transform intended to reproduce ``vectorizer.transform``.

    Reads the module-level ``token_pattern``, ``lowercase``, ``features_map``,
    ``features_n`` and ``idfs`` values.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorise.

    Returns
    -------
    list of numpy.ndarray
        One L2-normalised vector of length ``features_n`` per input text. A
        text containing no vocabulary token yields an all-zero vector instead
        of the NaNs a 0/0 normalisation would produce.
    """
    import re
    from collections import Counter
    import numpy as np

    # Compile once instead of re-parsing the pattern for every document.
    # findall returns the matched token strings (the pattern's capturing group).
    tokenizer = re.compile(token_pattern)

    result = []

    for text in texts:
        if lowercase:
            text = text.lower()

        # Keep only in-vocabulary tokens; dict membership is O(1).
        counter = Counter(tok for tok in tokenizer.findall(text) if tok in features_map)
        tokens_n = sum(counter.values())

        text_result = np.zeros(features_n, dtype=np.float64)

        for token, count in counter.items():
            token_idx = features_map[token]

            tf = count / tokens_n
            idf = idfs[token_idx]

            text_result[token_idx] = tf * idf

        # Skip normalisation for zero vectors to avoid dividing by zero.
        norm = np.linalg.norm(text_result)
        if norm > 0:
            text_result /= norm
        result.append(text_result)

    return result

# Print sklearn's vectors ("control") next to the hand-written ones ("test")
# for every document in X_test so they can be compared by eye.
control_rows = vectorizer.transform(X_test).toarray()
test_rows = my_tfidf_transform(X_test)
for control_row, test_row in zip(control_rows, test_rows):
    print("-=-=-=-=-==-", "control", control_row, "test", test_row, sep="\n")

-=-=-=-=-==-
control
[0.76682579 0.3763303  0.3763303  0.34255996 0.10668594]
test
[0.76682579 0.3763303  0.3763303  0.34255996 0.10668594]
-=-=-=-=-==-
control
[0.94470978 0.11999818 0.11999818 0.06826877 0.27214643]
test
[0.94470978 0.11999818 0.11999818 0.06826877 0.27214643]
-=-=-=-=-==-
control
[0.99558619 0.06100868 0.06100868 0.03305596 0.01647177]
test
[0.99558619 0.06100868 0.06100868 0.03305596 0.01647177]
-=-=-=-=-==-
control
[0.99190651 0.07139613 0.07139613 0.0628929  0.04439764]
test
[0.99190651 0.07139613 0.07139613 0.0628929  0.04439764]
-=-=-=-=-==-
control
[0.99190651 0.07139613 0.07139613 0.0628929  0.04439764]
test
[0.99190651 0.07139613 0.07139613 0.0628929  0.04439764]
-=-=-=-=-==-
control
[0.8975363  0.31148107 0.31148107 0.01968959 0.        ]
test
[0.8975363  0.31148107 0.31148107 0.01968959 0.        ]
-=-=-=-=-==-
control
[0.99193809 0.02781756 0.02781756 0.11078079 0.04731603]
test
[0.99193809 0.02781756 0.02781756 0.11078079 0.04731603]
-=-=-=-=-==-
control