In [None]:
import json

import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the pair table (joined below on variantid1 / variantid2) and the
# product table, in that order, from the hackathon data directory.
var_pairs, products_all = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "hackathon_files_for_participants_ozon/train_pairs.parquet",
        "hackathon_files_for_participants_ozon/train_data.parquet",
    )
)

In [None]:
# Keep only the columns used below, drop rows with a missing value in any of
# them, and replace the JSON-encoded `categories` string with the value stored
# under its "3" key.
# NOTE: this rebinds `products_all`, so re-running the cell without reloading
# the data would apply json.loads to the already-extracted category value.
products_all = (
    products_all[['name', 'categories', 'characteristic_attributes_mapping', 'variantid']]
    .dropna()
    .assign(
        categories=lambda frame: frame['categories'].apply(
            lambda raw_categories: json.loads(raw_categories)["3"]
        )
    )
)

In [None]:
# (rows, columns) of the product table after column selection and dropna.
products_all.shape

In [None]:
# Attach the product columns to both sides of every pair. Each product column
# gets a '1' / '2' suffix so it can be joined on variantid1 / variantid2.
# Both joins use the pandas default (inner), so a pair survives only if both
# of its products are present in `products_all`.
product_pairs = pd.merge(
    pd.merge(var_pairs, products_all.add_suffix('1'), on="variantid1"),
    products_all.add_suffix('2'),
    on="variantid2",
)

In [None]:
# Split the joined pairs by label: target == 1 -> identical, target == 0 -> various.
identcial_pairs, various_pairs = (
    product_pairs.loc[product_pairs['target'].eq(label)]
    for label in (1, 0)
)

In [None]:
# (rows, columns) of the target == 1 and target == 0 subsets, side by side.
identcial_pairs.shape, various_pairs.shape

In [None]:
# For every pair (regardless of target) count each attribute name that is
# present on BOTH products. The JSON text is lower-cased before parsing, so
# attribute names (and values) are compared case-insensitively.
attr_freq = collections.Counter()
for left_raw, right_raw in zip(product_pairs.characteristic_attributes_mapping1,
                               product_pairs.characteristic_attributes_mapping2):
    left_names = set(json.loads(left_raw.lower()))
    right_names = set(json.loads(right_raw.lower()))
    attr_freq.update(left_names & right_names)

In [None]:
%%time

# Shared vectorizer; it is re-fitted from scratch on every two-document corpus.
vectorizer = TfidfVectorizer()

# For every target == 0 pair and every attribute present on both products,
# count the attribute as "not matched" when
#   * the space-joined value lists differ as strings, and
#   * the TF-IDF cosine similarity of the two joined strings is below 0.5.
# Everything else is counted under the placeholder key 'matched'.
#
# Each JSON mapping is parsed once per pair (inner generator) instead of once
# per attribute lookup; the resulting counts are unchanged.
#
# The 'tr' suffix is appended without a separating space, so it becomes part of
# the last token of each document -- presumably to guarantee a non-empty
# vocabulary for very short values (TODO confirm intent).
count_not_matched_attr_for_various_pairs = collections.Counter(
    attr_name
    if (
        ' '.join(parsed1[attr_name]) != ' '.join(parsed2[attr_name])
        and cosine_similarity(
            # Unpacking the 2-row TF-IDF matrix passes one row per argument,
            # so the result is a 1x1 similarity matrix.
            *vectorizer.fit_transform(
                [
                    ' '.join(parsed1[attr_name]) + 'tr',
                    ' '.join(parsed2[attr_name]) + 'tr',
                ]
            )
        )[0, 0] < 0.5
    ) else 'matched'
    for parsed1, parsed2 in (
        (json.loads(attributes1.lower()), json.loads(attributes2.lower()))
        for attributes1, attributes2 in zip(various_pairs.characteristic_attributes_mapping1,
                                            various_pairs.characteristic_attributes_mapping2)
    )
    for attr_name in set(parsed1) & set(parsed2)
)

In [None]:
# Drop the 'matched' placeholder bucket so only mismatching attribute names remain.
# (Counter deletion does not raise if the key is absent.)
del count_not_matched_attr_for_various_pairs['matched']

In [None]:
%%time
# Same mismatch count as for the target == 0 pairs, but over the target == 1
# pairs. An attribute present on both products is counted as "not matched" when
#   * the space-joined value lists differ as strings, and
#   * the TF-IDF cosine similarity of the two joined strings is below 0.5.
# Everything else is counted under the placeholder key 'matched'.
#
# Relies on the `vectorizer` created in an earlier cell; it is re-fitted on
# every two-document corpus. Each JSON mapping is parsed once per pair (inner
# generator) instead of once per attribute lookup; the counts are unchanged.
#
# The 'tr' suffix is appended without a separating space, so it becomes part of
# the last token of each document -- presumably to guarantee a non-empty
# vocabulary for very short values (TODO confirm intent).
count_not_matched_attr_for_ident_pairs = collections.Counter(
    attr_name
    if (
        ' '.join(parsed1[attr_name]) != ' '.join(parsed2[attr_name])
        and cosine_similarity(
            # Unpacking the 2-row TF-IDF matrix passes one row per argument,
            # so the result is a 1x1 similarity matrix.
            *vectorizer.fit_transform(
                [
                    ' '.join(parsed1[attr_name]) + 'tr',
                    ' '.join(parsed2[attr_name]) + 'tr',
                ]
            )
        )[0, 0] < 0.5
    ) else 'matched'
    for parsed1, parsed2 in (
        (json.loads(attributes1.lower()), json.loads(attributes2.lower()))
        for attributes1, attributes2 in zip(identcial_pairs.characteristic_attributes_mapping1,
                                            identcial_pairs.characteristic_attributes_mapping2)
    )
    for attr_name in set(parsed1) & set(parsed2)
)

In [None]:
# Drop the 'matched' placeholder bucket so only mismatching attribute names remain.
# (Counter deletion does not raise if the key is absent.)
del count_not_matched_attr_for_ident_pairs['matched']

In [None]:
%%time
# Normalise each mismatch count by how often the attribute is shared by both
# products across ALL pairs (`attr_freq`), then order the attributes from the
# highest to the lowest normalised value. Plain dicts preserve that order.
#
# Fix: the ident-pairs counter was previously referenced under the misspelled
# name `..._indent_pairs`, which is never defined and raised NameError.
count_not_matched_attr_for_ident_pairs_norm = dict(
    sorted(
        (
            (attr, freq / attr_freq[attr])
            for attr, freq in count_not_matched_attr_for_ident_pairs.items()
        ),
        key=lambda item: item[1],
        reverse=True,
    )
)
count_not_matched_attr_for_various_pairs_norm = dict(
    sorted(
        (
            (attr, freq / attr_freq[attr])
            for attr, freq in count_not_matched_attr_for_various_pairs.items()
        ),
        key=lambda item: item[1],
        reverse=True,
    )
)

In [None]:
# Attribute names that are in the top 50 (by normalised mismatch value) of
# both the target == 0 and the target == 1 rankings.
set(list(count_not_matched_attr_for_various_pairs_norm)[:50]).intersection(
    list(count_not_matched_attr_for_ident_pairs_norm)[:50]
)

In [None]:
# Horizontal bar charts of the 50 attributes with the highest normalised
# mismatch value, one figure per pair class, each saved as a PNG.
#
# Fix: the ident-pairs dict was previously referenced under the misspelled name
# `..._indent_pairs_norm`, which is never defined and raised NameError.
# The output file names are kept unchanged.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))

ax.barh(
    list(count_not_matched_attr_for_ident_pairs_norm.keys())[:50],
    list(count_not_matched_attr_for_ident_pairs_norm.values())[:50]
)
ax.set(
    title='Top 50 not-matched attributes, target == 1 pairs',
    xlabel='not-matched count / pairs sharing the attribute',
    ylabel='attribute',
)
fig.savefig('./count_not_matched_attr_for_indent_pairs_norm.png')

fig2, ax2 = plt.subplots(1, 1, figsize=(20, 10))

ax2.barh(
    list(count_not_matched_attr_for_various_pairs_norm.keys())[:50],
    list(count_not_matched_attr_for_various_pairs_norm.values())[:50]
)
ax2.set(
    title='Top 50 not-matched attributes, target == 0 pairs',
    xlabel='not-matched count / pairs sharing the attribute',
    ylabel='attribute',
)
fig2.savefig('./count_not_matched_attr_for_various_pairs_norm.png')

In [None]:
# Persist the raw (un-normalised) mismatch counts for target == 0 pairs as
# JSON with keys in sorted order.
with open('count_not_matched_attr_for_various_pairs.json', mode='w') as out_file:
    out_file.write(
        json.dumps(count_not_matched_attr_for_various_pairs, sort_keys=True)
    )

In [None]:
# Persist the raw (un-normalised) mismatch counts for target == 1 pairs as
# JSON with keys in sorted order.
# Fix: the counter is named `count_not_matched_attr_for_ident_pairs`; the
# previous `..._indent_pairs` spelling is never defined and raised NameError.
# The output file name is kept unchanged so existing consumers still find it.
with open('count_not_matched_attr_for_indent_pairs.json', 'w') as f:
    json.dump(count_not_matched_attr_for_ident_pairs, f, sort_keys=True)

In [None]:
# Lazy stream of one space-joined string per (product, attribute): the
# lower-cased attribute values of every product, in table order.
# Each product's JSON is parsed once and the column is iterated directly,
# instead of doing a `.loc` row lookup plus a JSON parse for every attribute.
docs = (
    ' '.join(attr_values)
    for raw_mapping in products_all.characteristic_attributes_mapping
    for attr_values in json.loads(raw_mapping.lower()).values()
)

In [None]:

# Pick one random target == 0 pair (unseeded on purpose: each run inspects a
# different pair) and compute, per shared attribute, the TF-IDF cosine
# similarity of the two products' space-joined values.
rnd = np.random.choice(various_pairs.index)

# Look the row up and parse both JSON mappings once instead of per attribute.
sampled_pair = product_pairs.loc[rnd]
sampled_attrs1 = json.loads(sampled_pair.characteristic_attributes_mapping1.lower())
sampled_attrs2 = json.loads(sampled_pair.characteristic_attributes_mapping2.lower())

# Attribute names present on both products.
attrs = set(sampled_attrs1) & set(sampled_attrs2)

# Always bind `a`, so the final display neither fails with NameError nor shows
# a stale value from a previous run when the pair shares no attributes.
a = {}

if attrs:
    vectorizer = TfidfVectorizer()
    a = {
        attr: cosine_similarity(
            *vectorizer.fit_transform(
                [
                    ' '.join(sampled_attrs1[attr]),
                    ' '.join(sampled_attrs2[attr]),
                ]
            )
        )
        for attr in attrs
        # Only attributes whose first-product value text is longer than 3 characters.
        if len(' '.join(sampled_attrs1[attr])) > 3
    }

a

In [40]:
# Demonstrates that document-frequency pruning can remove every term.
# Float min_df / max_df are proportions of documents: with only two documents,
# any term that occurs at all has a document frequency of at least 0.5, which
# exceeds max_df=0.2, so the vocabulary ends up empty and scikit-learn raises
# ValueError. The error is caught and printed so the notebook still runs
# top to bottom.
try:
    pruned_similarity = cosine_similarity(
        *TfidfVectorizer(
            stop_words=[],
            min_df=0.1,
            max_df=0.2
        ).fit_transform(['3 гб', '2 гб'])
    )
except ValueError as error:
    pruned_similarity = None
    print(f"ValueError: {error}")

pruned_similarity

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [None]:
    # min_df=0.6 on a two-document corpus keeps only terms present in both documents.
    vectorizer = TfidfVectorizer(min_df=0.6)

    # `rnd` and `attrs` come from the cell that samples a pair out of
    # `various_pairs` and reads it via `product_pairs`, so the row is read from
    # `product_pairs` here as well. Indexing `identcial_pairs` with that label
    # raised KeyError because the target == 1 subset does not contain it.
    inspected_pair = product_pairs.loc[rnd]
    inspected_attrs1 = json.loads(inspected_pair.characteristic_attributes_mapping1.lower())
    inspected_attrs2 = json.loads(inspected_pair.characteristic_attributes_mapping2.lower())

    for attr in attrs:
        text1 = ' '.join(inspected_attrs1[attr])
        text2 = ' '.join(inspected_attrs2[attr])
        # Show the two value strings, then their sparse TF-IDF matrix.
        print(text1, '+++', text2)
        print(vectorizer.fit_transform([text1, text2]))
