In [2]:
import json
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from IPython.display import display
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

def read_json(json_file):
    """Load a JSON file and return the parsed Python object.

    Parameters
    ----------
    json_file : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # files with non-ASCII text load the same way on every machine.
    with open(json_file, encoding='utf-8') as js:
        return json.load(js)

In [3]:
# Relative paths to the JSON inputs under sample_dataset_1 (bestshots and itemsets).
# The variable names match the parameters of EDA_word2vec / EDA_cf.
best = 'sample_dataset_1/bestshots.json'
item = 'sample_dataset_1/itemsets.json'

# Relative paths to the JSON inputs under sample_dataset_2 (category info and
# product metadata).
category = 'sample_dataset_2/category_infos.json'
products = 'sample_dataset_2/products_meta.json'

In [4]:
def EDA_word2vec(best, item, category, products):
    """Prepare the bestshot tables and the flattened per-product table.

    Parameters
    ----------
    best : str
        Path to the bestshots JSON file (loaded with ``pd.read_json``).
    item : str
        Path to the itemsets JSON file (loaded with ``pd.read_json``).
    category : str
        Path to the category JSON file. Kept for interface compatibility;
        this function does not read it.
    products : str
        Path to the products metadata JSON file (loaded with ``read_json``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(best_item_4, best_item_b, prod_df)``.

        ``best_item_4`` / ``best_item_b`` are the merged bestshot/itemset rows
        of each of the two enterprises, with columns ``enterprise_id``,
        ``awesome_score``, ``product_id`` (list of product ids), ``name``
        (list of product names) and ``top_style``.

        ``prod_df`` has one row per (bestshot, product) pair with columns
        ``enterprise_id``, ``awesome_score``, ``top_style``, ``items_x``
        (the product id), ``names``, ``tags`` and ``images``.

    Raises
    ------
    ValueError
        If the merged data does not contain exactly two distinct
        ``enterprise_id`` values.
    """
    products_json = read_json(products)
    prod_tags = pd.DataFrame(products_json).T.reset_index(drop=True)[['_id', 'tags', 'name', 'images']]

    best_df = pd.read_json(best)
    item_df = pd.read_json(item).rename(columns={'_id': 'id', 'enterpriseId': 'enterprise_id'})
    best_item = pd.merge(best_df, item_df, on=['id', 'enterprise_id', 'projectId'])

    # The two enterprises are told apart purely by order of first appearance:
    # the first distinct id becomes "b", the second "4".
    # NOTE(review): this silently swaps the outputs if the input order changes.
    df_b, df_4 = best_item['enterprise_id'].unique().tolist()

    # Product id -> product name, built once. The first occurrence of an id
    # wins. Lookups are exact matches, consistent with the merge on '_id' below.
    name_by_id = {}
    for product_id, product_name in zip(prod_tags['_id'], prod_tags['name']):
        name_by_id.setdefault(product_id, product_name)

    best_item['product_id'] = best_item['items'].apply(
        lambda entries: [entry['productId'] for entry in entries])
    # Ids that are absent from the product metadata contribute no name.
    best_item['name'] = best_item['product_id'].apply(
        lambda ids: [name_by_id[pid] for pid in ids if pid in name_by_id])

    best_item = best_item[['enterprise_id', 'awesome_score', 'product_id', 'name', 'top_style']]

    best_item_4 = best_item[best_item['enterprise_id'] == df_4]
    best_item_b = best_item[best_item['enterprise_id'] == df_b]

    # Expand each list of product ids into one row per product id; the rows
    # keep the index label of the bestshot they came from.
    item_rows = (best_item['product_id']
                 .apply(lambda ids: pd.Series(ids))
                 .stack()
                 .reset_index(1, name='items')
                 .drop('level_1', axis=1))
    flat = best_item.join(item_rows).drop(columns=['product_id', 'name'])
    # Attach the name per row instead of aligning a second stacked frame, so a
    # product id missing from the metadata yields NaN rather than an alignment
    # error. `.values` avoids index alignment on the duplicated index labels.
    flat['names'] = flat['items'].map(name_by_id).values

    prod_df = pd.merge(flat, prod_tags, left_on='items', right_on='_id').drop(['_id'], axis=1)

    # Name the value_counts columns explicitly ('index' = product id,
    # 'items' = count) so the merge works on pandas < 2.0 and >= 2.0 alike.
    use_counts = prod_df['items'].value_counts().rename_axis('index').reset_index(name='items')
    # Both frames carry an 'items' column, so the merge suffixes them: the
    # product id comes out as 'items_x' and the count as 'items_y'.
    prod_df = (pd.merge(prod_df, use_counts, left_on='items', right_on='index')
               .rename(columns={'items_y': 'use_count'})
               .drop(['index'], axis=1))
    # The count is not part of the returned table; the product id column keeps
    # the name 'items_x'.
    prod_df = prod_df.drop(columns=['name', 'use_count'])

    return best_item_4, best_item_b, prod_df


In [5]:
def EDA_cf(best, item, category, products):
    """Build one per-product usage table for each of the two enterprises.

    Parameters
    ----------
    best : str
        Path to the bestshots JSON file (loaded with ``pd.read_json``).
    item : str
        Path to the itemsets JSON file (loaded with ``pd.read_json``).
    category : str
        Path to the category JSON file. Kept for interface compatibility;
        this function does not read it.
    products : str
        Path to the products metadata JSON file (loaded with ``read_json``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(prod_4, prod_b)``: one row per (bestshot, product) pair for each
        enterprise, with columns ``enterprise_id``, ``top3_style``,
        ``top_style``, ``projectId``, ``awesome_score``, ``product_id``,
        ``tags``, ``name``, ``images`` and ``use_count`` (number of rows in the
        combined table that reference the product).

    Raises
    ------
    ValueError
        If the merged data does not contain exactly two distinct
        ``enterprise_id`` values.
    """
    products_json = read_json(products)

    best_df = pd.read_json(best)
    item_df = pd.read_json(item).rename(columns={'_id': 'id', 'enterpriseId': 'enterprise_id'})
    best_item = pd.merge(best_df, item_df, on=['id', 'enterprise_id', 'projectId'])

    # The two enterprises are told apart purely by order of first appearance:
    # the first distinct id becomes "b", the second "4".
    # NOTE(review): this silently swaps the outputs if the input order changes.
    df_b, df_4 = best_item['enterprise_id'].unique().tolist()

    # Reduce each item record to its product id.
    best_item['items'] = best_item['items'].apply(
        lambda entries: [entry['productId'] for entry in entries])

    def top3_style_names(style_scores):
        # Names of the three highest-scoring styles, joined into one string.
        ranked = sorted(style_scores.items(), key=lambda pair: pair[1], reverse=True)
        return ' '.join(name for name, _ in ranked[:3])

    best_item['top3_style'] = best_item['style_predictions'].apply(top3_style_names)

    # One row per product id; the index label is the originating bestshot row.
    items_stack = pd.DataFrame(
        best_item['items'].apply(lambda ids: pd.Series(ids)).stack()
    ).reset_index(1, drop=True)

    bestshot_cols = best_item[['enterprise_id', 'top3_style', 'top_style', 'projectId', 'awesome_score']]
    products_df = (pd.merge(bestshot_cols.reset_index(), items_stack.reset_index(), on='index')
                   .drop(['index'], axis=1)
                   .rename(columns={0: 'product_id'}))

    prod_tags = pd.DataFrame(products_json).T.reset_index(drop=True)[['_id', 'tags', 'name', 'images']]

    prod_df = pd.merge(products_df, prod_tags, left_on='product_id', right_on='_id').drop(['_id'], axis=1)

    # Name the value_counts columns explicitly ('index' = product id,
    # 'product_id' = count) so the merge works on pandas < 2.0 and >= 2.0 alike.
    use_counts = (prod_df['product_id'].value_counts()
                  .rename_axis('index')
                  .reset_index(name='product_id'))
    # Both frames carry a 'product_id' column, so the merge suffixes them:
    # '_x' is the id from the left frame and '_y' is the count.
    prod_df = (pd.merge(prod_df, use_counts, left_on='product_id', right_on='index')
               .rename(columns={'product_id_x': 'product_id', 'product_id_y': 'use_count'})
               .drop(['index'], axis=1))

    prod_4 = prod_df[prod_df['enterprise_id'] == df_4].reset_index(drop=True)
    prod_b = prod_df[prod_df['enterprise_id'] == df_b].reset_index(drop=True)

    return prod_4, prod_b