In [1]:
import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

In [2]:
def string2array(x):
    x = x.replace('\n', '').strip('[]')
    x_list = [float(i) for i in x.split(' ') if len(i) != 0]
    return np.array(x_list)

In [3]:
def data_to_normal(data,user_id,timestamp,rating,wine_id):
    grouped_data = data.groupby(user_id)[rating].agg(['mean', 'std','count'])

    # 여러개 구매한 유저
    other_user = grouped_data[(grouped_data['count']>=5) & (grouped_data['std'] != 0)]

    other_user

    other_userlist = list(other_user.index)

    other_user_data = data[data[user_id].isin(other_userlist)].sort_values(by=user_id)
    other_user_data

    other_data = pd.merge(other_user_data,other_user, on =user_id,how='left')

    other_data = other_data.set_index(other_user_data.index)

    other_data['scaled_rating'] = (other_data[rating]-other_data['mean'])/other_data['std']
    print(other_data['scaled_rating'].quantile(0.75))
    result = other_data[[user_id,timestamp,'scaled_rating',wine_id]]
    result.rename(columns = {'scaled_rating':'rating'})
    return result


In [4]:
item_data = pd.read_csv('/opt/ml/wine/data/item_data_vec.csv', encoding='utf-8-sig')

In [5]:
def bayesian_average(df, min_votes=20, prior_rating=None):
    v = df['num_votes']
    R = df['wine_rating']
    
    if prior_rating is None:
        C = df['wine_rating'].mean()  # Mean rating as the prior rating
    else:
        C = prior_rating

    m = min_votes

    df['popularity_adjusted_rating'] = ((v / (v + m)) * R) + ((m / (v + m)) * C)
    return df

In [6]:
min_votes = 20
item_data = bayesian_average(item_data, min_votes)
# Calculate Bayesian average and add it as a new column in the DataFrame

In [7]:
item_data = item_data.rename(columns = {'item_id':'wine_id'})

In [8]:
note_cols = ['Red Fruit', 'Tropical', 'Tree Fruit', 'Oaky', 'Ageing', 'Black Fruit', 'Citrus', 'Dried Fruit', 'Earthy', 'Floral', 'Microbio', 'Spices', 'Vegetal']
for idx in tqdm(item_data.index):
    total = item_data.loc[idx, note_cols].sum()
    if total != 0:
        for col in note_cols:
            item_data.loc[idx, col] = (item_data.loc[idx, col]/total) * 100
    else:
        item_data.loc[idx, note_cols] = 0

100%|██████████| 74922/74922 [08:11<00:00, 152.53it/s]


In [9]:
with open('/opt/ml/wine/data/wine_vector.json','r') as f: vec = json.load(f)

In [10]:
inter = pd.read_csv('/opt/ml/wine/data/inter_sample.csv', 
                                    encoding='utf-8-sig',
                                    usecols = ['email','rating','timestamp','wine_id'])

inter = inter.dropna(subset=['wine_id'], axis=0)
inter['wine_id'] = inter['wine_id'].astype(int).astype('category')
inter = inter[inter['wine_id'].isin(item_data['wine_id'])]
inter = data_to_normal(inter,'email','timestamp','rating','wine_id')
users = inter['email'].unique()
user2idx = {feature: index for index, feature in enumerate(users)}
idx2user = {index: feature for index, feature in enumerate(users)}

FileNotFoundError: [Errno 2] No such file or directory: '/opt/ml/wine/data/inter_sample.csv'

In [None]:
col_to_use = ['wine_id', 'price', 'winetype', 'country', 'Red Fruit', 'Tropical', 'Tree Fruit', 'Oaky', 'Ageing', 'Black Fruit', 'Citrus', 'Dried Fruit', 'Earthy', 'Floral', 'Microbio', 'Spices', 'Vegetal', 'Light', 'Bold', 'Smooth', 'Tannic', 'Dry', 'Sweet', 'Soft', 'Acidic', 'Fizzy', 'Gentle']

inter = pd.merge(inter, item_data.loc[:, col_to_use], on='wine_id', how='inner')

# Group by 'email' and calculate the count of 'wine_id', mean of 'rating' and mean of other columns
user_data = inter.groupby('email').agg({'wine_id': 'count',
                                        'scaled_rating': 'mean',
                                        'price': 'mean',
                                        'Red Fruit': 'mean',
                                        'Tropical': 'mean',
                                        'Tree Fruit': 'mean',
                                        'Oaky': 'mean',
                                        'Ageing': 'mean',
                                        'Black Fruit': 'mean',
                                        'Citrus': 'mean',
                                        'Dried Fruit': 'mean',
                                        'Earthy': 'mean',
                                        'Floral': 'mean',
                                        'Microbio': 'mean',
                                        'Spices': 'mean',
                                        'Vegetal': 'mean',
                                        'Light': 'mean',
                                        'Bold': 'mean',
                                        'Smooth': 'mean',
                                        'Tannic': 'mean',
                                        'Dry': 'mean',
                                        'Sweet': 'mean',
                                        'Soft': 'mean',
                                        'Acidic': 'mean',
                                        'Fizzy': 'mean',
                                        'Gentle': 'mean'}).sort_values(by='wine_id', ascending=False)


In [None]:
heavy_user_rating_correlation = user_data['wine_id'].corr(user_data['rating'])
heavy_user_price_correlation = user_data['wine_id'].corr(user_data['price'])


In [None]:
user_data_except_1 = user_data[user_data['wine_id'] > 2]

In [None]:
user_data_except_1.describe()

In [None]:
heavy_user_rating_correlation = user_data_except_1['wine_id'].corr(user_data_except_1['rating'])
heavy_user_price_correlation = user_data_except_1['wine_id'].corr(user_data_except_1['price'])
print(heavy_user_price_correlation, heavy_user_rating_correlation)

In [None]:
custom_bins = [0, 5, 10, 50, 100, 200, 500, 1000, float('inf')]
custom_labels = ['0-5', '5-10', '10-50', '50-100', '100-200', '200-500', '500-1000', '1000+']
# Create a new column in the DataFrame with the custom bins
user_data['vote_bins'] =pd.cut(user_data['wine_id'], bins=custom_bins, labels=custom_labels, right=False)

# Group the data by the custom bins and get the frequency count for each bin
bin_counts = user_data['vote_bins'].value_counts().sort_index()

# Create a bar plot to visualize the frequency distribution
plt.bar(bin_counts.index, bin_counts.values)

# Set labels and title
plt.xlabel('Number of Votes')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Number of Votes')

# Show the plot
plt.show()

In [None]:
user_data['vote_bins'].value_counts()

In [None]:
heavy_user = user_data[user_data['wine_id'] >= 50 ]
medium_user = user_data[(user_data['wine_id'] >= 10) & (user_data['wine_id'] < 50)]
light_user = user_data[user_data['wine_id'] < 10]

In [None]:
heavy_user_set = set(list(heavy_user.index))
medium_user_set = set(list(medium_user.index))
light_user_set = set(list(light_user.index))

In [None]:
heavys_inter = inter[inter['email'].isin(heavy_user_set)]
mediums_inter = inter[inter['email'].isin(medium_user_set)]
lights_inter = inter[inter['email'].isin(light_user_set)]

In [None]:
def get_item_report(inter_df):
    country_df = inter_df.groupby('country').agg({
    'price': 'mean',
    'wine_id': 'count',
    'scaled_rating':'mean',
    'Red Fruit': 'mean',
    'Tropical': 'mean',
    'Tree Fruit': 'mean',
    'Oaky': 'mean',
    'Ageing': 'mean',
    'Black Fruit': 'mean',
    'Citrus': 'mean',
    'Dried Fruit': 'mean',
    'Earthy': 'mean',
    'Floral': 'mean',
    'Microbio': 'mean',
    'Spices': 'mean',
    'Vegetal': 'mean',
    'Light': 'mean',
    'Bold': 'mean',
    'Smooth': 'mean',
    'Tannic': 'mean',
    'Dry': 'mean',
    'Sweet': 'mean',
    'Soft': 'mean',
    'Acidic': 'mean',
    'Fizzy': 'mean',
    'Gentle': 'mean'
    })
    country_df = country_df[country_df['wine_id'].notna()]
    total = country_df['wine_id'].sum()
    country_df['wine_id'] = country_df['wine_id'].apply(lambda x: x/total * 100)
    country_df = country_df[country_df['wine_id'] > 1]
    country_df = country_df.sort_values(by = 'wine_id', ascending = False)
    country_df = country_df.rename(columns = {'wine_id':'percentage'})

    winetype_df = inter_df.groupby('winetype').agg({
    'price': 'mean',
    'wine_id': 'count',
    'scaled_rating':'mean',
    'Red Fruit': 'mean',
    'Tropical': 'mean',
    'Tree Fruit': 'mean',
    'Oaky': 'mean',
    'Ageing': 'mean',
    'Black Fruit': 'mean',
    'Citrus': 'mean',
    'Dried Fruit': 'mean',
    'Earthy': 'mean',
    'Floral': 'mean',
    'Microbio': 'mean',
    'Spices': 'mean',
    'Vegetal': 'mean',
    'Light': 'mean',
    'Bold': 'mean',
    'Smooth': 'mean',
    'Tannic': 'mean',
    'Dry': 'mean',
    'Sweet': 'mean',
    'Soft': 'mean',
    'Acidic': 'mean',
    'Fizzy': 'mean',
    'Gentle': 'mean'
    })
    winetype_df = winetype_df[winetype_df['wine_id'].notna()]
    total = winetype_df['wine_id'].sum()
    winetype_df['wine_id'] = winetype_df['wine_id'].apply(lambda x: x/total * 100)
    winetype_df = winetype_df[winetype_df['wine_id'] > 1]
    winetype_df = winetype_df.sort_values(by = 'wine_id', ascending = False)
    winetype_df = winetype_df.rename(columns = {'wine_id':'percentage'})
    return country_df.round(2), winetype_df.round(2)


In [None]:
inter_rose = inter[inter['winetype'] == 'roswine']

In [None]:

# price_series의 히스토그램 그리기
plt.hist(inter_rose['price'], bins=10, alpha=0.5, color='blue')
plt.title('Histogram of Price')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
heavys_inter.groupby('winetype').agg({'price':'mean',
                                      'scaled_rating':'mean'})

In [None]:
heavys_inter.groupby('winetype').agg({'price':'std'})

In [None]:
heavy_report = heavys_inter.groupby('country').agg({'price':'mean',
                                     'wine_id':'count',
                                      'scaled_rating':'mean'})
heavy_report = heavy_report[heavy_report['wine_id'].notna()]
total = heavy_report['wine_id'].sum()
heavy_report['wine_id'] = heavy_report['wine_id'].apply(lambda x: x/total * 100)
heavy_report = heavy_report[heavy_report['wine_id'] > 1]
heavy_report = heavy_report.sort_values(by = 'wine_id', ascending = False)
heavy_report = heavy_report.rename(columns = {'wine_id':'percentage'})
heavy_report

In [None]:
mediums_inter.groupby('winetype').agg({'price':'mean',
                                      'scaled_rating':'mean'})

In [None]:
medium_report = mediums_inter.groupby('country').agg({'price':'mean',
                                     'wine_id':'count',
                                      'scaled_rating':'mean'})
medium_report = medium_report[medium_report['wine_id'].notna()]
total = medium_report['wine_id'].sum()
medium_report['wine_id'] = medium_report['wine_id'].apply(lambda x: x/total * 100)
medium_report = medium_report[medium_report['wine_id'] > 1]
medium_report = medium_report.sort_values(by = 'wine_id', ascending = False)
medium_report = medium_report.rename(columns = {'wine_id':'percentage'})
medium_report

In [None]:
lights_inter.groupby('winetype').agg({'price':'mean',
                                      'scaled_rating':'mean'})

In [None]:
light_report = lights_inter.groupby('country').agg({'price':'mean',
                                     'wine_id':'count',
                                      'scaled_rating':'mean'})
light_report = light_report[light_report['wine_id'].notna()]
total = light_report['wine_id'].sum()
light_report['wine_id'] = light_report['wine_id'].apply(lambda x: x/total * 100)
light_report = light_report[light_report['wine_id'] > 1]
light_report = light_report.sort_values(by = 'wine_id', ascending = False)
light_report = light_report.rename(columns = {'wine_id':'percentage'})
light_report

In [None]:
hvy_country, hvy_winetype = get_item_report(heavys_inter)

In [None]:
med_country, med_winetype = get_item_report(mediums_inter)

In [None]:
lgt_country, lgt_winetype = get_item_report(lights_inter)

In [None]:
lgt_country.to_csv('/opt/ml/wine/EDA/lgt_country.csv')
lgt_winetype.to_csv('/opt/ml/wine/EDA/lgt_winetype.csv')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
def get_winetype_corr(inter, user_type):
    use_cols = ['Light', 'Bold', 'Smooth', 
               'Tannic', 'Dry', 'Sweet', 
               'Soft', 'Acidic','Fizzy', 
               'Gentle','scaled_rating']
    for winetype in inter['winetype'].unique():

        winetype_inter = inter[inter['winetype'] == winetype]

        correlation_df = winetype_inter.loc[:,use_cols].corr()['scaled_rating']
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_df.to_frame(), annot=True, cmap='coolwarm', fmt='.2f')
        plt.title(f"Correlation Heatmap - {user_type}_{winetype}")
        plt.savefig(f'/opt/ml/wine/EDA/{user_type}_{winetype}_heatmap.png')

In [None]:
get_winetype_corr(lights_inter, 'light')

In [None]:
hvy_red = heavys_inter[heavys_inter['winetype'] == 'dessertwine']

In [None]:
hvy_red.loc[:, ['Light', 'Bold', 'Smooth', 'Tannic', 'Dry', 'Sweet', 'Soft', 'Acidic',
       'Fizzy', 'Gentle','scaled_rating']].corr()['scaled_rating']

In [None]:
def get_corr(df, name):
    correlation_df = df.corr()['scaled_rating']
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_df.to_frame(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title("Correlation Heatmap - scaled_rating")
    plt.savefig(f'/opt/ml/wine/EDA/{name}_heatmap.png')

get_corr(hvy_country, 'hvy_country')
get_corr(hvy_winetype, 'hvy_winetype')

get_corr(med_country, 'med_country')
get_corr(med_winetype, 'med_winetype')

get_corr(lgt_country, 'lgt_country')
get_corr(lgt_winetype, 'lgt_winetype')

In [None]:
heavys_item = heavys_inter.groupby('wine_id').agg({'scaled_rating':'mean'})
heavys_item = heavys_item.sort_values(by = 'scaled_rating', ascending= False)

mediums_item = mediums_inter.groupby('wine_id').agg({'scaled_rating':'mean'})
mediums_item = mediums_item.sort_values(by = 'scaled_rating', ascending= False)

lights_item = lights_inter.groupby('wine_id').agg({'scaled_rating':'mean'})
lights_item = lights_item.sort_values(by = 'scaled_rating', ascending= False)


In [None]:
heavys_items_set = set(heavys_item.index)
mediums_items_set = set(mediums_item.index)
lights_items_set = set(lights_item.index)

common_items = heavys_items_set.intersection(mediums_items_set, lights_items_set)

num_common_items = len(common_items)

# Calculate the percentage of common items
total_unique_items = len(set(list(heavys_item.index) + list(mediums_item.index) + list(lights_item.index)))
percentage_common_items = (num_common_items / total_unique_items) * 100

print(f"Number of items common to all three user groups: {num_common_items}")
print(f"Percentage of items common to all three user groups: {percentage_common_items:.2f}%")

In [None]:
heavy_common = heavys_item.loc[list(common_items)]
medium_common = mediums_item.loc[list(common_items)]
light_common = lights_item.loc[list(common_items)]

In [None]:
item_per_user_type = pd.merge(heavy_common, medium_common, on='wine_id', suffixes=('_heavy', '_medium'))
item_per_user_type = pd.merge(item_per_user_type, light_common, on='wine_id', suffixes=('', '', '_light'))

item_per_user_type.rename(columns={'scaled_rating': 'scaled_rating_light'}, inplace=True)

In [None]:
item_per_user_type

In [None]:
corr_matrix = item_per_user_type.corr()

# Plot the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Scaled Ratings for Different User Groups')
plt.show()


In [None]:
def see_diff(hvy, med, lgt, name_list):
    
    for name in name_list:
        attributes = list(hvy.loc[name].keys())

        x = np.arange(len(attributes))  # Create index for the bars
        width = 0.2  # Width of the bars

        fig, ax = plt.subplots(figsize=(15, 9))
        heavy_bars = ax.bar(x - width, list(hvy.loc[name]), width, label='Heavy', alpha=0.8)
        medium_bars = ax.bar(x, list(med.loc[name]), width, label='Medium', alpha=0.8)
        light_bars = ax.bar(x + width, list(lgt.loc[name]), width, label='Light', alpha=0.8)

        # Add labels, title, and legend
        ax.set_xlabel('Attributes')
        ax.set_ylabel('Values')
        ax.set_title(f'Comparison of {name}')
        ax.set_xticks(x)
        ax.set_xticklabels(attributes, rotation=45)
        ax.legend()
        
        plt.tight_layout()
        plt.savefig(f'/opt/ml/wine/EDA/{name}_compare.jpg')
        

see_diff(hvy_country, med_country, lgt_country, hvy_country.index)
    

In [None]:
see_diff(hvy_winetype, med_winetype, lgt_winetype, hvy_winetype.index)

In [None]:
inter_per_user = inter.groupby('email')['wine_id'].agg(list)

In [None]:
inter_per_user['user_100001.0@example.com']

In [None]:
item_data.set_index('wine_id', inplace = True)
item_data['wine_id'] = item_data.index
item_data['vectors'] = item_data['vectors'].apply(string2array)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import faiss

def get_nns(user : str,
            inter_per_user : pd.DataFrame,
            item_data: pd.DataFrame,
            index:faiss.IndexIDMap2,
            total_k: int = 15000):
    
    wine_ids = inter_per_user[user]


    vectors = item_data.loc[wine_ids, 'vectors']

    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
    X = vectorizer.fit_transform(vectors).toarray()

    # Step 2: Normalize the vectors
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)

    # Determine the optimal number of clusters using the Elbow Method
    inertia = []
    max_k = len(vectors) if len(vectors) < 12 else 12
    k_range = range(1, max_k)  # Test different K values from 1 to 10
    for k in tqdm(k_range):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_normalized)
        inertia.append(kmeans.inertia_)

    # Determine the optimal K value (Elbow point)
    optimal_k = np.argmin(np.diff(inertia)) + 1

    # Step 4: Apply K-Means with the optimal K value
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    clusters = kmeans.fit_predict(X_normalized)

    mean_vectors = {}
    for vector, cluster in zip(vectors, clusters):
        if cluster not in vector:
            mean_vectors[cluster] = {'count': 1, 'mean': vector}
        else:
            mean_vectors[cluster]['count'] += 1
            mean_vectors[cluster]['mean'] += (vector - mean_vectors[cluster]['mean']) / mean_vectors[cluster]['count']

    num_cluster = len(mean_vectors)

    k = total_k // num_cluster

    result = []
    for _, to_search in tqdm(mean_vectors.items()):
        # Faiss expects the query vectors to be normalized
        to_search  = to_search['mean']
        to_search = np.expand_dims(to_search, axis=0)
        to_search = np.ascontiguousarray(to_search.astype(np.float32))

        distances, searched_wine_ids = index.search(to_search, k=k)

        for dis, id in zip(distances[0], searched_wine_ids[0]):
            result.append((id, dis))
    
    result.sort(key = lambda x: x[1])
    return result

In [None]:
inter_per_user = inter.groupby('email')['wine_id'].agg(list)

item_data.set_index('wine_id', inplace = True)
item_data['wine_id'] = item_data.index
item_data['vectors'] = item_data['vectors'].apply(string2array)

wine_vectors = []
for vector in item_data['vectors']: wine_vectors.append(vector)
wine_vectors = np.array(wine_vectors)

wine_ids = list(item_data.index) #####wine id 
vector_dimension = wine_vectors.shape[0]

index = faiss.IndexFlatIP(vector_dimension)
index = faiss.IndexIDMap2(index)
index.add_with_ids(wine_vectors, wine_ids)

In [None]:
inter['wine_id'].nunique()

In [None]:
get_nns(user = 'user_100001.0@example.com', 
        inter_per_user = inter_per_user,
        item_data = item_data,
        index = index,
        total_k = 15000)

In [None]:
item = pd.read_csv('/opt/ml/wine/dataset/train_data/train_data.item', encoding='utf-8-sig', sep = '\t')

In [None]:
rv = pd.read_csv('/opt/ml/wine/data/review_df.csv', encoding='utf-8-sig')

In [None]:
crawl = pd.read_csv('/opt/ml/wine/data/wine_df.csv', encoding = 'utf-8-sig')

In [None]:
crawl = crawl[crawl['url'].notna()]


In [None]:
items = list(crawl['url'].unique())
items.sort()



In [None]:
item2idx = {feature: index for index, feature in enumerate(items)}
idx2item = {index: feature for index, feature in enumerate(items)}
with open('/opt/ml/wine/code/data/feature_map/item2idx.json','w') as f: json.dump(item2idx,f)
with open('/opt/ml/wine/code/data/feature_map/idx2item.json','w') as f: json.dump(idx2item,f)

In [None]:
emb = pd.read_csv('/opt/ml/wine/dataset/train_data/train_data.itememb', encoding='utf-8-sig', sep = '\t')

In [None]:
emb=emb.rename(columns = {'wine_id:token':'wid:token'})

In [None]:
outpath = f"/opt/ml/wine/dataset/train_data"
emb.to_csv(os.path.join(outpath,"train_data.itememb"),sep='\t',index=False, encoding='utf-8')

In [None]:
outpath = f"/opt/ml/wine/dataset/train_data"
emb.to_csv(os.path.join(outpath,"train_data.itememb"),sep='\t',index=False, encoding='utf-8')

In [None]:
outpath = f"/opt/ml/wine/dataset/train_data"
item.to_csv(os.path.join(outpath,"train_data.item"),sep='\t',index=False, encoding='utf-8')

In [None]:
item_data.to_csv(os.path.join(outpath,"train_data.item"),sep='\t',index=False, encoding='utf-8')

In [None]:
list(item_data[(item_data['price'] > 100) & (item_data['winetype'] == 'dessertwine')]['wine_id'])

In [None]:
item_data = pd.read_csv('/opt/ml/wine/data/item_data_sample.csv', encoding='utf-8-sig')
wine_df = pd.read_csv('/opt/ml/wine/data/wine_df.csv', encoding='utf-8-sig')

In [None]:
wine_df.drop_duplicates(inplace = True)
wine_df = wine_df[wine_df['url'].notna()]

In [None]:
for col in wine_df.columns:
    if 'Unnamed' in col:
        wine_df.drop(col, axis =1, inplace = True)

In [None]:


count_only_list = [item for item in wine_df.columns if ("count" in item) & (item != 'country')]

print(count_only_list)


In [None]:
for col in count_only_list:
    wine_df[col.replace('_count','').replace('_',' ')] = wine_df[col]

In [None]:
for col in count_only_list:
    wine_df.drop(col, axis=1, inplace = True)

In [None]:
for col in wine_df.columns:
    if '_child' in col: wine_df.drop(col, axis=1, inplace = True)

In [None]:
wine_df.drop("None", axis=1, inplace = True)

In [None]:
notes_col = 'Red Fruit','Tropical', 'Tree Fruit', 'Oaky', 'Ageing', 'Black Fruit', 'Citrus','Dried Fruit', 'Earthy', 'Floral', 'Microbio', 'Spices', 'Vegetal'
wine_df.loc[:, notes_col] = wine_df.loc[:, notes_col].fillna(0)

In [None]:
wine_df['wine_id'] = wine_df['url'].map(item2idx)

In [None]:
basic_info = pd.read_csv('/opt/ml/wine/data/basic_info_total.csv', encoding='utf-8-sig')
pallete = pd.read_csv('/opt/ml/wine/data/wine_pallete_df.csv', encoding='utf-8-sig')

In [None]:
basic_info['wine_id'] = basic_info['url'].map(item2idx)
basic_info = basic_info[basic_info['wine_id'].notna()]

In [None]:
pallete['wine_id'] = pallete['url'].map(item2idx)
pallete = pallete[pallete['wine_id'].notna()]

In [None]:
pallete.fillna(0, inplace= True)

In [None]:
pallete['wine_id'] = pallete['wine_id'].astype(int).astype('category')
basic_info['wine_id'] = basic_info['wine_id'].astype(int).astype('category')
wine_df['wine_id'] = wine_df['wine_id'].astype(int).astype('category')

In [None]:
pallete.drop('url',axis = 1, inplace = True)
basic_info.drop('url',axis = 1, inplace = True)

In [None]:
basic_info['grapes'].fillna('[]', inplace = True)

In [None]:
wine_df['region1'] = wine_df['region']

In [None]:
wine_df['region2'] = None

wine_df['region3'] = None
wine_df['region4'] = None
wine_df['region5'] = None
wine_df['wine style'] = None
wine_df['alcohol content'] = None
wine_df['allergens'] = None

In [None]:
wine_df.drop(['grape','region'], axis = 1, inplace= True)

In [None]:
wine_df.set_index('wine_id', inplace= True)
basic_info.set_index('wine_id', inplace= True)

In [None]:
fill_col = ['winery', 'grapes', 'country', 'region1', 'region2', 'region3',
       'wine style', 'allergens', 'alcohol content', 'region4','region5']
for id in tqdm(wine_df.index):
    for col in fill_col:
        if col == 'grapes':
            if id in basic_info.index:
                wine_df.loc[id, 'grape'] = basic_info.loc[id, 'grapes']
            else: wine_df.loc[id, 'grape'] = '[]'
    
        elif (wine_df.loc[id, col] is None) & (id in basic_info.index):
            wine_df.loc[id, col] = basic_info.loc[id, col]


In [None]:
wine_df['grape_detail'] = wine_df['grape']

In [None]:
import ast
wine_df['grape'] = wine_df['grape'].apply(ast.literal_eval)

In [None]:
wine_df['grape_detail'] = wine_df['grape_detail'].apply(ast.literal_eval)

In [None]:
import re
def remove_non_english_chars(text):
    # 정규 표현식으로 영어 이외의 모든 문자(공백 문자 제외) 제거
    text = text.replace('_',' ')
    cleaned_text = re.sub(r'[^a-zA-Z ]', '', text)
    return cleaned_text

def regex_in_list(input_list):
    return [remove_non_english_chars(x).lower() for x in input_list]

wine_df['grape'] = wine_df['grape_detail'].apply(regex_in_list)

In [None]:

wine_df_pallete  = pd.merge(wine_df, pallete, on = 'wine_id', how = 'left')

In [None]:
pallete_col = ['Light', 'Bold', 'Smooth', 'Tannic', 'Dry', 'Sweet', 'Soft', 'Acidic','Fizzy', 'Gentle']
for col in pallete_col:
    wine_df_pallete.loc[:,col].fillna(0, inplace =True)

In [None]:
wine_df_pallete.to_csv('/opt/ml/wine/data/item_data_final.csv', index = False, encoding = 'utf-8-sig')

In [None]:
review_df = pd.read_csv('/opt/ml/wine/data/review_df_total.csv',encoding='utf-8-sig',
                        usecols=lambda col: col != 'text')


In [None]:
with open('/opt/ml/wine/code/data/feature_map/item2idx.json','r') as f:
    item2idx = json.load(f)

In [None]:
review_df['wine_id'] = review_df['wine_url'].map(item2idx)

In [None]:
from datetime import datetime
def date_to_int(x):
    try:
        date_object = datetime.strptime(x, "%b %d, %Y")

        timestamp = int(date_object.timestamp())
        return timestamp
    except: return None

    



In [None]:
review_df['timestamp'] = review_df['date'].apply(date_to_int)

In [None]:
review_df['wine_id'] = review_df['wine_id'].astype(int).astype('category')

In [None]:
review_df = review_df[review_df['user_url'].notna()]

In [None]:
user2idx = {feature: index for index, feature in enumerate(users)}
idx2user = {index: feature for index, feature in enumerate(users)}

In [None]:
users = set(review_df['user_url'].unique())

In [None]:
review_df['email'] = review_df['email'].apply(lambda x: str(x) + "@example.com")

In [None]:
review_df = review_df[review_df['wine_id'].notna()]
review_df['wine_id'] = review_df['wine_id'].astype(int).astype('category')

In [64]:
item_data = pd.read_csv('/opt/ml/wine/data/item_data.csv',encoding='utf-8-sig')


In [65]:
item_data['vectors'][0][-19:]

' 0.2325901836156845'

In [61]:
item_data['vectors'] = item_data['vectors'].apply(string2array)

In [62]:
wine_vectors = []
for vector in item_data['vectors']: wine_vectors.append(vector)
wine_vectors = np.array(wine_vectors)

In [63]:
wine_vectors.shape

(71348, 768)

In [None]:
review_df.to_csv('/opt/ml/wine/data/inter.csv',index= False, encoding = 'utf-8-sig')

In [None]:
sam = pd.read_csv('/opt/ml/wine/Recbole/item_data_js.csv',encoding='utf-8-sig')

In [None]:
sam['vectors'] = sam['vectors'].apply(string2array)
wine_vectors = []
for vector in sam['vectors']: wine_vectors.append(vector)
wine_vectors = np.array(wine_vectors)


In [59]:
wine_vectors.shape

(5, 512)