In [1]:
import json

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas
import requests
import seaborn as sns
import torch
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
# create a client instance of the library
#elastic_client = Elasticsearch(timeout=60, max_retries=10, retry_on_timeout=True)
#elastic_client.cluster.health(wait_for_status='yellow', request_timeout=55)

In [None]:
import pickle
# Location of the pickled object that the following cell loads into `elastic_df`.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# unchanged on a machine with this directory layout.
pickle_save_path = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\elastic_df.pkl'

In [None]:
# Restore the cached object from disk; the rest of the notebook uses it as a DataFrame.
# NOTE(review): unpickling can execute code embedded in the file — only load
# pickles that were produced by a trusted source.
with open(pickle_save_path, 'rb') as pickle_file:
    elastic_df = pickle.load(pickle_file)

### Summary statistics

In [None]:
## Sales feature

from scipy import stats

sale_usd = elastic_df['sale_usd']

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but its value is never displayed.
print(sale_usd.describe())

# `mean` and `std` are module-level on purpose: the outlier filter reads them.
# np.std is called with its default ddof=0 (population standard deviation).
mean = np.mean(sale_usd)
std = np.std(sale_usd)
median = np.median(sale_usd)
print('Mean of sales is', mean)
print('Std. deviation is', std)
print('Median is', median)

In [None]:
# Drop upper-tail outliers: rows whose sale price lies more than `threshold`
# standard deviations above the mean. Low prices are never removed.
threshold = 3

# Vectorised z-score; a NaN z-score compares False and the row is therefore kept.
z_scores = (elastic_df['sale_usd'] - mean) / std
is_outlier = z_scores > threshold

# Removed prices, kept as a plain list under the established name.
outlier = elastic_df.loc[is_outlier, 'sale_usd'].tolist()
print(len(outlier))

# A boolean mask replaces the per-row `value not in list` scan; the same rows survive.
elastic_df = elastic_df[~is_outlier].reset_index(drop=True)

### Binning the target (sale price)

In [None]:
# Upper limit for the price bins: the largest sale plus 1, because without the
# margin the maximum value was sometimes not binned.
max_usd = 1 + np.max(elastic_df['sale_usd'])
print('Max price: {}'.format(max_usd))

In [None]:
# Sale-price distribution over 20 logarithmically spaced bin edges between 1 and max_usd.
log_edges = np.logspace(start=np.log10(1), stop=np.log10(max_usd), num=20)
ax = plt.gca()
ax.hist(elastic_df['sale_usd'], bins=log_edges)
ax.set_xscale("log")
plt.show()

In [None]:
# Bin edges used for the target: 20 values, log-spaced from 1 up to max_usd.
log_lower, log_upper = np.log10(1), np.log10(max_usd)
bins = np.logspace(log_lower, log_upper, num=20)
bins

In [None]:
def bin_number(price, edges=None):
    """Return the index of the bin that contains ``price``.

    Bin ``0`` covers ``(0, edges[0]]``; bin ``i`` (``i >= 1``) covers
    ``(edges[i-1], edges[i]]``.

    Parameters
    ----------
    price : number
        Value to classify.
    edges : sequence of numbers, optional
        Ascending upper bin edges. Defaults to the module-level ``bins``.

    Returns
    -------
    int or None
        The bin index, or ``None`` when ``price`` is not positive, lies above
        the last edge, or is NaN.
    """
    if edges is None:
        edges = bins
    # side='left' yields the first i with price <= edges[i], i.e. the bin whose
    # half-open interval (edges[i-1], edges[i]] holds the price.
    position = int(np.searchsorted(edges, price, side='left'))
    if position >= len(edges):
        return None  # above the last edge (NaN also sorts past the end)
    if position == 0 and not price > 0:
        return None  # the first bin starts at 0, exclusive
    return position
        
# Map every sale price to the result of bin_number and store it as a new column.
elastic_df['bin_log_usd'] = elastic_df['sale_usd'].apply(bin_number)

In [None]:
# "lower-upper" text label for every bin, one decimal place each; the first
# bin starts at 0, every later bin starts at the previous edge.
lower_edges = [0, *bins[:-1]]
bin_labels = [
    '{:.1f}-{:.1f}'.format(lower, upper)
    for lower, upper in zip(lower_edges, bins)
]
bin_labels

In [None]:
# Histogram of the bin indices, one bar per bin, each bar annotated with its price range.
fig, ax = plt.subplots(1, 1, figsize=(30,20))

# Explicit edges centred on the integer codes (k - 0.5 .. k + 0.5) guarantee
# that bar k counts exactly bin k. A plain bin *count* would stretch the bars
# over the observed min..max codes, so the labels would drift off their bars
# whenever the lowest or highest bin is empty.
index_edges = np.arange(len(bin_labels) + 1) - 0.5
ax.hist(elastic_df['bin_log_usd'], bins=index_edges, align='mid')

# Set title
ax.set_title("USD")
# adding labels
ax.set_xlabel('x-label')
ax.set_ylabel('y-label')

# Write the price range of each bin just above its bar.
rects = ax.patches
for rect, label in zip(rects, bin_labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height+0.01, label,
            ha='center', va='bottom', fontsize=20)

# Tick labels are drawn at font size 0 because the bars carry their own labels.
plt.xticks(fontsize=0)
# Show plot
plt.show()

In [None]:
# Categorical copy of the bin-index column; this is the column later used as the target Y.
elastic_df['cat_usd'] = elastic_df['bin_log_usd'].astype('category')

In [None]:
# Two-way lookup between a bin's position and its "lower-upper" label.
idx_to_class = dict(enumerate(bin_labels))
class_to_idx = {label: position for position, label in enumerate(bin_labels)}

class_to_idx
# Restart for ResNet
# Step 3

### Split into features (X) and target (Y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Target: the categorical price-bin column.
Y = elastic_df['cat_usd'].copy()

# Feature frame. Copied (like Y) so that X is independent of elastic_df: the
# following cells assign columns into X, which on a bare column subset raises
# pandas' SettingWithCopyWarning.
X = elastic_df[['contract_scheme',
                'sale_time', 'collection_created_year', 'unique_asset', 'instagram_account',
                'twitter_account', 'name_tok', 'creator_tok', 'collection_name_tok', 'instagram_tok',
                'twitter_tok', 'id', 'preview_path', 'img_path',
                'word_count_coll_desc', 'word_count_descr', 'z_twitter_follower']].copy()

In [None]:
# Columns that are treated as categorical features.
cat_col = [
    'contract_scheme',
    'unique_asset',
    'instagram_account',
    'twitter_account',
    'name_tok',
    'creator_tok',
    'collection_name_tok',
    'instagram_tok',
    'twitter_tok',
]

In [None]:
# Convert every categorical feature column of X to pandas' category dtype.
for column_name in cat_col:
    X[column_name] = X[column_name].astype('category')

In [None]:
# Columns that are treated as numeric features.
num_col = [
    'sale_time',
    'collection_created_year',
    'word_count_coll_desc',
    'word_count_descr',
    'z_twitter_follower',
]

In [None]:
# Scaler for the numeric feature columns, constructed with its default settings.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the numeric columns and write each scaled column back into X.
# NOTE(review): the scaler is fitted on all rows before the train/valid/test
# split, so statistics of the validation and test rows influence the scaling —
# consider fitting on the training split only.
scaled = scaler.fit_transform(X[num_col])

# Column j of `scaled` belongs to num_col[j]; slicing the array replaces the
# element-by-element Python copy.
for position, col in enumerate(num_col):
    X[col] = scaled[:, position]
X

In [None]:
# Copy the identifier and image-path columns from elastic_df into X again.
# NOTE(review): X was already built with these three columns, so this looks redundant — confirm.
for passthrough_column in ('id', 'preview_path', 'img_path'):
    X[passthrough_column] = elastic_df[passthrough_column]

### Textual cat features transform

In [None]:
def top_cat_values(cat, number):
    """Encode a column by the rank of its most frequent values.

    The ``number - 1`` first entries of ``cat.value_counts()`` receive the
    codes ``0 .. number - 2`` in that order; every other value receives the
    shared fallback code ``number - 1``.

    Parameters
    ----------
    cat : pandas.Series
        Column to encode.
    number : int
        Total number of codes, the fallback included.

    Returns
    -------
    pandas.Series
        Codes with dtype ``category``, carrying the same index as ``cat`` so
        that assigning the result back to a frame aligns row by row.
    """
    top_values = cat.value_counts().iloc[:number - 1].index.tolist()
    default_cat = number - 1
    # A dict lookup keeps the encoding linear in len(cat); scanning the list of
    # top values for every row costs len(cat) * number comparisons.
    code_of = {value: code for code, value in enumerate(top_values)}
    codes = [code_of.get(value, default_cat) for value in cat]
    return pandas.Series(codes, index=cat.index).astype('category')


In [None]:
# Re-encode each token column with top_cat_values, allowing 4096 codes per column.
for token_column in ('instagram_tok', 'name_tok', 'collection_name_tok',
                     'twitter_tok', 'creator_tok'):
    X[token_column] = top_cat_values(X[token_column], 4096)

In [None]:
# For every categorical column: its number of categories, and the pair
# (number of categories, min(50, half of it rounded up)).
categorical_column_sizes = []
categorical_embedding_sizes = []
for column in cat_col:
    cardinality = len(X[column].cat.categories)
    categorical_column_sizes.append(cardinality)
    categorical_embedding_sizes.append((cardinality, min(50, (cardinality + 1) // 2)))
print(categorical_embedding_sizes)

### Splitting

In [None]:
# Half of the rows are used for training.
train_size = 0.5

X_train, X_rem, y_train, y_rem = train_test_split(X, Y, train_size=train_size, random_state=69)

# The validation and test sets should be equally large, so the remaining rows
# are halved: with train_size = 0.5 each of them holds 25% of the overall data.
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=test_size, random_state=42)

# One print per statement; `print(a), print(b)` as the last line of a cell
# makes the notebook echo the resulting tuple `(None, None)`.
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)
print(y_test.shape)

### Prepare tabular data as tensors

In [None]:
def stack_cat(dataframe):
    """Build an int64 tensor of the categorical features, one column per feature.

    The first four columns contribute their pandas category codes; the five
    ``*_tok`` columns contribute their stored values directly.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing all nine categorical feature columns; the first four
        must have category dtype.

    Returns
    -------
    torch.Tensor
        Tensor of dtype ``torch.int64`` with shape (rows, 9).
    """
    coded_columns = ('contract_scheme', 'unique_asset', 'instagram_account', 'twitter_account')
    token_columns = ('name_tok', 'creator_tok', 'collection_name_tok', 'instagram_tok', 'twitter_tok')

    feature_arrays = [dataframe[name].cat.codes.values for name in coded_columns]
    feature_arrays.extend(dataframe[name].values for name in token_columns)

    # axis=1 puts each feature in its own column.
    return torch.tensor(np.stack(feature_arrays, 1), dtype=torch.int64)

In [None]:
def stack_num(dataframe):
    """Build a float tensor of the numeric features, one column per feature.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the five numeric feature columns.

    Returns
    -------
    torch.Tensor
        Tensor of dtype ``torch.float`` with shape (rows, 5).
    """
    numeric_columns = ('sale_time', 'collection_created_year', 'word_count_coll_desc',
                       'word_count_descr', 'z_twitter_follower')
    feature_arrays = [dataframe[name].values for name in numeric_columns]
    return torch.tensor(np.stack(feature_arrays, 1), dtype=torch.float)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device("cpu") unconditionally.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Categorical and numeric feature tensors for each split, moved to `device`.
# X_train / X_valid / X_test are intentionally not deleted here: they are
# pickled at the end of the notebook.
train_cat, train_num = stack_cat(X_train).to(device), stack_num(X_train).to(device)

valid_cat, valid_num = stack_cat(X_valid).to(device), stack_num(X_valid).to(device)

test_cat, test_num = stack_cat(X_test).to(device), stack_num(X_test).to(device)

In [None]:
def output_prep(dataframe_y):
    """Convert label values to a one-dimensional tensor.

    Parameters
    ----------
    dataframe_y : array-like
        Label values accepted by ``torch.tensor``.

    Returns
    -------
    torch.Tensor
        The values as a tensor, flattened to a single dimension.
    """
    return torch.tensor(dataframe_y).flatten()

# Replace the three label objects by flat tensors on `device`.
# NOTE(review): the names are rebound from the split labels to tensors, so this
# cell expects freshly split labels and is not meant to be run twice in a row.
y_train, y_valid, y_test = [
    output_prep(labels.values).to(device)
    for labels in (y_train, y_valid, y_test)
]

### Pickle 

In [None]:
## Pickle

# Save
# ("Save" is a heading, so it has to be a comment: as a bare word it is an
# undefined name and raises NameError before anything is written.)

# NOTE(review): absolute, machine-specific Windows paths.
pickle_save_path_X_train = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\X_train.pkl'
pickle_save_path_y_train = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\y_train.pkl'
pickle_save_path_X_valid = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\X_valid.pkl'
pickle_save_path_y_valid = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\y_valid.pkl'
pickle_save_path_X_test = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\X_test.pkl'
pickle_save_path_y_test = 'D:\\Code\\datascience\\MA_NFT\\data\\pickle\\y_test.pkl'

# Write every split to its own file, in the same order as before.
# The y_* objects are the tensors created from the labels, already moved to `device`.
for save_path, split_object in (
    (pickle_save_path_X_train, X_train),
    (pickle_save_path_y_train, y_train),
    (pickle_save_path_X_valid, X_valid),
    (pickle_save_path_y_valid, y_valid),
    (pickle_save_path_X_test, X_test),
    (pickle_save_path_y_test, y_test),
):
    with open(save_path, 'wb') as path_name:
        pickle.dump(split_object, path_name)