In [202]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import datetime
from scipy import stats
import requests
from requests import get
from bs4 import BeautifulSoup
import xgboost as xgb
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from numpy import absolute
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [203]:
df_tr = pd.read_csv('transaction.csv', header = None, index_col = 0)
df_tr.columns = ['type', 'from', 'to', 'date', 'punk_id', 'amount_in_eth', 'amount_in_dol']
df_tr = df_tr[ ['punk_id'] + [ col for col in df_tr.columns if col != 'punk_id' ] ]
df_tr.head()

Unnamed: 0_level_0,punk_id,type,from,to,date,amount_in_eth,amount_in_dol
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,Bid Withdrawn,0x983ace,,2021-05-10,250.0,1030000.0
1,0,Bid,0x983ace,,2021-04-11,250.0,537615.0
2,0,Bid Withdrawn,0xd7510a,,2021-03-03,100.0,160973.0
3,0,Bid,0xd7510a,,2021-02-20,100.0,188897.0
4,0,Bid Withdrawn,natealex,,2020-09-16,69.0,25532.0


In [204]:
df_tr['punk_id'] = df_tr['punk_id'].astype('category')
df_tr['type'] = df_tr['type'].astype('category')
df_tr['from'] = df_tr['from'].astype(str)
df_tr['to'] = df_tr['to'].astype(str)
df_tr['date'] = pd.to_datetime(df_tr['date'], format = '%Y.%m.%d')

In [205]:
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].astype(str)
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('Y', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('Z', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('M', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('T', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('B', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].replace(',', '', regex=True)
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].astype(float)

df_tr['amount_in_dol'] = df_tr['amount_in_dol'].astype(str)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('Y', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('B', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('T', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('P', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('Z', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].replace('<', '', regex=True)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].replace(',', '', regex=True)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].astype(float)

In [206]:
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].fillna(0)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].fillna(0)

In [207]:
df_acc = pd.read_csv('accessories.csv', header=None, sep='\n')
df_acc = df_acc[0].str.split(',', expand=True)
df_acc.drop(0, axis=1, inplace=True)
df_acc.head()

Unnamed: 0,1,2,3,4,5,6,7,8
0,Female,Green Eye Shadow,Earring,Blonde Bob,,,,
1,Male,Smile,Mohawk,,,,,
2,Female,Wild Hair,,,,,,
3,Male,Wild Hair,Nerd Glasses,Pipe,,,,
4,Male,Big Shades,Wild Hair,Earring,Goat,,,


In [208]:
dict={1:'gender', 
      2:  'acc_1',
      3:'acc_2', 
      4: 'acc_3',
      5:'acc_4',
      6: 'acc_5',
      7:'acc_6',
      8:'acc_7'}
df_acc.rename(columns=dict,
          inplace=True)
  
df_acc.head()

Unnamed: 0,gender,acc_1,acc_2,acc_3,acc_4,acc_5,acc_6,acc_7
0,Female,Green Eye Shadow,Earring,Blonde Bob,,,,
1,Male,Smile,Mohawk,,,,,
2,Female,Wild Hair,,,,,,
3,Male,Wild Hair,Nerd Glasses,Pipe,,,,
4,Male,Big Shades,Wild Hair,Earring,Goat,,,


# Transactions

In [209]:
df_tr = df_tr[df_tr['type'].str.contains("Bid") | df_tr['type'].str.contains("Sold")]
df_tr = df_tr[~df_tr.type.str.contains("Withdrawn")] 
df_tr = df_tr.drop(['from', 'to', 'date', 'type'], axis=1)

df_tr = df_tr.groupby(['punk_id'])['amount_in_eth', 'amount_in_dol'].mean().reset_index()

df_tr

  df_tr = df_tr.groupby(['punk_id'])['amount_in_eth', 'amount_in_dol'].mean().reset_index()


Unnamed: 0,punk_id,amount_in_eth,amount_in_dol
0,0,29.910455,35933.818182
1,1,17.956667,8037.466667
2,2,4.655714,2023.214286
3,3,3.890000,1304.272727
4,4,6.250000,12092.166667
...,...,...,...
9995,9995,19.016667,44053.333333
9996,9996,0.525000,716.500000
9997,9997,69.071818,142668.818182
9998,9998,27.666667,36981.333333


# Accessories

In [210]:
# helper functions
def fix_eth(x):
  x=str(x)
  x=x.replace('Ξ', '')
  x=x.replace('<', '')
  x=x.replace(',', '')
  if 'K' in x:
    x=float(x.replace('K', ''))
    x=x*1000
    return x
  return x

In [211]:
page = requests.get("https://www.larvalabs.com/cryptopunks/attributes")
soup = BeautifulSoup(page.text, "html.parser")

tr_hist = soup.find_all('table')
df_acc_att = pd.read_html(str(tr_hist))[1]

df_acc_att = pd.DataFrame(df_acc_att.to_records())
df_acc_att.columns
dict={'index':'index',
      "('Attributes', 'Attribute')":'attributes',
      "('Unnamed: 1_level_0', '#')":'number',
      "('Unnamed: 2_level_0', 'Avail')":'avail',
      "('Unnamed: 3_level_0', 'Avg Sale')":"avg_sale",
      "('Unnamed: 4_level_0', 'Cheapest')":'cheapest',
      "('Unnamed: 5_level_0', 'More Examples')":'more_examples'}
df_acc_att.rename(columns=dict,
          inplace=True)
df_acc_att.columns

df_acc_att.avg_sale=df_acc_att.avg_sale.apply(fix_eth)
df_acc_att.cheapest=df_acc_att.cheapest.apply(fix_eth)

df_acc_att.avg_sale = df_acc_att.avg_sale.astype(np.float32)
df_acc_att.cheapest = df_acc_att.cheapest.astype(np.float32)
df_acc_att.attributes = df_acc_att.attributes.astype('category')
df_acc_att.drop('more_examples', axis=1, inplace=True)
df_acc_att.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   index       87 non-null     int64   
 1   attributes  87 non-null     category
 2   number      87 non-null     int64   
 3   avail       87 non-null     int64   
 4   avg_sale    87 non-null     float32 
 5   cheapest    87 non-null     float32 
dtypes: category(1), float32(2), int64(3)
memory usage: 5.7 KB


In [212]:
list_att = df_acc_att.attributes
list_att

0              Beanie
1              Choker
2        Pilot Helmet
3               Tiara
4         Orange Side
           ...       
82               Mole
83    Purple Lipstick
84       Hot Lipstick
85          Cigarette
86            Earring
Name: attributes, Length: 87, dtype: category
Categories (87, object): ['3D Glasses', 'Bandana', 'Beanie', 'Big Beard', ..., 'Welding Goggles', 'Wild Blonde', 'Wild Hair', 'Wild White Hair']

In [213]:
g = pd.DataFrame(columns=list_att)

g.rename(columns={'index':'number'},
          inplace=True)
g

attributes,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,Pigtails,Pink With Hat,Top Hat,...,Regular Shades,Horned Rim Glasses,Big Shades,Nerd Glasses,Black Lipstick,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring


In [214]:
x=g.copy()

In [215]:
for counter in range(df_acc.shape[0]):
  for i in g:
    if df_acc.loc[counter,:].str.contains(i).any():
      # print(counter, ' contains ', i)
      x.loc[counter,i] = 1



x.to_csv('accessories_transformed.csv', mode='w', index=False)

In [216]:
x = x.fillna(0)
x

attributes,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,Pigtails,Pink With Hat,Top Hat,...,Regular Shades,Horned Rim Glasses,Big Shades,Nerd Glasses,Black Lipstick,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [217]:
types = df_acc['gender']
types

0       Female
1         Male
2       Female
3         Male
4         Male
         ...  
9995    Female
9996      Male
9997    Zombie
9998    Female
9999    Female
Name: gender, Length: 10000, dtype: object

In [218]:
types_dummies = pd.get_dummies(types)
types_dummies

Unnamed: 0,Alien,Ape,Female,Male,Zombie
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
9995,0,0,1,0,0
9996,0,0,0,1,0
9997,0,0,0,0,1
9998,0,0,1,0,0


# Simple Linear Regression

In [219]:
df = df_tr.join(x, on=df_tr.index, how='left', lsuffix='_', rsuffix='_')
df = df.join(types_dummies, on=df_tr.index, how='left', lsuffix='_', rsuffix='_')
df

Unnamed: 0,punk_id,amount_in_eth,amount_in_dol,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,...,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring,Alien,Ape,Female,Male,Zombie
0,0,29.910455,35933.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,0,1,0,0
1,1,17.956667,8037.466667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
2,2,4.655714,2023.214286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0
3,3,3.890000,1304.272727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
4,4,6.250000,12092.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,19.016667,44053.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0
9996,9996,0.525000,716.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,1,0
9997,9997,69.071818,142668.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1
9998,9998,27.666667,36981.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0


In [228]:
df['amount_in_eth'] = df['amount_in_eth'].fillna(0)
df.to_csv('avg_price.csv', mode='w', index=False)

In [220]:
y = df['amount_in_eth']
y.sort_values(ascending=False)

3100    644.497500
7804    345.929189
2066    333.800000
6089    329.949333
3443    320.036452
           ...    
7841      0.000000
5658      0.000000
1792      0.000000
4067      0.000000
4849      0.000000
Name: amount_in_eth, Length: 10000, dtype: float64

In [221]:
X = df.iloc[:,3:]
X = X.fillna(0)
X = X.astype(int)
X

Unnamed: 0,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,Pigtails,Pink With Hat,Top Hat,...,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring,Alien,Ape,Female,Male,Zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [223]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [224]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [183]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
Beanie,2.137130e+01
Choker,5.882755e+00
Pilot Helmet,7.054829e+00
Tiara,6.137795e+00
Orange Side,1.301302e+01
...,...
Alien,2.560074e+13
Ape,2.560074e+13
Female,2.560074e+13
Male,2.560074e+13


In [225]:
y_pred = regressor.predict(X_test)
df_check = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_check

Unnamed: 0,Actual,Predicted
9394,0.900000,13.589844
898,7.590000,4.726562
2398,5.270000,5.371094
5906,0.000000,3.160156
2343,2.550000,7.316406
...,...,...
4004,15.512500,11.496094
7375,1.415000,2.296875
9307,8.990000,3.640625
8394,16.806250,7.046875


In [226]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 8.453982026262405
Mean Squared Error: 221.99539966217793
Root Mean Squared Error: 14.899510047722305
