# Art

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score, confusion_matrix
import warnings
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, BaseCrossValidator, cross_val_predict

In [2]:
# Section "Art": read the preprocessed art CSV (path is relative to the
# notebook's working directory) into the working frame `df`.
path = './data/preprocessed_ver2_art.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Discard the columns that are not used downstream (free-text / raw fields and
# the author identifier), all in a single in-place drop.
df.drop(columns=['title', 'standard', 'technique', 'mate_oil', 'author'], inplace=True)

# Move `price` to position 0 and `date` to position 1 so the bookkeeping
# columns sit at the front of the frame and features can be sliced by position.
price = df.pop('price')
df.insert(0, 'price', price)
date = df.pop('date')
df.insert(1, 'date', date)

# Parse the date column so it can be compared against date strings later on.
df['date'] = pd.to_datetime(date)

In [4]:
# Restrict `df` to the window used for training: drop rows dated 2021-01-01 or
# later, then rows dated before 2016-01-01. Rows are removed by index label;
# rows whose date compares False on both tests are kept.
rows_2021_onward = df.index[df['date'] >= '2021-01-01']
df = df.drop(index=rows_2021_onward)
rows_before_2016 = df.index[df['date'] < '2016-01-01']
df = df.drop(index=rows_before_2016)

In [5]:
# Take the `classify_3` column out of the frame and put it back at position 3
# under the name `label`; it is the classification target used below.
label = df.pop('classify_3')
df.insert(loc=3, column='label', value=label)

In [6]:
# Positions 0-3 hold price, date, one further non-feature column and the
# label; every column from position 4 onward is a model input.
first_feature_pos = 4
feature = df.columns[first_feature_pos:]
feature

Index(['year', 'genre', 'source', 'area', 'online', 'base_charcoal',
       'base_paper', 'base_fiber', 'base_canvas', 'base_hardboard',
       'base_hanji', 'base_ceramic', 'base_terracotta', 'base_masonite',
       'base_wood', 'base_leaf', 'base_silverpaper', 'base_metal',
       'mate_gouache', 'mate_maca', 'mate_magic', 'mate_signpen',
       'mate_colorpencil', 'mate_korean_ink', 'mate_korean_ink_and_color',
       'mate_watercolor', 'mate_pencil', 'mate_oil_and_color', 'mate_ink',
       'mate_color', 'mate_pen', 'mate_mixture', 'mate_pigment',
       'mate_graphite', 'mate_acrylic', 'mate_conte', 'mate_collage',
       'mate_engrave', 'mate_crayon', 'mate_enamel', 'mate_oilpastel', 'award',
       'exhbn_solo', 'exhbn_group', 'era', 'is_death', 'elapsed', 'height',
       'width', 'author_age'],
      dtype='object')

In [7]:
# One sub-frame per target class (label values 0, 1, 2, 3), followed by the
# complete frame as the last element of `df_list`.
df_0, df_1, df_2, df_3 = (df[df['label'] == class_id] for class_id in range(4))
df_list = [df_0, df_1, df_2, df_3, df]

In [8]:
from sklearn.metrics import classification_report

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print a per-class classification report followed by the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    pred : array-like
        Predicted class labels for the same rows as ``y_test``. The default of
        ``None`` exists only for signature compatibility; a real prediction
        array must be supplied.
    pred_proba : array-like, optional
        Accepted so existing call sites keep working; it is not used.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # The four class ids are passed explicitly so the report and the matrix
    # always have four rows. Without `labels`, classification_report raises
    # when the data contains fewer classes than there are target names.
    class_ids = [0, 1, 2, 3]
    target_names = ['Class 0', 'Class 1', 'Class 2', 'Class 3']

    report = classification_report(y_test, pred, labels=class_ids, target_names=target_names)
    confusion = confusion_matrix(y_test, pred, labels=class_ids)

    print(report)
    print('오차 행렬')  # heading text means "confusion matrix"
    print(confusion)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
rs = 42  # seed shared by every estimator in this notebook

# Untuned base estimators; the grid searches below clone and tune them.
rf_clf = RandomForestClassifier(
    criterion='gini',
    oob_score=True,
    n_jobs=-1,
    random_state=rs,
)
gbc_model = GradientBoostingClassifier(random_state=rs)
lr_clf = LogisticRegression(random_state=rs)

# Training target and design matrix taken from the filtered frame.
train_clf_target = df['label']
train_clf_data = df[feature]
feature
# lr_param_grid = {'C': [0.01, 0.1, 1, 10, 100],
#               'penalty': ["l1"],
#               'solver': ['liblinear', 'saga'],
#               'multi_class':['multinomial'],
#               'max_iter':[100,300,500,1000]}
# grid_search = GridSearchCV(lr_clf, lr_param_grid, scoring='accuracy', n_jobs=1, cv=10)
# grid_search.fit(data, target)
# print('test accuracy : ', grid_search.best_score_)
# print('최적의 parameters : ', grid_search.best_params_)

Index(['year', 'genre', 'source', 'area', 'online', 'base_charcoal',
       'base_paper', 'base_fiber', 'base_canvas', 'base_hardboard',
       'base_hanji', 'base_ceramic', 'base_terracotta', 'base_masonite',
       'base_wood', 'base_leaf', 'base_silverpaper', 'base_metal',
       'mate_gouache', 'mate_maca', 'mate_magic', 'mate_signpen',
       'mate_colorpencil', 'mate_korean_ink', 'mate_korean_ink_and_color',
       'mate_watercolor', 'mate_pencil', 'mate_oil_and_color', 'mate_ink',
       'mate_color', 'mate_pen', 'mate_mixture', 'mate_pigment',
       'mate_graphite', 'mate_acrylic', 'mate_conte', 'mate_collage',
       'mate_engrave', 'mate_crayon', 'mate_enamel', 'mate_oilpastel', 'award',
       'exhbn_solo', 'exhbn_group', 'era', 'is_death', 'elapsed', 'height',
       'width', 'author_age'],
      dtype='object')

In [None]:
# Random-forest search space: 4 * 4 * 3 * 3 = 144 combinations, each scored by
# 10-fold cross-validated accuracy on the training data.
rf_params = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[4, 8, 12, 16],
    min_samples_leaf=[8, 16, 32],
    min_samples_split=[8, 16, 32],
)

grid_cv = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_params,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
grid_cv.fit(train_clf_data, train_clf_target)

# Report the winning parameter set and its mean cross-validated accuracy
# (the printed headings are "best hyper-parameters" / "best accuracy").
best_rf_params, best_rf_score = grid_cv.best_params_, grid_cv.best_score_
print('최적 하이퍼 파라미터 : \n', best_rf_params)
print('최고 예측 정확도 평가 :{0:.4f}'.format(best_rf_score))

In [None]:
# Gradient-boosting search space, scored by 10-fold cross-validated accuracy.
param_test = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [4, 8, 12, 16],
    # The list used to be [0.01, 0.01, 0.1]; the repeated 0.01 refitted the
    # same 16 combinations for every fold without exploring anything new.
    # NOTE(review): if a third value (e.g. 0.001) was intended, add it here.
    "learning_rate": [0.01, 0.1],
}
gsearch = GridSearchCV(
    estimator=gbc_model,
    param_grid=param_test,
    scoring="accuracy",
    n_jobs=-1,
    cv=10,
)
gsearch.fit(train_clf_data, train_clf_target)

print("Best CV Score", gsearch.best_score_)
print("Best Params", gsearch.best_params_)

In [10]:
# Build the evaluation set: re-read the same CSV, apply the same column
# clean-up as the training frame, and keep only rows dated 2021-01-01 or later.
df_test = pd.read_csv(path)

df_test.drop(columns=['title', 'standard', 'technique', 'mate_oil', 'author'], inplace=True)
price = df_test.pop('price')
df_test.insert(0, 'price', price)   # price -> column 0
date = df_test.pop('date')
df_test.insert(1, 'date', date)     # date  -> column 1
df_test['date'] = pd.to_datetime(df_test['date'])
df_test = df_test.drop(df_test.query("date < '2021-01-01'").index)

# Same target handling as for the training frame: classify_3 -> `label` at
# column 3.
label = df_test.pop('classify_3')
df_test.insert(3, 'label', label)

# Features start at column 4, exactly as for the training matrix. Slicing from
# column 3 would put `label` itself into the test inputs, leaking the target
# and giving one column more than the models were fitted on.
feature = df.columns[4:]
test_data = df_test[feature]
test_target = df_test['label']

In [None]:
# Rebuild each model with the best grid-search settings, fit it on the full
# training data and score it on the test set. No logistic-regression model is
# evaluated here; its grid search is commented out earlier in the notebook.
randomforest_train_model = RandomForestClassifier(
    random_state=rs,
    n_jobs=-1,
    **grid_cv.best_params_,
)
gradientboost_train_model = GradientBoostingClassifier(
    random_state=rs,
    **gsearch.best_params_,
)

# --- Random forest ---
randomforest_train_model.fit(train_clf_data, train_clf_target)
rf_pred = randomforest_train_model.predict(test_data)
rf_class_probs = randomforest_train_model.predict_proba(test_data)
rf_proba = rf_class_probs[:, 1]  # second column of the probability matrix only
get_clf_eval(test_target, rf_pred, rf_proba)
print(f"Accuracy : {accuracy_score(test_target, rf_pred)}")

# --- Gradient boosting ---
gradientboost_train_model.fit(train_clf_data, train_clf_target)
model_predict = gradientboost_train_model.predict(test_data)
gb_class_probs = gradientboost_train_model.predict_proba(test_data)
model_proba = gb_class_probs[:, 1]  # second column of the probability matrix only
get_clf_eval(test_target, model_predict, model_proba)
print(f"Accuracy : {accuracy_score(test_target, model_predict)}")

## Art and Economy

In [None]:
# "Art and Economy" experiment: the same pipeline as the art-only section
# (train on 2016-2020, tune RF / GBC, score on 2021+) run on the economics CSV.
# NOTE(review): presumably this file adds economic indicator columns to the
# art features -- confirm against the CSV.
path = './data/preprocessed_ver2_economics.csv'

# ---- Load once and apply the shared column clean-up ------------------------
# Train and test frames are both cut from `df_all`, so they cannot end up
# preprocessed differently.
df_all = pd.read_csv(path)
df_all = df_all.drop(columns=['title', 'standard', 'technique', 'mate_oil', 'author'])
price = df_all.pop('price')
df_all.insert(0, 'price', price)    # price -> column 0
date = df_all.pop('date')
df_all.insert(1, 'date', date)      # date  -> column 1
df_all['date'] = pd.to_datetime(df_all['date'])

# ---- Train split: 2016-01-01 <= date < 2021-01-01 --------------------------
df = df_all.drop(df_all.query("date >= '2021-01-01'").index)
df = df.drop(df.query("date < '2016-01-01'").index)
label = df.pop('classify_3')
df.insert(3, 'label', label)        # target -> column 3

# ---- Test split: date >= 2021-01-01 ----------------------------------------
df_test = df_all.drop(df_all.query("date < '2021-01-01'").index)
label = df_test.pop('classify_3')
df_test.insert(3, 'label', label)

# Model inputs start at column 4 (columns 0-3 are price, date, one further
# non-feature column and the label). The same column list is used for train
# and test; the test set used to be sliced from column 3, which fed `label`
# itself to the models and mismatched the fitted feature count.
feature = df.columns[4:]
train_clf_data = df[feature]
train_clf_target = df['label']
test_data = df_test[feature]
test_target = df_test['label']

# Per-class views of the training frame, with the full frame as last element.
df_0, df_1, df_2, df_3 = (df[df['label'] == class_id] for class_id in range(4))
df_list = [df_0, df_1, df_2, df_3, df]

# ---- Base estimators --------------------------------------------------------
rs = 42
rf_clf = RandomForestClassifier(random_state=rs, n_jobs=-1, oob_score=True, criterion='gini')
lr_clf = LogisticRegression(random_state=rs)  # defined but not used in this cell
gbc_model = GradientBoostingClassifier(random_state=rs)

# ---- Random-forest grid search (10-fold CV accuracy) ------------------------
rf_params = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [4, 8, 12, 16],
    'min_samples_leaf': [8, 16, 32],
    'min_samples_split': [8, 16, 32],
}
grid_cv = GridSearchCV(rf_clf, param_grid=rf_params, scoring="accuracy", cv=10, n_jobs=-1)
grid_cv.fit(train_clf_data, train_clf_target)
# Printed headings mean "best hyper-parameters" / "best accuracy".
print('최적 하이퍼 파라미터 : \n', grid_cv.best_params_)
print('최고 예측 정확도 평가 :{0:.4f}'.format(grid_cv.best_score_))

# ---- Gradient-boosting grid search (10-fold CV accuracy) --------------------
param_test = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [4, 8, 12, 16],
    # Was [0.01, 0.01, 0.1]: the duplicate only repeated identical fits.
    # NOTE(review): add e.g. 0.001 here if a third value was intended.
    "learning_rate": [0.01, 0.1],
}
gsearch = GridSearchCV(estimator=gbc_model, param_grid=param_test, scoring="accuracy", n_jobs=-1, cv=10)
gsearch.fit(train_clf_data, train_clf_target)
print("Best CV Score", gsearch.best_score_)
print("Best Params", gsearch.best_params_)

# ---- Refit with the best settings and score on the test split ---------------
randomforest_train_model = RandomForestClassifier(**grid_cv.best_params_, random_state=rs, n_jobs=-1)
gradientboost_train_model = GradientBoostingClassifier(**gsearch.best_params_, random_state=rs)

randomforest_train_model.fit(train_clf_data, train_clf_target)
rf_pred = randomforest_train_model.predict(test_data)
# Only the second probability column is kept; get_clf_eval does not use it.
rf_proba = randomforest_train_model.predict_proba(test_data)[:, 1]
get_clf_eval(test_target, rf_pred, rf_proba)
print(f"Accuracy : {accuracy_score(test_target, rf_pred)}")

gradientboost_train_model.fit(train_clf_data, train_clf_target)
model_predict = gradientboost_train_model.predict(test_data)
model_proba = gradientboost_train_model.predict_proba(test_data)[:, 1]
get_clf_eval(test_target, model_predict, model_proba)
print(f"Accuracy : {accuracy_score(test_target, model_predict)}")

## Art and Trend

In [None]:
# "Art and Trend" experiment: the same pipeline as the art-only section
# (train on 2016-2020, tune RF / GBC, score on 2021+) run on the trend CSV.
# NOTE(review): presumably this file adds trend-related columns to the art
# features -- confirm against the CSV.
path = './data/preprocessed_ver2_trend.csv'

# ---- Load once and apply the shared column clean-up ------------------------
# Train and test frames are both cut from `df_all`, so they cannot end up
# preprocessed differently.
df_all = pd.read_csv(path)
df_all = df_all.drop(columns=['title', 'standard', 'technique', 'mate_oil', 'author'])
price = df_all.pop('price')
df_all.insert(0, 'price', price)    # price -> column 0
date = df_all.pop('date')
df_all.insert(1, 'date', date)      # date  -> column 1
df_all['date'] = pd.to_datetime(df_all['date'])

# ---- Train split: 2016-01-01 <= date < 2021-01-01 --------------------------
df = df_all.drop(df_all.query("date >= '2021-01-01'").index)
df = df.drop(df.query("date < '2016-01-01'").index)
label = df.pop('classify_3')
df.insert(3, 'label', label)        # target -> column 3

# ---- Test split: date >= 2021-01-01 ----------------------------------------
df_test = df_all.drop(df_all.query("date < '2021-01-01'").index)
label = df_test.pop('classify_3')
df_test.insert(3, 'label', label)

# Model inputs start at column 4 (columns 0-3 are price, date, one further
# non-feature column and the label). The same column list is used for train
# and test; the test set used to be sliced from column 3, which fed `label`
# itself to the models and mismatched the fitted feature count.
feature = df.columns[4:]
train_clf_data = df[feature]
train_clf_target = df['label']
test_data = df_test[feature]
test_target = df_test['label']

# Per-class views of the training frame, with the full frame as last element.
df_0, df_1, df_2, df_3 = (df[df['label'] == class_id] for class_id in range(4))
df_list = [df_0, df_1, df_2, df_3, df]

# ---- Base estimators --------------------------------------------------------
rs = 42
rf_clf = RandomForestClassifier(random_state=rs, n_jobs=-1, oob_score=True, criterion='gini')
lr_clf = LogisticRegression(random_state=rs)  # defined but not used in this cell
gbc_model = GradientBoostingClassifier(random_state=rs)

# ---- Random-forest grid search (10-fold CV accuracy) ------------------------
rf_params = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [4, 8, 12, 16],
    'min_samples_leaf': [8, 16, 32],
    'min_samples_split': [8, 16, 32],
}
grid_cv = GridSearchCV(rf_clf, param_grid=rf_params, scoring="accuracy", cv=10, n_jobs=-1)
grid_cv.fit(train_clf_data, train_clf_target)
# Printed headings mean "best hyper-parameters" / "best accuracy".
print('최적 하이퍼 파라미터 : \n', grid_cv.best_params_)
print('최고 예측 정확도 평가 :{0:.4f}'.format(grid_cv.best_score_))

# ---- Gradient-boosting grid search (10-fold CV accuracy) --------------------
param_test = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [4, 8, 12, 16],
    # Was [0.01, 0.01, 0.1]: the duplicate only repeated identical fits.
    # NOTE(review): add e.g. 0.001 here if a third value was intended.
    "learning_rate": [0.01, 0.1],
}
gsearch = GridSearchCV(estimator=gbc_model, param_grid=param_test, scoring="accuracy", n_jobs=-1, cv=10)
gsearch.fit(train_clf_data, train_clf_target)
print("Best CV Score", gsearch.best_score_)
print("Best Params", gsearch.best_params_)

# ---- Refit with the best settings and score on the test split ---------------
randomforest_train_model = RandomForestClassifier(**grid_cv.best_params_, random_state=rs, n_jobs=-1)
gradientboost_train_model = GradientBoostingClassifier(**gsearch.best_params_, random_state=rs)

randomforest_train_model.fit(train_clf_data, train_clf_target)
rf_pred = randomforest_train_model.predict(test_data)
# Only the second probability column is kept; get_clf_eval does not use it.
rf_proba = randomforest_train_model.predict_proba(test_data)[:, 1]
get_clf_eval(test_target, rf_pred, rf_proba)
print(f"Accuracy : {accuracy_score(test_target, rf_pred)}")

gradientboost_train_model.fit(train_clf_data, train_clf_target)
model_predict = gradientboost_train_model.predict(test_data)
model_proba = gradientboost_train_model.predict_proba(test_data)[:, 1]
get_clf_eval(test_target, model_predict, model_proba)
print(f"Accuracy : {accuracy_score(test_target, model_predict)}")

## Art and Economy and Trend

In [None]:
# "Art and Economy and Trend" experiment: the same pipeline as the art-only
# section (train on 2016-2020, tune RF / GBC, score on 2021+) run on the
# combined CSV.
# NOTE(review): presumably this file carries art, economy and trend columns
# together -- confirm against the CSV.
path = './data/preprocessed_ver2.csv'

# ---- Load once and apply the shared column clean-up ------------------------
# Train and test frames are both cut from `df_all`, so they cannot end up
# preprocessed differently.
df_all = pd.read_csv(path)
df_all = df_all.drop(columns=['title', 'standard', 'technique', 'mate_oil', 'author'])
price = df_all.pop('price')
df_all.insert(0, 'price', price)    # price -> column 0
date = df_all.pop('date')
df_all.insert(1, 'date', date)      # date  -> column 1
df_all['date'] = pd.to_datetime(df_all['date'])

# ---- Train split: 2016-01-01 <= date < 2021-01-01 --------------------------
df = df_all.drop(df_all.query("date >= '2021-01-01'").index)
df = df.drop(df.query("date < '2016-01-01'").index)
label = df.pop('classify_3')
df.insert(3, 'label', label)        # target -> column 3

# ---- Test split: date >= 2021-01-01 ----------------------------------------
df_test = df_all.drop(df_all.query("date < '2021-01-01'").index)
label = df_test.pop('classify_3')
df_test.insert(3, 'label', label)

# Model inputs start at column 4 (columns 0-3 are price, date, one further
# non-feature column and the label). The same column list is used for train
# and test; the test set used to be sliced from column 3, which fed `label`
# itself to the models and mismatched the fitted feature count.
feature = df.columns[4:]
train_clf_data = df[feature]
train_clf_target = df['label']
test_data = df_test[feature]
test_target = df_test['label']

# Per-class views of the training frame, with the full frame as last element.
df_0, df_1, df_2, df_3 = (df[df['label'] == class_id] for class_id in range(4))
df_list = [df_0, df_1, df_2, df_3, df]

# ---- Base estimators --------------------------------------------------------
rs = 42
rf_clf = RandomForestClassifier(random_state=rs, n_jobs=-1, oob_score=True, criterion='gini')
lr_clf = LogisticRegression(random_state=rs)  # defined but not used in this cell
gbc_model = GradientBoostingClassifier(random_state=rs)

# ---- Random-forest grid search (10-fold CV accuracy) ------------------------
rf_params = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [4, 8, 12, 16],
    'min_samples_leaf': [8, 16, 32],
    'min_samples_split': [8, 16, 32],
}
grid_cv = GridSearchCV(rf_clf, param_grid=rf_params, scoring="accuracy", cv=10, n_jobs=-1)
grid_cv.fit(train_clf_data, train_clf_target)
# Printed headings mean "best hyper-parameters" / "best accuracy".
print('최적 하이퍼 파라미터 : \n', grid_cv.best_params_)
print('최고 예측 정확도 평가 :{0:.4f}'.format(grid_cv.best_score_))

# ---- Gradient-boosting grid search (10-fold CV accuracy) --------------------
param_test = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [4, 8, 12, 16],
    # Was [0.01, 0.01, 0.1]: the duplicate only repeated identical fits.
    # NOTE(review): add e.g. 0.001 here if a third value was intended.
    "learning_rate": [0.01, 0.1],
}
gsearch = GridSearchCV(estimator=gbc_model, param_grid=param_test, scoring="accuracy", n_jobs=-1, cv=10)
gsearch.fit(train_clf_data, train_clf_target)
print("Best CV Score", gsearch.best_score_)
print("Best Params", gsearch.best_params_)

# ---- Refit with the best settings and score on the test split ---------------
randomforest_train_model = RandomForestClassifier(**grid_cv.best_params_, random_state=rs, n_jobs=-1)
gradientboost_train_model = GradientBoostingClassifier(**gsearch.best_params_, random_state=rs)

randomforest_train_model.fit(train_clf_data, train_clf_target)
rf_pred = randomforest_train_model.predict(test_data)
# Only the second probability column is kept; get_clf_eval does not use it.
rf_proba = randomforest_train_model.predict_proba(test_data)[:, 1]
get_clf_eval(test_target, rf_pred, rf_proba)
print(f"Accuracy : {accuracy_score(test_target, rf_pred)}")

gradientboost_train_model.fit(train_clf_data, train_clf_target)
model_predict = gradientboost_train_model.predict(test_data)
model_proba = gradientboost_train_model.predict_proba(test_data)[:, 1]
get_clf_eval(test_target, model_predict, model_proba)
print(f"Accuracy : {accuracy_score(test_target, model_predict)}")