<a href="https://colab.research.google.com/github/darkmochalover/DS_TermProject_AL2/blob/main/wandb_hp_baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dictionary

In [1]:
# Data Path
raw_data_path = "/content/drive/MyDrive/DS_TermProject/data/combined_mbti_df.csv"

audio_qualities = [
    'danceability_mean',
    'valence_mean',
    'energy_mean',
    'loudness_mean',
    'acousticness_mean',
    'instrumentalness_mean',
    'liveness_mean',
]

# 장조/단조 (Major/Minor)
all_tones = [
    'Cminor_count', 'CMajor_count', 'C#/Dbminor_count', 'C#/DbMajor_count',
    'DMajor_count', 'D#_EbMajor_count', 'Eminor_count', 'EMajor_count',
    'Fminor_count', 'FMajor_count', 'F#/Gbminor_count', 'GMajor_count',
    'G#/Abminor_count', 'G#/AbMajor_count', 'Aminor_count', 'AMajor_count',
    'A#/Bbminor_count', 'BMajor_count', 'Dminor_count', 'D#_Ebminor_count',
    'Gminor_count', 'A#/BbMajor_count', 'F#/GbMajor_count', 'Bminor_count'
]

major_tones = [
    'CMajor_count', 'C#/DbMajor_count',
    'DMajor_count', 'D#_EbMajor_count', 
    'EMajor_count',
    'FMajor_count', 
    'GMajor_count', 'G#/AbMajor_count', 
    'AMajor_count', 'BMajor_count', 'A#/BbMajor_count', 
    'F#/GbMajor_count'
]
minor_tones = [
    'Cminor_count', 'C#/Dbminor_count', 
    'Eminor_count', 
    'Fminor_count', 'F#/Gbminor_count', 
    'G#/Abminor_count',  
    'Aminor_count', 'A#/Bbminor_count', 
    'Dminor_count', 'D#_Ebminor_count',
    'Gminor_count', 
    'Bminor_count'
]


# 열 이름을 바꾸기 위해 리스트로 저장함
renamed_columns =  [
    'danceability',
    'valence',
    'energy',
    'loudness',
    'acousticness',
    'instrumentalness',
    'liveness'
]


Install WandB

In [2]:
!pip install -qU wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE

import itertools

import wandb
import random

Step 1: Import W&B and Login

In [4]:
import wandb
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint, WandbEvalCallback

Data Loading

In [7]:
df = pd.read_csv(raw_data_path)

Feature Reduction

In [8]:
# Subset only measures of centers
X = df.iloc[: , :22] # 처음 22개 열을 선택해서 저장
X = df[audio_qualities] # 오디오 품질과 해당 열이 있는 열을 선택해서 저장


categories = renamed_columns[:]
X.columns = renamed_columns


# 장조/단조의 개수의 합을 계산해서 저장 (C장조, D단조, .. 이렇게 따로 계산되는거 말고, 위에 지정된 list 이용해서 sum값 넣어줌)
X['major_count'] = df[major_tones].sum(axis=1).astype('int64')
X['minor_count'] = df[minor_tones].sum(axis=1).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['major_count'] = df[major_tones].sum(axis=1).astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['minor_count'] = df[minor_tones].sum(axis=1).astype('int64')


## Encoding & Scaling Function

In [9]:
def Encoding(df, encoding_method):
    df = df.copy()

    if(encoding_method == 'LabelEncoder'):
        encoder = LabelEncoder()
        target = encoder.fit_transform(df[['mbti']])
        

    if(encoding_method == 'OneHotEncoder'):
        encoder = OneHotEncoder(sparse=False)
        target = encoder.fit_transform(df[['mbti']])

    return target

def Scaling(scale_method, X_train, X_test):
    if( scale_method == 'No Scale'):
        return X_train, X_test

    elif(scale_method == 'StandardScaler'):
        scaler = StandardScaler()

    elif(scale_method == 'MinMaxScaler'):
        scaler = MinMaxScaler()

    elif(scale_method == 'RobustScaler'):
        scaler = RobustScaler()


    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)

    return X_train_scaled, X_test_scaled


바꿔가면서 돌리기

In [10]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [11]:
import shutil
# os.rmdir('/content/wandb')
shutil.rmtree('/content/wandb', ignore_errors=True)

In [12]:
base_config = {
    'encoder' : 'LabelEncoder',
    'scaler' : 'StandardScaler',
    'test_size' : 0.2,
    "do_smote" : 'True'
}

In [13]:
encoder_list = ['LabelEncoder', 'OneHotEncoder']
scaler_list = ['StandardScaler', 'MinMaxScaler', 'RobustScaler', 'No Scale']
test_size_list = [0.3, 0.2, 0.1]
do_smote = ['True', 'False']
model_list = ['DecisionTreeClassifier']
SearchMethods = ['Base', 'Grid', 'Random']

In [14]:
# grid search(hyperparameter)
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# random search(hyperparameter)
param_dist = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold

combinations = list(itertools.product(encoder_list, scaler_list, do_smote, model_list, SearchMethods))



In [None]:
for encoder, scaler, do_smote, model, search_method in combinations:
  # wandb.init(project="MBTI_playlist", config=base_config)

  config = {
            'encoder' : encoder,
            'scaler' : scaler,
            'test_size' : 0.3,
            "do_smote" : do_smote,
            'model' : model,
            'search_method' : search_method
  }

  wandb.init(project='MBTI_playlist_test006', entity='ds_2023_spring', config=config)

  config = wandb.config
  wandb.config.update(config)
  print(config)


  y = Encoding(df = df, encoding_method=encoder)
  # print(y[:5])

  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 5, test_size = 0.2)

  X_train, X_test = Scaling(scale_method=scaler, X_train=X_train, X_test=X_test)

  if(do_smote == 'True'): # If true,
      smote = SMOTE(sampling_strategy='auto', random_state=0)
      X_train, y_train = smote.fit_resample(X_train,y_train)

  # 모델 평가 지표
  wandb.define_metric('accuracy', summary='max')
  wandb.define_metric('F1 Score', summary='max')
  wandb.define_metric('Recall', summary='max')
  wandb.define_metric('Precision', summary='max')

  if(model == 'DecisionTreeClassifier'):
    # (Base) Decision Tree Model
    base_model = DecisionTreeClassifier(random_state = 42)

    if(search_method == 'Grid'):
      # Grid Search
      grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=5)
      grid_search.fit(X_train, y_train)

      best_params_grid = grid_search.best_params_
      dt_model_grid = DecisionTreeClassifier(random_state=42, **best_params_grid)
      dt_model_grid.fit(X_train, y_train)
      dt_predictions_grid = dt_model_grid.predict(X_test)
      
      dt_accuracy_grid = accuracy_score(y_test, dt_predictions_grid)
      if(encoder == 'OneHotEncoder'):
        dt_confusion_matrix_grid = confusion_matrix(y_test.argmax(axis=1), dt_predictions_grid.argmax(axis=1))
      else:
        dt_confusion_matrix_grid = confusion_matrix(y_test, dt_predictions_grid)

      dt_f1_score_grid = f1_score(y_test, dt_predictions_grid, average='macro')
      dt_recall_grid = recall_score(y_test, dt_predictions_grid, average='macro')
      dt_precision_grid = precision_score(y_test, dt_predictions_grid, average='macro')

      print("Decision Tree Accuracy with Grid Search:", dt_accuracy_grid)
      print("Decision Tree Confusion Matrix with Grid Search:\n", dt_confusion_matrix_grid)
      print("Decision Tree F1 Score with Grid Search:", dt_f1_score_grid)
      print("Decision Tree Recall with Grid Search:", dt_recall_grid)
      print("Decision Tree Precision with Grid Search:", dt_precision_grid)

      wandb.log({'accuracy': dt_accuracy_grid})
      wandb.log({'Confusion Matrix': dt_confusion_matrix_grid.tolist()})
      wandb.log({'F1 Score': dt_f1_score_grid})
      wandb.log({'Recall': dt_recall_grid})
      wandb.log({'Precision': dt_precision_grid})
        

      # K-Fold 
      kf = KFold(n_splits=5, shuffle=True, random_state=42)

      dt_model_cv = DecisionTreeClassifier(random_state=42, **best_params_grid)
      cv_scores = cross_val_score(dt_model_cv, X_train, y_train, cv=kf, scoring='accuracy')
      cv_accuracy = np.mean(cv_scores)
      print("acc with K-Fold Cross Validation:", cv_accuracy)
      wandb.log({'Accuracy with K-Fold': cv_accuracy})  

      


    if(search_method == 'Random Search'):
      # Random Search
      random_search = RandomizedSearchCV(estimator=base_model, param_distributions=param_dist, cv=5)
      random_search.fit(X_train, y_train)

      best_params_random = random_search.best_params_
      dt_model_random = DecisionTreeClassifier(random_state=42, **best_params_random)
      dt_model_random.fit(X_train, y_train)
      dt_predictions_random = dt_model_random.predict(X_test)
      dt_accuracy_random = accuracy_score(y_test, dt_predictions_random)

      if(encoder == 'OneHotEncoder'):
            dt_confusion_matrix_random = confusion_matrix(y_test.argmax(axis=1), dt_predictions_random.argmax(axis=1))
      else:
            dt_confusion_matrix_random = confusion_matrix(y_test, dt_predictions_random)

      dt_f1_score_random = f1_score(y_test, dt_predictions_random, average='macro')
      dt_recall_random = recall_score(y_test, dt_predictions_random, average='macro')
      dt_precision_random = precision_score(y_test, dt_predictions_random, average='macro')
      print("Decision Tree Accuracy with Random Search:", dt_accuracy_random)
      print("Decision Tree Confusion Matrix with Random Search:\n", dt_confusion_matrix_random)
      print("Decision Tree F1 Score with Random Search:", dt_f1_score_random)
      print("Decision Tree Recall with Random Search:", dt_recall_random)
      print("Decision Tree Precision with Random Search:", dt_precision_random)

      wandb.log({'accuracy': dt_accuracy_random})
      wandb.log({'Confusion Matrix': dt_confusion_matrix_random.tolist()})
      wandb.log({'F1 Score': dt_f1_score_random})
      wandb.log({'Recall': dt_recall_random})
      wandb.log({'Precision': dt_precision_random})

    
      # K-Fold 
      kf = KFold(n_splits=5, shuffle=True, random_state=42)

      dt_model_cv = DecisionTreeClassifier(random_state=42, **best_params_random)
      cv_scores = cross_val_score(dt_model_cv, X_train, y_train, cv=kf, scoring='accuracy')
      cv_accuracy = np.mean(cv_scores)
      print("acc with K-Fold Cross Validation:", cv_accuracy)
      wandb.log({'Accuracy with K-Fold': cv_accuracy})  


{'encoder': 'LabelEncoder', 'scaler': 'StandardScaler', 'test_size': 0.3, 'do_smote': 'True', 'model': 'DecisionTreeClassifier', 'search_method': 'Base'}


  y = column_or_1d(y, warn=True)


{'encoder': 'LabelEncoder', 'scaler': 'StandardScaler', 'test_size': 0.3, 'do_smote': 'True', 'model': 'DecisionTreeClassifier', 'search_method': 'Grid'}


  y = column_or_1d(y, warn=True)


Decision Tree Accuracy with Grid Search: 0.14687882496940025
Decision Tree Confusion Matrix with Grid Search:
 [[ 6  2  1  1  4  3  2  0  0  0  3  5  3  6  8  3]
 [ 7  6  3  3  2  6  3  4  2  1  4  2  2  3  2  2]
 [ 4  3  7  3  2  5  5  5  4  1  4  3  2  3  1  4]
 [ 2  3  4 15  1  3  6  7  3  2  2  3  1  2  2  6]
 [ 2  1  0  1  0  2  1  2  0  0  0  3  2  1  2  1]
 [ 4  4  3  0  4 11  0  1  2  1  4  2  2  2  2  6]
 [ 2  3  2  2  1  1  1  4  3  0  3  0  2  2  0  1]
 [ 4  7  6  9  5  7  3 10  1  3  3  2  1  0  1  6]
 [ 4  2  1  1  2  2  0  1 17 13  7  5  3  1  4  2]
 [ 6  1  0  0  5  3  1  0  8  7  2  8  5  3  2  7]
 [ 5  4  7  2  1  1  1  3  7  1 10  2  0  2  5  8]
 [ 5  2  4  3  2  4  0  2  5  4  7 12  3  4  4  4]
 [ 6  2  0  1  3  1  3  1  4  8  4  3  4  1  7  0]
 [ 6  4  6  2  1  3  3  4  4  3  2  3  4  5  1  2]
 [ 2  1  2  2  3  0  0  2  4  1  3  7  5  2  3  1]
 [ 4  0  9  5  1  1  4  4  3  0  4  1  1  5  5  6]]
Decision Tree F1 Score with Grid Search: 0.13520835522014063
Decision Tr

0,1
Accuracy with K-Fold,▁
F1 Score,▁
Precision,▁
Recall,▁
accuracy,▁

0,1
Accuracy with K-Fold,0.24897


{'encoder': 'LabelEncoder', 'scaler': 'StandardScaler', 'test_size': 0.3, 'do_smote': 'True', 'model': 'DecisionTreeClassifier', 'search_method': 'Random'}


  y = column_or_1d(y, warn=True)


{'encoder': 'LabelEncoder', 'scaler': 'StandardScaler', 'test_size': 0.3, 'do_smote': 'False', 'model': 'DecisionTreeClassifier', 'search_method': 'Base'}


  y = column_or_1d(y, warn=True)


{'encoder': 'LabelEncoder', 'scaler': 'StandardScaler', 'test_size': 0.3, 'do_smote': 'False', 'model': 'DecisionTreeClassifier', 'search_method': 'Grid'}


  y = column_or_1d(y, warn=True)


In [None]:
wandb.finish()