In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import csv
import optuna

In [2]:
# 셀에서 출력 정확도 설정.
%precision %.3f

'%.3f'

In [3]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])

In [4]:
# 전문가 평점 1~5점 각 15개씩 랜덤으로 가져와서 점수 라벨링
# 회귀 예측용 label
labels = []
for i in range(317):
    if i < 22:
        labels.append(5)
    elif i < 77:
        labels.append(4)
    elif i < 175:
        labels.append(3)
    elif i < 302:
        labels.append(2)        
    else:
        labels.append(1)

In [5]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(documents_corpus, labels, test_size=0.3, random_state=123)

In [7]:
# Tagged Corpus 생성.
# 모델 학습에 필요한 형태로 변환.
my_tagged_corpus = [TaggedDocument(words=my_words, tags=[i]) for i, my_words in enumerate(X_train)]

In [8]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # Randomforest 하이퍼파라미터 튜닝
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)

    # Randomforest 모델 생성
    model = RandomForestRegressor(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features,
                                    random_state=123)
    
    model.fit(X_train_vec, y_train)
    score = r2_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 20:53:32,812] A new study created in memory with name: no-name-122365d2-afda-4cce-8a3b-fde2e3181d59
[I 2024-06-20 20:54:06,120] Trial 3 finished with value: -0.06001411322510686 and parameters: {'vector_size': 200, 'window': 9, 'min_count': 13, 'epochs': 45, 'dm': 0, 'n_estimators': 204, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 0.6422171254986015}. Best is trial 3 with value: -0.06001411322510686.
[I 2024-06-20 20:54:07,122] Trial 5 finished with value: 0.02161739284404085 and parameters: {'vector_size': 190, 'window': 5, 'min_count': 18, 'epochs': 50, 'dm': 0, 'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.6952821485269979}. Best is trial 5 with value: 0.02161739284404085.
[I 2024-06-20 20:54:07,878] Trial 4 finished with value: 0.03363951484984262 and parameters: {'vector_size': 230, 'window': 7, 'min_count': 16, 'epochs': 40, 'dm': 0, 'n_estimators': 406, 'max_depth': 20, 'mi

Best hyperparameters:  {'vector_size': 270, 'window': 10, 'min_count': 7, 'epochs': 25, 'dm': 1, 'n_estimators': 295, 'max_depth': 23, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.9740256401146734}


In [9]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# RandomForest 모델 생성
rg = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                    max_depth=best_params['max_depth'],
                                    min_samples_split=best_params['min_samples_split'],
                                    min_samples_leaf=best_params['min_samples_leaf'],
                                    max_features=best_params['max_features'],
                                    random_state=123)

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# RandomForestRegressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# RandomForestRegressor
0.19711461354956705
0.9045541814034423


In [10]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # SVM 하이퍼파라미터 튜닝
    C = trial.suggest_float('C', 0.01, 100.0, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    gamma = trial.suggest_float('gamma', 1e-4, 1.0, log=True) if kernel in ['rbf', 'poly'] else 'scale'

    # SVM 모델 생성
    model = SVR(
        C=C,
        kernel=kernel,
        degree=degree,
        gamma=gamma
    )

    model.fit(X_train_vec, y_train)
    score = r2_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 21:03:22,213] A new study created in memory with name: no-name-7a332152-915b-4cbb-9b7e-9e442560a3f6
[I 2024-06-20 21:03:44,554] Trial 0 finished with value: -5.103637935355051 and parameters: {'vector_size': 100, 'window': 12, 'min_count': 15, 'epochs': 35, 'dm': 0, 'C': 28.711812075532926, 'kernel': 'sigmoid'}. Best is trial 0 with value: -5.103637935355051.
[I 2024-06-20 21:03:48,722] Trial 2 finished with value: 0.012311219192557732 and parameters: {'vector_size': 140, 'window': 4, 'min_count': 9, 'epochs': 40, 'dm': 0, 'C': 0.08455495906388692, 'kernel': 'rbf', 'gamma': 0.042093649383791856}. Best is trial 2 with value: 0.012311219192557732.
[I 2024-06-20 21:03:52,833] Trial 7 finished with value: 0.09373428457675492 and parameters: {'vector_size': 250, 'window': 5, 'min_count': 20, 'epochs': 45, 'dm': 0, 'C': 0.01444630017132393, 'kernel': 'linear'}. Best is trial 7 with value: 0.09373428457675492.
[I 2024-06-20 21:03:55,962] Trial 1 finished with value: -0.078195344

Best hyperparameters:  {'vector_size': 60, 'window': 9, 'min_count': 18, 'epochs': 25, 'dm': 1, 'C': 27.29303555986356, 'kernel': 'rbf', 'gamma': 0.00619652788216451}


In [14]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=60, 
                window=9, min_count=18,
                epochs=25, dm=1, seed=123)

# SVM 분류 모델 생성
rg = SVR(C=27.29303555986356, gamma=0.00619652788216451, kernel='rbf')#degree=best_params['degree'], gamma=best_params['gamma'], 

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# SVM Regressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# SVM Regressor
0.11437047168352532
0.9500224166962041


In [15]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # XGBOOST 하이퍼파라미터 튜닝
    max_depth = trial.suggest_int('max_depth', 2, 30)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    gamma = trial.suggest_float('gamma', 0, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    
    # XGboost 모델 생성
    model = XGBRegressor(max_depth=max_depth,
                        n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        random_state=123)
    
    model.fit(X_train_vec, y_train)
    score = r2_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 21:14:14,091] A new study created in memory with name: no-name-e8e9c440-207c-4368-9db5-bc92e3611803
[I 2024-06-20 21:14:30,967] Trial 4 finished with value: 0.1394156813621521 and parameters: {'vector_size': 60, 'window': 15, 'min_count': 18, 'epochs': 20, 'dm': 0, 'max_depth': 4, 'n_estimators': 481, 'learning_rate': 0.19432061174657045, 'gamma': 0.22143869579166153, 'min_child_weight': 13}. Best is trial 4 with value: 0.1394156813621521.
[I 2024-06-20 21:14:37,121] Trial 5 finished with value: -0.018006324768066406 and parameters: {'vector_size': 90, 'window': 9, 'min_count': 20, 'epochs': 30, 'dm': 0, 'max_depth': 22, 'n_estimators': 388, 'learning_rate': 0.29657094231162556, 'gamma': 0.0064380663155968465, 'min_child_weight': 20}. Best is trial 4 with value: 0.1394156813621521.
[I 2024-06-20 21:14:44,443] Trial 3 finished with value: -0.14780735969543457 and parameters: {'vector_size': 250, 'window': 5, 'min_count': 11, 'epochs': 35, 'dm': 0, 'max_depth': 12, 'n_estim

Best hyperparameters:  {'vector_size': 270, 'window': 14, 'min_count': 8, 'epochs': 40, 'dm': 1, 'max_depth': 19, 'n_estimators': 472, 'learning_rate': 0.0434639160653416, 'gamma': 0.3220174600731115, 'min_child_weight': 13}


In [16]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# XGBoost 모델 생성
rg = XGBRegressor(
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    gamma=best_params['gamma'],
    min_child_weight=best_params['min_child_weight'],
    random_state=123
)

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# XGBoost Regressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# XGBoost Regressor
0.0727090835571289
0.9721109011825732


In [17]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # LightGBM 하이퍼파라미터 튜닝
    max_depth = trial.suggest_int('max_depth', 2, 30)
    num_leaves = trial.suggest_int('num_leaves', 10, 200)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 50)

    # LightGBM 모델 생성
    model = LGBMRegressor(
        max_depth=max_depth,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_samples=min_child_samples,
        random_state=123
    )
    
    model.fit(X_train_vec, y_train)
    score = r2_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 21:25:55,218] A new study created in memory with name: no-name-2f3fd9b0-e7ab-4364-9b37-befae6b9ec70
[I 2024-06-20 21:26:13,424] Trial 6 finished with value: 0.13310602938048421 and parameters: {'vector_size': 60, 'window': 9, 'min_count': 15, 'epochs': 20, 'dm': 0, 'max_depth': 18, 'num_leaves': 146, 'learning_rate': 0.2972200915761797, 'n_estimators': 306, 'min_child_samples': 20}. Best is trial 6 with value: 0.13310602938048421.
[I 2024-06-20 21:26:19,638] Trial 5 finished with value: -0.02478202169841448 and parameters: {'vector_size': 50, 'window': 5, 'min_count': 6, 'epochs': 30, 'dm': 0, 'max_depth': 30, 'num_leaves': 11, 'learning_rate': 0.13834138170502616, 'n_estimators': 364, 'min_child_samples': 22}. Best is trial 6 with value: 0.13310602938048421.
[I 2024-06-20 21:26:22,627] Trial 7 finished with value: 0.1027739709905442 and parameters: {'vector_size': 150, 'window': 3, 'min_count': 1, 'epochs': 30, 'dm': 0, 'max_depth': 20, 'num_leaves': 96, 'learning_rate':

Best hyperparameters:  {'vector_size': 150, 'window': 8, 'min_count': 6, 'epochs': 30, 'dm': 1, 'max_depth': 6, 'num_leaves': 151, 'learning_rate': 0.16507384468141148, 'n_estimators': 179, 'min_child_samples': 35}


In [18]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# LightGBM 모델 생성
rg = LGBMRegressor(max_depth=best_params['max_depth'], num_leaves=best_params['num_leaves'], 
                    learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'], 
                    min_child_samples=best_params['min_child_samples'], random_state=123)


# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# LightGBM Regressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11301
[LightGBM] [Info] Number of data points in the train set: 221, number of used features: 150
[LightGBM] [Info] Start training from score 2.828054
# LightGBM Regressor
0.22561849344028584
0.8883524313442634


In [19]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # Linear Regression 하이퍼파라미터 튜닝
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    
    # Linear Regression 모델 생성
    model = LinearRegression(fit_intercept=fit_intercept)
    
    model.fit(X_train_vec, y_train)
    score = r2_score(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 21:36:55,870] A new study created in memory with name: no-name-703f515b-c5fb-479b-85c4-2618a46c1b88
[I 2024-06-20 21:37:15,696] Trial 1 finished with value: 0.1434904932975769 and parameters: {'vector_size': 110, 'window': 7, 'min_count': 12, 'epochs': 20, 'dm': 0, 'fit_intercept': True}. Best is trial 1 with value: 0.1434904932975769.
[I 2024-06-20 21:37:20,516] Trial 4 finished with value: -0.7598494291305542 and parameters: {'vector_size': 150, 'window': 4, 'min_count': 8, 'epochs': 25, 'dm': 0, 'fit_intercept': False}. Best is trial 1 with value: 0.1434904932975769.
[I 2024-06-20 21:37:22,154] Trial 5 finished with value: -1.246936321258545 and parameters: {'vector_size': 240, 'window': 4, 'min_count': 2, 'epochs': 25, 'dm': 0, 'fit_intercept': False}. Best is trial 1 with value: 0.1434904932975769.
[I 2024-06-20 21:37:22,309] Trial 3 finished with value: -1.2384426593780518 and parameters: {'vector_size': 280, 'window': 4, 'min_count': 6, 'epochs': 25, 'dm': 0, 'fit_

Best hyperparameters:  {'vector_size': 100, 'window': 5, 'min_count': 19, 'epochs': 20, 'dm': 0, 'fit_intercept': True}


In [20]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# Linear Regression 모델 생성
rg = LinearRegression(fit_intercept=best_params['fit_intercept'])

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# LinearRegression')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# LinearRegression
0.09882837533950806
0.9583222552830236


In [None]:
# import pickle

# with open('d2v_model_1245.pkl', 'wb') as f:
#     pickle.dump(model, f)
    
# with open('svc_model_1245.pkl', 'wb') as f:
#     pickle.dump(clf, f)