In [28]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import csv
import random
import optuna

In [29]:
# 셀에서 출력 정확도 설정.
%precision %.3f

'%.3f'

In [30]:
file_path = r'.\crawling_data\preprocessed_essay.csv'

# 자소서 리스트화
documents = []
with open(file_path, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        documents.append(row[0])
        
documents_5 = documents[:22]
documents_4 = documents[22:77]
documents_3 = documents[77:175]
documents_2 = documents[175:302]
documents_1 = documents[302:]

documents = random.sample(documents_5, 15) + random.sample(documents_4, 15) + random.sample(documents_3, 15) + random.sample(documents_2, 15) + documents_1

In [31]:
# 전문가 평점 1~5점 각 15개씩 랜덤으로 가져와서 점수 라벨링
# 회귀 예측용 label
labels = []
for i in range(75):
    if i < 15:
        labels.append(5)
    elif i < 30:
        labels.append(4)
    elif i < 45:
        labels.append(3)
    elif i < 60:
        labels.append(2)        
    else:
        labels.append(1)

In [32]:
# 불용어 리스트 가져오기
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

documents_corpus = []
for document in documents:
    documents_corpus.append(preprocess(document))

In [33]:
X_train, X_test, y_train, y_test = train_test_split(documents_corpus, labels, test_size=0.3, random_state=123)

In [34]:
# Tagged Corpus 생성.
# 모델 학습에 필요한 형태로 변환.
my_tagged_corpus = [TaggedDocument(words=my_words, tags=[i]) for i, my_words in enumerate(X_train)]

In [35]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # Randomforest 하이퍼파라미터 튜닝
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)

    # Randomforest 모델 생성
    model = RandomForestRegressor(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features,
                                    random_state=123)
    
    model.fit(X_train_vec, y_train)
    score = root_mean_squared_error(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 19:53:00,466] A new study created in memory with name: no-name-448f8056-06d2-40fc-95ca-9a6c2fb15b98
[I 2024-06-20 19:53:04,338] Trial 7 finished with value: 1.4991869186549283 and parameters: {'vector_size': 280, 'window': 10, 'min_count': 15, 'epochs': 20, 'dm': 0, 'n_estimators': 129, 'max_depth': 16, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 0.16225213040289166}. Best is trial 7 with value: 1.4991869186549283.
[I 2024-06-20 19:53:07,763] Trial 4 finished with value: 1.4601946971655724 and parameters: {'vector_size': 100, 'window': 12, 'min_count': 8, 'epochs': 35, 'dm': 0, 'n_estimators': 395, 'max_depth': 24, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 0.9812317850257065}. Best is trial 4 with value: 1.4601946971655724.
[I 2024-06-20 19:53:10,789] Trial 3 finished with value: 1.639825755342845 and parameters: {'vector_size': 160, 'window': 10, 'min_count': 4, 'epochs': 25, 'dm': 1, 'n_estimators': 197, 'max_depth': 21, 'min_samp

Best hyperparameters:  {'vector_size': 230, 'window': 4, 'min_count': 3, 'epochs': 50, 'dm': 1, 'n_estimators': 493, 'max_depth': 11, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.3985273012149684}


In [36]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# RandomForest 모델 생성
rg = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                    max_depth=best_params['max_depth'],
                                    min_samples_split=best_params['min_samples_split'],
                                    min_samples_leaf=best_params['min_samples_leaf'],
                                    max_features=best_params['max_features'],
                                    random_state=123)

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 분류 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# RandomForestRegressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# RandomForestRegressor
0.19237285601967458
1.3725721963387005


In [37]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # SVM 하이퍼파라미터 튜닝
    C = trial.suggest_float('C', 0.01, 100.0, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    gamma = trial.suggest_float('gamma', 1e-4, 1.0, log=True) if kernel in ['rbf', 'poly'] else 'scale'

    # SVM 모델 생성
    model = SVR(
        C=C,
        kernel=kernel,
        degree=degree,
        gamma=gamma
    )

    model.fit(X_train_vec, y_train)
    score = root_mean_squared_error(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 20:00:41,510] A new study created in memory with name: no-name-1429893f-8536-4ff9-a9a0-5535ff48d1f0
[I 2024-06-20 20:00:46,856] Trial 3 finished with value: 1.6105410440644363 and parameters: {'vector_size': 90, 'window': 14, 'min_count': 12, 'epochs': 30, 'dm': 0, 'C': 0.7768525393196164, 'kernel': 'rbf', 'gamma': 0.0011108134743298755}. Best is trial 3 with value: 1.6105410440644363.
[I 2024-06-20 20:00:47,820] Trial 6 finished with value: 16.888449156541963 and parameters: {'vector_size': 220, 'window': 12, 'min_count': 7, 'epochs': 20, 'dm': 1, 'C': 47.489887661334215, 'kernel': 'sigmoid'}. Best is trial 3 with value: 1.6105410440644363.
[I 2024-06-20 20:00:48,001] Trial 1 finished with value: 1.5201909633261492 and parameters: {'vector_size': 300, 'window': 13, 'min_count': 11, 'epochs': 35, 'dm': 0, 'C': 2.1414434561136337, 'kernel': 'sigmoid'}. Best is trial 1 with value: 1.5201909633261492.
[I 2024-06-20 20:00:49,442] Trial 7 finished with value: 1.607401129418646

Best hyperparameters:  {'vector_size': 140, 'window': 6, 'min_count': 2, 'epochs': 20, 'dm': 0, 'C': 11.136452381332822, 'kernel': 'linear'}


In [39]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# SVM 분류 모델 생성
rg = SVR(C=best_params['C'], kernel=best_params['kernel'])#degree=best_params['degree'], gamma=best_params['gamma'], 

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# SVM Regressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# SVM Regressor
0.2696959677553564
1.3052136081614567


In [40]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # XGBOOST 하이퍼파라미터 튜닝
    max_depth = trial.suggest_int('max_depth', 2, 30)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    gamma = trial.suggest_float('gamma', 0, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    
    # XGboost 모델 생성
    model = XGBRegressor(max_depth=max_depth,
                        n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        random_state=123)
    
    model.fit(X_train_vec, y_train)
    score = root_mean_squared_error(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 20:02:33,310] A new study created in memory with name: no-name-e71da5e1-e06a-4717-868b-4c1305651911
[I 2024-06-20 20:02:39,185] Trial 2 finished with value: 1.4464724497767174 and parameters: {'vector_size': 70, 'window': 5, 'min_count': 12, 'epochs': 25, 'dm': 0, 'max_depth': 22, 'n_estimators': 228, 'learning_rate': 0.19395170596077085, 'gamma': 0.3551091405347586, 'min_child_weight': 7}. Best is trial 2 with value: 1.4464724497767174.
[I 2024-06-20 20:02:40,129] Trial 3 finished with value: 1.5157060758084335 and parameters: {'vector_size': 110, 'window': 8, 'min_count': 16, 'epochs': 20, 'dm': 1, 'max_depth': 26, 'n_estimators': 427, 'learning_rate': 0.0957674581931816, 'gamma': 0.4430979495656976, 'min_child_weight': 3}. Best is trial 2 with value: 1.4464724497767174.
[I 2024-06-20 20:02:41,011] Trial 5 finished with value: 1.4773480386640858 and parameters: {'vector_size': 60, 'window': 11, 'min_count': 17, 'epochs': 35, 'dm': 0, 'max_depth': 20, 'n_estimators': 120

Best hyperparameters:  {'vector_size': 170, 'window': 11, 'min_count': 3, 'epochs': 20, 'dm': 0, 'max_depth': 22, 'n_estimators': 284, 'learning_rate': 0.18902644413287623, 'gamma': 0.45472731659995647, 'min_child_weight': 16}


In [41]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# XGBoost 모델 생성
rg = XGBRegressor(
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    gamma=best_params['gamma'],
    min_child_weight=best_params['min_child_weight'],
    random_state=123
)

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# XGBoost Regressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# XGBoost Regressor
0.3201231360435486
1.2593454902044487


In [42]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # LightGBM 하이퍼파라미터 튜닝
    max_depth = trial.suggest_int('max_depth', 2, 30)
    num_leaves = trial.suggest_int('num_leaves', 10, 200)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 50)

    # LightGBM 모델 생성
    model = LGBMRegressor(
        max_depth=max_depth,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_samples=min_child_samples,
        random_state=123
    )
    
    model.fit(X_train_vec, y_train)
    score = root_mean_squared_error(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 20:05:12,041] A new study created in memory with name: no-name-2e7b6c93-063c-4869-84b0-101256484bb5
[I 2024-06-20 20:05:15,906] Trial 3 finished with value: 1.651043748427961 and parameters: {'vector_size': 110, 'window': 10, 'min_count': 20, 'epochs': 25, 'dm': 0, 'max_depth': 12, 'num_leaves': 29, 'learning_rate': 0.2516582714241259, 'n_estimators': 57, 'min_child_samples': 35}. Best is trial 3 with value: 1.651043748427961.
[I 2024-06-20 20:05:18,070] Trial 2 finished with value: 1.676988824712578 and parameters: {'vector_size': 210, 'window': 7, 'min_count': 6, 'epochs': 30, 'dm': 0, 'max_depth': 14, 'num_leaves': 178, 'learning_rate': 0.12365623288462503, 'n_estimators': 465, 'min_child_samples': 4}. Best is trial 3 with value: 1.651043748427961.
[I 2024-06-20 20:05:19,882] Trial 5 finished with value: 1.651043748427961 and parameters: {'vector_size': 140, 'window': 15, 'min_count': 16, 'epochs': 35, 'dm': 1, 'max_depth': 17, 'num_leaves': 144, 'learning_rate': 0.295

Best hyperparameters:  {'vector_size': 250, 'window': 12, 'min_count': 12, 'epochs': 40, 'dm': 0, 'max_depth': 25, 'num_leaves': 119, 'learning_rate': 0.06130154809425188, 'n_estimators': 163, 'min_child_samples': 22}


In [43]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# LightGBM 모델 생성
rg = LGBMRegressor(max_depth=best_params['max_depth'], num_leaves=best_params['num_leaves'], 
                    learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'], 
                    min_child_samples=best_params['min_child_samples'], random_state=123)


# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# LightGBM Regressor')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# LightGBM Regressor
0.0020929230804884735
1.5257198449305462


In [44]:
# Optuna 목적 함수
def objective(trial):
    # Doc2Vec 하이퍼파라미터 튜닝
    vector_size = trial.suggest_int('vector_size', 50, 300, step=10)
    window = trial.suggest_int('window', 3, 15)
    min_count = trial.suggest_int('min_count', 1, 20)
    epochs = trial.suggest_int('epochs', 20, 50, step=5)
    dm = trial.suggest_int('dm', 0, 1)

    # Doc2Vec 모델 학습
    model = Doc2Vec(my_tagged_corpus, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, dm=dm, seed=123)
    X_train_vec = [model.infer_vector(doc) for doc in X_train]
    X_test_vec = [model.infer_vector(doc) for doc in X_test]

    # Linear Regression 하이퍼파라미터 튜닝
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    
    # Linear Regression 모델 생성
    model = LinearRegression(fit_intercept=fit_intercept)
    
    model.fit(X_train_vec, y_train)
    score = root_mean_squared_error(y_test, model.predict(X_test_vec))
    
    return score

# Optuna 최적화 수행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, n_jobs=-1)

# 최적 하이퍼파라미터 출력
print('Best hyperparameters: ', study.best_params)

[I 2024-06-20 20:07:26,584] A new study created in memory with name: no-name-4fdf4a77-a2c7-487f-a343-f18fed54397c
[I 2024-06-20 20:07:32,526] Trial 0 finished with value: 1.4218033824424912 and parameters: {'vector_size': 190, 'window': 15, 'min_count': 2, 'epochs': 25, 'dm': 0, 'fit_intercept': True}. Best is trial 0 with value: 1.4218033824424912.
[I 2024-06-20 20:07:33,977] Trial 6 finished with value: 1.352430050387059 and parameters: {'vector_size': 110, 'window': 10, 'min_count': 1, 'epochs': 35, 'dm': 0, 'fit_intercept': True}. Best is trial 6 with value: 1.352430050387059.
[I 2024-06-20 20:07:34,054] Trial 7 finished with value: 1.6612233715665214 and parameters: {'vector_size': 240, 'window': 13, 'min_count': 2, 'epochs': 35, 'dm': 0, 'fit_intercept': False}. Best is trial 6 with value: 1.352430050387059.
[I 2024-06-20 20:07:34,427] Trial 4 finished with value: 1.4289444074519566 and parameters: {'vector_size': 170, 'window': 8, 'min_count': 13, 'epochs': 45, 'dm': 0, 'fit_int

Best hyperparameters:  {'vector_size': 50, 'window': 10, 'min_count': 1, 'epochs': 20, 'dm': 0, 'fit_intercept': True}


In [46]:
best_params = study.best_params

# Doc2Vec 모델 생성
model = Doc2Vec(my_tagged_corpus, vector_size=best_params['vector_size'], 
                window=best_params['window'], min_count=best_params['min_count'],
                epochs=best_params['epochs'], dm=best_params['dm'], seed=123)

# Linear Regression 모델 생성
rg = LinearRegression(fit_intercept=best_params['fit_intercept'])

# Doc2Vec 벡터 생성
X_train_vec = [model.infer_vector(doc) for doc in X_train]
X_test_vec = [model.infer_vector(doc) for doc in X_test]

# 회귀 모델 학습 및 예측
rg.fit(X_train_vec, y_train)
y_pred = rg.predict(X_test_vec)

print('# LinearRegression')
print(r2_score(y_test, y_pred))
print(root_mean_squared_error(y_test, y_pred))

# LinearRegression
0.2928844094276428
1.2843249795526097


In [None]:
# import pickle

# with open('d2v_model_1245.pkl', 'wb') as f:
#     pickle.dump(model, f)
    
# with open('svc_model_1245.pkl', 'wb') as f:
#     pickle.dump(clf, f)