In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib
import sys
# Add the parent directory to the import path; presumably this is what lets
# the `ml_editor` imports below resolve when the notebook runs from its own folder.
sys.path.append("..")
# Seed NumPy's global random generator so stochastic steps that rely on it
# repeat across Restart & Run All.
np.random.seed(35)
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas/scikit-learn ones that may point at real problems.
warnings.filterwarnings('ignore')

from ml_editor.data_processing import (
    format_raw_df,
    add_text_features_to_df,
    get_feature_vector_and_label,
    get_split_by_author,
    get_vectorized_inputs_and_label,
    get_vectorized_series,
    train_vectorizer,
)


# Read the raw CSV and pass a copy through the project's formatting helper.
data_path = Path("..", "data", "writers.csv")
df = format_raw_df(pd.read_csv(data_path).copy())

# Keep only the rows flagged by the boolean `is_question` column.
df = df[df["is_question"]].copy()

In [18]:
# Add the text features on a copy of the frame, then split it into train and
# test sets with the project's get_split_by_author helper.
df = add_text_features_to_df(df.copy())
train_df, test_df = get_split_by_author(df, test_size=0.2, random_state=40)

# Build the vectorizer from the training split only and apply that same
# vectorizer to the `full_text` column of both splits.
vectorizer = train_vectorizer(train_df)
for split_df in (train_df, test_df):
    split_df["vectors"] = get_vectorized_series(split_df["full_text"].copy(), vectorizer)

In [20]:
# Feature columns handed to get_feature_vector_and_label for both splits.
features = ["action_verb_full", "question_mark_full", "text_len", "language_question"]

# Same helper, same feature list: training split first, then the test split.
(X_train, y_train), (X_test, y_test) = (
    get_feature_vector_and_label(split_df, features)
    for split_df in (train_df, test_df)
)

In [22]:
# oob_score=True makes the fitted forest expose out-of-bag estimates, which a
# later cell reads through clf.oob_decision_function_.
clf = RandomForestClassifier(
    oob_score=True,
    class_weight="balanced",
    n_estimators=100,
)
clf.fit(X_train, y_train)

# Hard labels and per-class probabilities for the test split.
y_predicted, y_predicted_proba = clf.predict(X_test), clf.predict_proba(X_test)

array([ True,  True, False, ...,  True,  True,  True])

In [31]:
def get_metrics(y_test, y_predicted):
    """Score predicted labels against the ground truth.

    Precision, recall and F1 are requested with ``average='weighted'``, so each
    one is a support-weighted average of the per-class scores rather than the
    score of a single positive class.

    Args:
        y_test: ground-truth labels.
        y_predicted: predicted labels, aligned with ``y_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    weighted_kwargs = {"pos_label": None, "average": "weighted"}

    # Per class, before weighting:
    #   precision = true positives / (true positives + false positives)
    #   recall    = true positives / (true positives + false negatives)
    #   f1        = harmonic mean of precision and recall
    precision, recall, f1 = (
        score_fn(y_test, y_predicted, **weighted_kwargs)
        for score_fn in (precision_score, recall_score, f1_score)
    )

    # (true positives + true negatives) / all samples
    accuracy = accuracy_score(y_test, y_predicted)
    return (accuracy, precision, recall, f1)

In [32]:
# Out-of-bag estimate for the training set, available because the forest was
# built with oob_score=True.
# argmax only yields a column index into the decision function; look it up in
# clf.classes_ so the predictions carry the same label values as y_train
# instead of relying on the classes happening to equal 0/1.
oob_class_idx = np.argmax(clf.oob_decision_function_, axis=1)
y_train_pred = clf.classes_[oob_class_idx]

accuracy, precision, recall, f1 = get_metrics(y_train, y_train_pred)
print("훈련 정확도 = %.3f, 정밀도 = %.3f, 재현율 = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

훈련 정확도 = 0.588, 정밀도 = 0.585, 재현율 = 0.588, f1 = 0.583


In [34]:
# Score the test-split predictions and print them in the same layout as the
# training report.
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
report_template = "검증 정확도 = %.3f, 정밀도 = %.3f, 재현율 = %.3f, f1 = %.3f"
print(report_template % (accuracy, precision, recall, f1))

검증 정확도 = 0.610, 정밀도 = 0.611, 재현율 = 0.610, f1 = 0.606


In [36]:
model_path = Path("../models/model_1.pkl")
vectorizer_path = Path("../models/vectorizer_1.pkl")

# joblib.dump does not create missing directories, so make sure the target
# folder exists first; a fresh checkout may not have ../models yet.
for artifact_path in (model_path, vectorizer_path):
    artifact_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier and the vectorizer it was trained with.
joblib.dump(clf, model_path)
joblib.dump(vectorizer, vectorizer_path)

['..\\models\\vectorizer_1.pkl']

In [37]:
# NOTE(review): this import is kept in this cell, after the artifacts are
# written; presumably model_v1 reads the saved pickles, so confirm before
# hoisting it to the top import cell.
from ml_editor.model_v1 import get_model_probabilities_for_input_texts

# The inference function takes a sequence of questions, so a single question
# is wrapped in a one-element list.
test_q = ["bad question"]
probs = get_model_probabilities_for_input_texts(test_q)

# First (only) question, index 1 = positive class.
positive_proba = probs[0][1]
print("이 질문이 양성 샘플일 확률: %s" % (positive_proba))

이 질문이 양성 샘플일 확률: 0.28
