In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import joblib

import sys
# Add the parent directory (relative to the current working directory) to the
# module search path. Presumably this is what makes the `ml_editor` package
# importable from this notebook, so it must run before those imports.
sys.path.append("..")
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation and
# chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

from ml_editor.data_processing import (
    format_raw_df, get_split_by_author, 
    add_text_features_to_df, 
    get_vectorized_series, 
    get_feature_vector_and_label
)
from ml_editor.model_evaluation import get_top_k

# Read the raw CSV export and run it through the project's formatter.
# The formatter receives a copy of the freshly read frame.
data_path = Path('../data/writers.csv')
df = format_raw_df(pd.read_csv(data_path).copy())

In [2]:
# Keep only the rows flagged as questions and add the text features to them.
df = add_text_features_to_df(df[df["is_question"]].copy())

# Hold out 20% for testing with a fixed seed.
# NOTE(review): get_split_by_author presumably keeps each author's posts in a
# single split -- confirm in ml_editor.data_processing.
train_df, test_df = get_split_by_author(
    df,
    test_size=0.2,
    random_state=40,
)

In [3]:
# Locations of the previously saved classifier and its text vectorizer.
model_path = Path("../models/model_1.pkl")
vectorizer_path = Path("../models/vectorizer_1.pkl")

# joblib artifacts are pickles: only load files from a trusted source.
clf = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)

In [4]:
# Hand-picked feature columns passed to get_feature_vector_and_label.
# NOTE(review): presumably combined with the "vectors" column inside that
# helper -- confirm in ml_editor.data_processing.
features = [
    "action_verb_full",
    "question_mark_full",
    "text_len",
    "language_question",
]

# Training split: vectorize the text with the loaded vectorizer, then build
# the feature matrix and labels.
train_df["vectors"] = get_vectorized_series(train_df["full_text"].copy(), vectorizer)
X_train, y_train = get_feature_vector_and_label(train_df, features)

# Test split: same steps, same vectorizer.
test_df["vectors"] = get_vectorized_series(test_df["full_text"].copy(), vectorizer)
X_test, y_test = get_feature_vector_and_label(test_df, features)

In [5]:
# Score the held-out questions and store the predictions next to the original
# rows (on a copy, so test_df itself is left untouched).
y_predicted_proba = clf.predict_proba(X_test)
test_analysis_df = test_df.copy().assign(
    # Column 1 of predict_proba -- presumably the positive class; confirm
    # against clf.classes_.
    predicted_proba=y_predicted_proba[:, 1],
    true_label=y_test,
)

# Columns shown when inspecting individual examples in the cells below.
to_display = [
    "predicted_proba", "true_label",
    "Title", "body_text",
    "text_len", "action_verb_full", "question_mark_full", "language_question",
]

# NOTE(review): not passed to get_top_k in this cell -- confirm whether it is
# used further down or is meant to mirror get_top_k's own default.
threshold = 0.5

# Allow long text cells to be rendered (up to 500 characters per cell).
pd.set_option("display.max_colwidth", 500)

# Pick k=2 examples for each of the five groups returned by get_top_k.
top_pos, top_neg, worst_pos, worst_neg, unsure = get_top_k(
    test_analysis_df, "predicted_proba", "true_label", k=2
)

In [6]:
# In the output below: true_label=True rows with high predicted_proba.
top_pos.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
28875,0.73,True,Is it possible to work with characters that you dislike?,"I'm currently developing a minor character that will appear as the ""shadow"" (Hero's Journey slang for an antagonist that has the potential to destroy the hero) of two other characters. In the eyes of pretty much every other character in the story, he's insensitive and, quite frankly, bordering on stupid. \nHowever, I find that I actually like him(*). He's not someone I would be likely to spend much time with, but it's fun working with him, because he's uncomplicated and doesn't mean to hurt ...",3001,True,True,False
29669,0.73,True,Linguistic simplicity in novels: how do different world markets/languages view it?,"This site has a lot of question on how to write properly and, naturally, many of the answers are on how to do it well in English. Although, obviously, if you can write well in English, you can write well in other languages since it's a simple matter of applying the general idea to other languages.\nEvery now and then, though, some questions-answers are very specific to anglophones. One of these is the idea of linguistic simplicity.\nLet me clarify from the start: I'm Portuguese and, obviousl...",3332,True,True,False


In [7]:
# In the output below: true_label=False rows with low predicted_proba.
top_neg.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7488,0.11,False,Releases needed for picture books?,Do you need location releases for national parks and model releases for Pets to use in picture books?\n,137,False,True,False
29504,0.11,False,Is this a valid haiku?,A friend showed me a website for an artist. There is a page filled with haikus he's written. Can someone verify the validity of the following haiku:\n\ni don’t really\nwant to do\nthat\n\n,205,False,True,False


In [8]:
# In the output below: true_label=True rows that received a low predicted_proba.
worst_pos.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14157,0.12,True,Do you bold punctuation directly after bold text?,"Do you bold punctuation directly after bold, linked or italic text? \n",119,False,True,False
40406,0.15,True,Should the name of a mythological creature be capitalized?,"The name of the mythological creature in my novel is ""manananggal"". When I'm referring to it, should I capitalize what it is? \n",186,True,True,True


In [9]:
# In the output below: true_label=False rows that received a high predicted_proba.
worst_neg.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7878,0.77,False,"When quoting a person's informal speech, how much liberty do you have to make changes to what they say?","Even during a formal interview for a news article, people speak informally. They say ""uhm"", they cut off sentences half-way through, they interject phrases like ""you know?"", and they make innocent grammatical mistakes.\nAs somebody who wants to fairly and accurately report the discussion that takes place in an interview, what guidelines should I use in making changes to what a person says?\nWhile the simplest solution is to write exactly what they say and [sic] any errors they make, that can...",694,True,True,False
25032,0.76,False,Splicing/Mixing Scenes,"Remember the end of the Godfather, where they spliced all the different assassinations with the christening? Or any film with good cutting, where they swap between different characters and what they're doing? That can work great with books, too. But I want some advice on how to do it properly.\nI have several characters going through the same event, but from different places and perspectives. The question is the best way of writing the experience and presenting it.\n\nShould I write out each...",1675,True,True,False
