# Generate Docs

## Imports and definitions

In [1]:
import statistics
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from utils import create_folder, cleanup_folder, human_to_kebab_case
from comment_classifier.utils import preprocess_comment
from comment_classifier.sentence_scorer import SentenceScorer
from doc_page import TagDocPage, PostQuestionDocPage
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabriel.dutradias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gabriel.dutradias/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Root folder that receives the generated .md pages (one page per tag plus one subfolder per tag).
output_dir = 'output'
# Upper bound on how many top-ranked questions are documented for each tag.
n_questions_per_tag = 10
# Tags filtered out of every question page's tag list.
redundant_tags = ['android']

## Create DataFrames

In [3]:
# Posts: questions (one row kept per question id), their answers and the answers' comments.
posts_questions_df = (
    pd.read_csv('data/posts_questions.csv')
    .drop_duplicates(subset='id')
)
posts_answers_df = pd.read_csv('data/posts_answers.csv')
comments_df = pd.read_csv('data/comments.csv')

# Tags: the full tag table, the subset selected for documentation, and their wiki posts.
tags_df = pd.read_csv('data/tags.csv')
selected_tags_df = pd.read_csv('data/selected_tags.csv')
posts_tag_wiki_df = pd.read_csv('data/posts_tag_wiki.csv')
posts_tag_wiki_excerpt_df = pd.read_csv('data/posts_tag_wiki_excerpt.csv')

# Users: needed later to attach reputation to answers.
users_df = pd.read_csv('data/users.csv')

## Development

In [4]:
# Smoke-test preprocess_comment on a raw, multi-sentence comment.
# Per the recorded output it returns a list with one normalized string per sentence.
preprocess_comment('''Still debating whether to add a default image or not. Really wish Google would specify this in their AMP guides, since the Webmaster Tools will continue treating these missing images as "issues" of type "Info: Invalid structured data element" forever, clogging up my review of actual errors in that space. Frustrating for old posts (that I will never go back and fix) and new posts without any meaningful image?''')

['still debat whether add default imag',
 'realli wish googl would specifi amp guid sinc webmast tool continu treat miss imag issu type info invalid structur data element forev clog review actual error space',
 'frustrat old post never go back fix new post without meaning imag ?']

In [5]:
# Check how text containing a <CODE> placeholder is handled.
# Per the recorded output the <CODE> token is kept verbatim in the result.
preprocess_comment("seem perform action belong menu item <CODE> caus menu item appear select?")

['seem perform action belong menu item <CODE> cau menu item appear select ?']

In [6]:
# Peek at five random comment rows; no random_state is passed, so the rows differ between runs.
comments_df.sample(5)

Unnamed: 0,id,text,creation_date,post_id,user_id,user_display_name,score
717,55490286,"""Dont worry about sending app to background in...",2015-11-23T02:44:38.827Z,1982992,2553372.0,,0
11498,72340349,Better to use activity flags than put things i...,2017-03-05T06:17:08.257Z,42111344,4508063.0,,2
4874,26049298,This is excellent !! I wanted to launch as man...,2013-07-24T21:06:40.820Z,11038348,757243.0,,0
5382,42463224,"Manuel, We have winning answer!",2014-11-16T22:06:53.480Z,21500570,1180310.0,,0
4625,120726616,For Accessibility is there a way to announce t...,2021-07-08T22:13:42.663Z,31254146,5442882.0,,0


### Evaluation functions

In [7]:
# Pre-compute the normalized features consumed by the evaluation functions.

# Attach the author's reputation to every answer; the users_df columns are
# prefixed so they arrive as `user_reputation` / `user_id`.
user_reputation_df = users_df[['reputation', 'id']].add_prefix('user_')
posts_answers_with_user_df = posts_answers_df.merge(
    user_reputation_df,
    left_on='owner_user_id',
    right_on='user_id',
)

# Min-max scale the ranking features. fit_transform refits the scaler on each
# call, so one instance can be shared by both frames.
scaler = MinMaxScaler()

question_scaled = scaler.fit_transform(posts_questions_df[['view_count', 'score']])
posts_questions_df[['scaled_view_count', 'scaled_score']] = question_scaled

answer_scaled = scaler.fit_transform(posts_answers_with_user_df[['score', 'user_reputation']])
posts_answers_with_user_df[['scaled_score', 'scaled_user_reputation']] = answer_scaled

In [8]:
def eval_answer(answer_row):
    """Score an answer as the mean of its scaled score and its author's scaled reputation.

    Args:
        answer_row: row of `posts_answers_with_user_df` exposing `scaled_score`
            and `scaled_user_reputation`.

    Returns:
        The arithmetic mean of the two scaled values.
    """
    return statistics.mean([answer_row.scaled_score, answer_row.scaled_user_reputation])

posts_answers_with_user_df['eval'] = posts_answers_with_user_df.apply(eval_answer, axis=1)

def eval_question(question_row):
    """Score a question from its own scaled metrics and those of its best answer.

    The best answer is the row of `posts_answers_with_user_df` whose `parent_id`
    equals the question id and whose `eval` is highest.

    Args:
        question_row: row of `posts_questions_df` exposing `id`,
            `scaled_view_count` and `scaled_score`.

    Returns:
        The mean of the question's scaled view count and score and the best
        answer's scaled score and scaled user reputation, or NaN when the
        question has no row in `posts_answers_with_user_df`.
    """
    question_answers_df = posts_answers_with_user_df.loc[posts_answers_with_user_df.parent_id == question_row.id]
    if question_answers_df.empty:
        # Previously `.iloc[0]` raised IndexError here and aborted the whole apply.
        # NaN keeps the question in the frame; a descending sort on `eval`
        # places NaN rows last by default, so these questions rank at the bottom.
        return float('nan')
    answer_row = question_answers_df.sort_values(by='eval', ascending=False).iloc[0]
    return statistics.mean([question_row.scaled_view_count, question_row.scaled_score, answer_row.scaled_score, answer_row.scaled_user_reputation])

posts_questions_df['eval'] = posts_questions_df.apply(eval_question, axis=1)

### Prepare files and variables

In [9]:
# Prepare the output folder before any page is written
# (presumably empties or recreates it; confirm in utils.cleanup_folder).
cleanup_folder(output_dir)
# Accumulates one DataFrame of selected questions per tag; the tag loop appends
# to it and the question-docs step concatenates the pieces.
questions_for_docs = []

### Generate docs for tags

Docs for tags are based on the `posts_tag_wiki` table from Stack Overflow. Each page should contain a long description of what the tag represents, as well as pointers to relevant complementary docs.

In [10]:
# Start from an empty accumulator inside this cell so that re-running it does not
# append a second copy of every tag's questions (which would duplicate rows in
# the concatenated frame used to generate the question pages).
questions_for_docs = []

for _, row in selected_tags_df.iterrows():
    tag_id = row.id
    tag_name = row.tag_name
    # Look up the short excerpt and the full wiki body by their post ids.
    # `.values[0]` raises IndexError when the referenced post is missing.
    tag_description = posts_tag_wiki_excerpt_df.loc[posts_tag_wiki_excerpt_df.id == row.excerpt_post_id]['body'].values[0]
    tag_wiki_body = posts_tag_wiki_df.loc[posts_tag_wiki_df.id == row.wiki_post_id]['body'].values[0]
    # save tag doc page
    TagDocPage(tag_name, tag_description, tag_wiki_body).save(f"{output_dir}/{tag_name}.md")

    # prepare post questions for tag: one subfolder per tag, then keep the
    # n_questions_per_tag questions with the highest `eval`
    create_folder(f"{output_dir}/{tag_name}")
    tag_questions_df = posts_questions_df[posts_questions_df.tag_id == tag_id]
    # sort_values/head return new frames, so no defensive copy is needed
    selected_questions_df = tag_questions_df.sort_values(by='eval', ascending=False).head(n_questions_per_tag)
    questions_for_docs.append(selected_questions_df)

### Generate docs for questions

In [11]:
# Stack the per-tag selections into a single frame of questions to document.
questions_for_docs_df = pd.concat(questions_for_docs)

for _, question in questions_for_docs_df.iterrows():
    parent_tag_name = question.tag_name

    # Tags shown on the page: everything except the redundant tags and the parent tag itself.
    hidden_tags = [*redundant_tags, parent_tag_name]
    question_tags = [tag for tag in question.tags.split('|') if tag not in hidden_tags]

    # Pick the answer with the highest `eval` among this question's answers.
    is_answer_to_question = posts_answers_with_user_df.parent_id == question.id
    selected_answer = (
        posts_answers_with_user_df[is_answer_to_question]
        .sort_values(by='eval', ascending=False)
        .iloc[0]
    )

    # Comments on the chosen answer, highest score first.
    comments = (
        comments_df[comments_df.post_id == selected_answer.id]
        .sort_values(by='score', ascending=False)['text']
        .tolist()
    )

    # Write the page under the parent tag's folder, named after the kebab-cased title.
    output_file_name = human_to_kebab_case(question.title)
    doc_page = PostQuestionDocPage(
        question.title,
        question.body,
        question_tags,
        selected_answer.body,
        comments,
    )
    doc_page.save(f"{output_dir}/{parent_tag_name}/{output_file_name}.md")