# BigQuery 집계 결과를 이용해 Stack Overflow 응답 시간 예측하기
<br> 
## BigQuery로 답변이 채택된 질문 550만 건에 대해, 질문자가 입력한 tag를 기반으로 예상 응답 시간 분석

In [20]:
# Imports the Google Cloud client library
from google.cloud import bigquery

def _format_minutes(seconds):
    """Render a duration in seconds as a 7-character wide minutes field.

    Args:
        seconds: Duration in seconds, or None when the query returned NULL.

    Returns:
        str: The value divided by 60 formatted as ``{:7.0f}``, or a
        right-aligned ``n/a`` of the same width when ``seconds`` is None.
    """
    if seconds is None:
        return "{:>7s}".format("n/a")
    return "{:7.0f}".format(seconds / 60)


def query_stackoverflow(project, title):
    """Print per-tag answer-time statistics for the words of a question title.

    The title is lower-cased and split on whitespace; every resulting word is
    passed to BigQuery as an array parameter and matched against the tags
    stored in ``stackoverflow_summary.post_accepted_answers``. For each
    matching tag the first quartile, median and average of ``avg_ans_sec``
    are printed, converted to minutes.

    Args:
        project (str): Project the BigQuery client is created for. ``None``
            lets the client library infer the project from the environment.
        title (str): Question title whose words are used as candidate tags.

    Returns:
        None. Results are written to stdout.
    """
    # Use the caller-supplied project; previously the argument was ignored
    # and the client always used the environment default.
    client = bigquery.Client(project=project)

    # Query percentile of response based on tag.
    # Quartiles are read by indexing into the sorted per-tag array; SAFE_OFFSET
    # yields NULL when the computed index is out of range (possible for tags
    # with very few rows), so q1/q2/q3 may come back as None.
    query = """
            WITH
              w_tag AS(
              SELECT
                split_tags,
                avg_ans_sec
              FROM
                `hackertone-216701.stackoverflow_summary.post_accepted_answers`
              WHERE avg_ans_sec <=2*3600)
            SELECT
              tag,
              arr_ans_sec[SAFE_OFFSET(CAST(array_length(arr_ans_sec)*1/4 AS INT64))] as q1,
              arr_ans_sec[SAFE_OFFSET(CAST(array_length(arr_ans_sec)*2/4 AS INT64))] as q2,
              arr_ans_sec[SAFE_OFFSET(CAST(array_length(arr_ans_sec)*3/4 AS INT64))] as q3,
              average,
              array_length(arr_ans_sec) as count,
              array_length(arr_ans_sec)/sum(array_length(arr_ans_sec)) over() percent
            FROM
            (
            SELECT
              tag,
              ARRAY_AGG(avg_ans_sec ORDER BY avg_ans_sec asc) AS arr_ans_sec,
              avg(avg_ans_sec) as average
            FROM
              w_tag,
              UNNEST(split_tags) AS tag
              WHERE tag in UNNEST(@title) -- Parameterized Query 
            GROUP BY
              tag
            )
            ORDER BY count desc, q2
            --LIMIT 10 
            """

    # Words of the title become the candidate tags bound to @title.
    splited_title = title.lower().split()
    query_params = [
        bigquery.ArrayQueryParameter('title', 'STRING', splited_title),
    ]

    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params
    query_job = client.query(
        query,
        # Location must match that of the dataset(s) referenced in the query.
        location='US',
        job_config=job_config)  # API request - starts the query

    results = query_job.result()  # Waits for job to complete.
    print("Expected response time upon your question: \n{} \n".format(splited_title))
    for row in results:
        # Format each duration separately so a NULL quartile prints as "n/a"
        # instead of raising TypeError on None / 60.
        print("tag:{:15s} | 25%:{} 분 | Median:{} 분 | Average:{} 분".format(
            row.tag,
            _format_minutes(row.q1),
            _format_minutes(row.q2),
            _format_minutes(row.average)))

    # Sanity check: the job should be finished once result() has returned.
    assert query_job.state == 'DONE'


In [21]:
project = 'hackertone-216701'

# Question text to be asked on Stack Overflow
query = 'Why does Java not allow multiple inheritance but does allow conforming to multiple interfaces with default implementations'

query_stackoverflow(project=project, title=query)


Expected response time upon your question: 
['why', 'does', 'java', 'not', 'allow', 'multiple', 'inheritance', 'but', 'does', 'allow', 'conforming', 'to', 'multiple', 'interfaces', 'with', 'default', 'implementations'] 

tag:java            | 25%:      5 분 | Median:     12 분 | Average:     22 분
tag:inheritance     | 25%:      5 분 | Median:     10 분 | Average:     19 분
tag:default         | 25%:      5 분 | Median:     11 분 | Average:     21 분


# Bigquery 와 ML 엔진을 이용해서 Stackoverflow 응답시간 예측 하기

### BigQuery로 추출한, 12시간 안에 답변된 질문 580만 건을 ML 알고리즘으로 학습한 후, 해당 학습 모델을 ML 엔진에 업로드
<br> 
### 입력된 질문에 대해서 ML엔진이 모델분석을 통해서 예상되는 응답시간을 출력

In [3]:
import googleapiclient.discovery

def predict_json(project, model, instances, version=None):
    """Request online predictions from a deployed Cloud ML Engine model.

    Args:
        project (str): Project in which the Cloud ML Engine model lives.
        model (str): Name of the model.
        instances (list): Input instances, sent as the ``instances`` field
            of the request body in whatever form the deployed model expects.
        version (str): Optional model version; when None the version
            segment is left out of the resource name.

    Returns:
        The value stored under ``predictions`` in the service response.

    Raises:
        RuntimeError: If the response contains an ``error`` entry.
    """
    # Credentials are resolved by the client library; to authenticate set
    # GOOGLE_APPLICATION_CREDENTIALS=<path_to_service_account_file>
    ml_service = googleapiclient.discovery.build('ml', 'v1')

    # Resource path: projects/<project>/models/<model>[/versions/<version>]
    path_segments = ['projects/{}/models/{}'.format(project, model)]
    if version is not None:
        path_segments.append('/versions/{}'.format(version))
    resource_name = ''.join(path_segments)

    request = ml_service.projects().predict(
        name=resource_name,
        body={'instances': instances},
    )
    reply = request.execute()

    if 'error' not in reply:
        return reply['predictions']
    raise RuntimeError(reply['error'])

In [17]:
import pandas as pd
# Name of the model deployed on ML Engine
model = 'stackoverflow'
# Version of the ML Engine model to query
versions = 'SGD_Regression_2hour'
#versions='RandomForest_Regression'


# The question text, wrapped in a list as the single prediction instance
mlquery = [query]

# Send the question to ML Engine and collect the returned predictions
result = predict_json(project, model, mlquery, version=versions)

# Show the first prediction with one decimal place
print("예상되는 답변 시간: {0:.1f} 분".format(result[0]))


예상되는 답변 시간: 22.0 분
