# 빅쿼리 퍼블릭 데이터 셋을 이용해서 내 질문에 따른 Stackoverflow 응답시간 예측하기  

### Fetch Data from bigquery

In [1]:
# Imports the Google Cloud client library
from google.cloud import bigquery

# Create a BigQuery client for the given project.
bigquery_client = bigquery.Client(
    project='hackertone-216701',
)


In [2]:
# Tune the WHERE clause to control the training volume
# (with avg_ans_sec <= 7200 the run took about 5 minutes).
# The answer time is converted from seconds to minutes in the SELECT list.
QUERY = (
    "\n"
    "    SELECT title,avg_ans_sec/60 as avg_ans_min "
    "FROM `stackoverflow_summary.post_accepted_answers` "
    "WHERE avg_ans_sec <= 12*3600 "
)

# Alternative: restrict the dataset to titles containing a given word.
# NOTE: this variant selects avg_ans_sec without the avg_ans_min alias that
# the train/test split cell reads, so it cannot be swapped in as-is.
# QUERY = """
#     SELECT title,avg_ans_sec FROM `stackoverflow_summary.post_accepted_answers` WHERE lower(title) like '%javascript%' """


In [3]:
import pandas as pd
from pandas import read_gbq as gbq

# Run QUERY against BigQuery under this project and load the result
# into a DataFrame.
project_id = 'hackertone-216701'
post_accepted_answers = pd.read_gbq(QUERY, project_id=project_id)

Requesting query... ok.
Job ID: d831d765-d0ca-4dd3-bdc8-5db4f5f57f61
Query running...
  Elapsed 7.17 s. Waiting...
  Elapsed 8.32 s. Waiting...
  Elapsed 9.46 s. Waiting...
  Elapsed 10.6 s. Waiting...
  Elapsed 11.73 s. Waiting...
  Elapsed 12.87 s. Waiting...
  Elapsed 14.1 s. Waiting...
  Elapsed 15.22 s. Waiting...
  Elapsed 16.35 s. Waiting...
  Elapsed 17.5 s. Waiting...
  Elapsed 18.67 s. Waiting...
  Elapsed 19.8 s. Waiting...
  Elapsed 20.95 s. Waiting...
  Elapsed 22.1 s. Waiting...
  Elapsed 23.24 s. Waiting...
  Elapsed 24.38 s. Waiting...
  Elapsed 25.51 s. Waiting...
  Elapsed 26.66 s. Waiting...
  Elapsed 27.78 s. Waiting...
  Elapsed 28.91 s. Waiting...
  Elapsed 30.03 s. Waiting...
  Elapsed 31.14 s. Waiting...
  Elapsed 32.28 s. Waiting...
  Elapsed 33.42 s. Waiting...
  Elapsed 34.6 s. Waiting...
  Elapsed 35.73 s. Waiting...
  Elapsed 36.87 s. Waiting...
  Elapsed 38.01 s. Waiting...
  Elapsed 39.15 s. Waiting...
  Elapsed 40.28 s. Waiting...
  Elapsed 41.42 s. Wait

In [5]:
# Optionally normalize or scale avg_ans_sec to improve model performance.
# Scaled values are not easy to convert back to the original values; if we limit avg_ans_sec to under ten thousand, the predictions are not bad.
# from sklearn.preprocessing import scale

# post_accepted_answers['avg_ans_sec_norm'] = scale(post_accepted_answers.avg_ans_sec.values)
# post_accepted_answers.dtypes


### Split into Training and Test Set

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible. Features are the question titles, target is avg_ans_min.
X_train, X_test, y_train, y_test = train_test_split(
    post_accepted_answers['title'],
    post_accepted_answers['avg_ans_min'],
    test_size=0.30,
    random_state=42,
)

In [5]:
# Vectorize title to words
from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer(lowercase=True, min_df=5)
# X_train_counts = count_vect.fit_transform(X_train)
# X_train_counts.shape

In [6]:
# Weighting title based on TF-IDF 
from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [7]:
## Use Pipeline for less code 
from sklearn.pipeline import Pipeline
import numpy as np

### SGD Regression

In [8]:
## Linear regression on the title text, trained with stochastic gradient descent.
## NOTE(review): the names below say "svm", but SGDRegressor is created without a
## `loss` argument, so it uses its default loss rather than an SVM-style one — confirm intent.
# from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
import warnings

# Suppress every warning from this point on.
warnings.filterwarnings('ignore')

# Classification was not feasible for this target (original note: features are
# not unique), so the classifier pipeline is kept only for reference.
# text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf-svm', SGDClassifier(loss='hinge',penalty='l2',random_state=42))])

# Bag-of-words counts -> TF-IDF weighting -> L2-regularized SGD regressor.
# The step name 'reg-svm' is referenced by the grid-search parameter keys.
text_reg_svm = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=20)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('reg-svm', SGDRegressor(penalty='l2', random_state=42, alpha=1e-3)),
])
sgd_model = text_reg_svm.fit(X_train, y_train)
predicted_svm = text_reg_svm.predict(X_test)


In [10]:
# Report the mean and variance of the absolute prediction error of the SGD model.
# The target is avg_ans_min, so the error is measured in minutes.
abs_error_svm = abs(predicted_svm - y_test)
error_report = "predict diff average:{:5f}\npredict diff variance:{:7f}\n"
print(error_report.format(np.mean(abs_error_svm), np.var(abs_error_svm)))

predict diff average:75.7231713091031
predict diff variance:9055.107131668421



### Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
import warnings

# Suppress every warning from this point on.
warnings.filterwarnings('ignore')

# Same text features as the SGD pipeline (counts -> TF-IDF), but with a
# decision-tree regressor as the final step. The final step keeps the
# name 'reg-svm' even though it is a tree.
text_dt_reg = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=20)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('reg-svm', DecisionTreeRegressor(random_state=42)),
])
dt_model = text_dt_reg.fit(X_train, y_train)
predicted_dt = text_dt_reg.predict(X_test)


In [None]:
# Report the mean and variance of the absolute prediction error of the decision tree.
# The target is avg_ans_min, so the error is measured in minutes.
abs_error_dt = abs(predicted_dt - y_test)
error_report = "predict diff average:{:5f}\npredict diff variance:{:7f}\n"
print(error_report.format(np.mean(abs_error_dt), np.var(abs_error_dt)))

### Save Model with Pickle

In [44]:
import pickle

# Persist the fitted decision-tree pipeline to disk.
filename = 'model.pkl'
# Use a context manager so the file is flushed and closed even if
# pickle.dump() raises; the bare open() call previously leaked the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(dt_model, model_file)


### Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; keys follow the "<pipeline step>__<parameter>" form.
# Only the regressor's alpha is searched; the other candidates are disabled.
parameters = {
    # 'vect__ngram_range': [(1, 1), (1, 2)],
    # 'vect__min_df': [5, 20],
    # 'tfidf__use_idf': (True, False),
    'reg-svm__alpha': (1e-2, 1e-3),
}

# Search the grid over the SGD pipeline; n_jobs=-1 requests all processors.
gs_clf = GridSearchCV(estimator=text_reg_svm, param_grid=parameters, n_jobs=-1)
# fit() returns the estimator itself, so gs_clf stays bound to the same object.
gs_clf.fit(X_train, y_train)

In [None]:
# Print both results explicitly: a notebook cell only echoes its final
# expression, so a bare `gs_clf.best_score_` line followed by another
# expression is evaluated but never displayed.
print("best score:", gs_clf.best_score_)
print("best params:", gs_clf.best_params_)

### Real World Testing

In [45]:
# Real-world check: Stack Overflow questions posted after the dataset was created,
# each paired with its observed response time.
# NOTE(review): 'response' is presumably in minutes to match avg_ans_min — confirm.
test_query = {
    'title': [
        'JavaScript why this mod operation return undefined [on hold]',
        'BigQuery authorization',
        'Replace function not working for long query in ORACLE/PLSQL?',
    ],
    'response': [6, 240, 40],
}
df = pd.DataFrame(data=test_query)
test_predict = text_reg_svm.predict(df.title)

# One line per question: actual value, prediction, and absolute difference.
row_template = "{:70s} | real response:{:10.3f} | predict :{:10.3f} | diff :{:10.3f} \n"
for i, (_, row) in enumerate(df.iterrows()):
    print(row_template.format(row.title, row.response, test_predict[i], abs(test_predict[i] - row.response)))

JavaScript why this mod operation return undefined [on hold]           | real response:     6.000 | predict :    51.536 | diff :    45.536 

BigQuery authorization                                                 | real response:   240.000 | predict :    93.116 | diff :   146.884 

Replace function not working for long query in ORACLE/PLSQL?           | real response:    40.000 | predict :    56.120 | diff :    16.120 

