# 빅쿼리 퍼블릭 데이터 셋을 이용해서 내 질문에 따른 Stackoverflow 응답시간 예측하기  

### 빅쿼리로부터 질문에 따른 응답시간 데이터 수신

In [1]:
# Import the Google Cloud BigQuery client library.
from google.cloud import bigquery

# Create a BigQuery client bound to the 'hackertone-216701' project.
# NOTE(review): bigquery_client is not referenced again in the visible part of
# this notebook (the query is executed through pandas' read_gbq instead) --
# confirm whether this client is still needed.
bigquery_client = bigquery.Client(project='hackertone-216701')


In [2]:
# SQL used to pull the training data: each row is a question title plus
# avg_ans_sec / 60 exposed as avg_ans_min (presumably seconds -> minutes,
# judging by the column names).
# Adjust the WHERE clause to control the training volume; with the cap at
# avg_ans_sec <= 7200 the download took about 5 minutes.
QUERY = """
    SELECT title,avg_ans_sec/60 as avg_ans_min FROM `stackoverflow_summary.post_accepted_answers` WHERE avg_ans_sec <= 2*3600 """

In [3]:
import pandas as pd
from pandas import read_gbq as gbq

# Run QUERY against BigQuery under the given project and keep the result in
# post_accepted_answers (its `title` and `avg_ans_min` columns are used later
# for training).
# NOTE(review): pandas.read_gbq is deprecated in recent pandas releases in
# favour of pandas_gbq.read_gbq -- confirm against the installed version.
# NOTE(review): this project id repeats the literal passed to bigquery.Client
# earlier in the notebook; keep the two in sync.
project_id = 'hackertone-216701'
post_accepted_answers = gbq(QUERY,project_id=project_id)

Requesting query... ok.
Job ID: 7f29fe90-41fa-4418-815c-b2b525ab3444
Query running...
Query done.
Cache hit.

Retrieving results...
Got 5500222 rows.

Total time taken 252.42 s.
Finished at 2018-10-05 12:09:29.


In [4]:
# Normalizing or scaling avg_ans_sec could improve performance.
# However, once avg_ans_sec is scaled it is not easy to map predictions back to the original values; if avg_ans_sec is limited to under ten thousand, the prediction quality is already acceptable.
# from sklearn.preprocessing import scale

# post_accepted_answers['avg_ans_sec_norm'] = scale(post_accepted_answers.avg_ans_sec.values)
# post_accepted_answers.dtypes


### 전체 데이터를 Training Set 과 Test Set 으로 분리

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set. Features are the question titles,
# targets are the avg_ans_min values; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    post_accepted_answers["title"],
    post_accepted_answers["avg_ans_min"],
    test_size=0.3,
    random_state=42,
)

In [6]:
# Vectorize title to words
from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer(lowercase=True, min_df=5)
# X_train_counts = count_vect.fit_transform(X_train)
# X_train_counts.shape

In [7]:
# Weighting title based on TF-IDF 
from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [8]:
## Use Pipeline for less code 
from sklearn.pipeline import Pipeline
import numpy as np

In [9]:
def get_error_rate(predict, y_test):
    """Print the mean and variance of the absolute prediction error.

    Args:
        predict: Predicted values; must support element-wise subtraction
            with ``y_test``.
        y_test: Ground-truth values to compare against.

    Returns:
        None. Both statistics are written to stdout.
    """
    # Element-wise absolute error, computed once and reused for both stats.
    abs_diff = abs(predict - y_test)
    report = "Predicted diff average:{:5f}\npredict diff variance:{:7f}\n"
    print(report.format(np.mean(abs_diff), np.var(abs_diff)))

### SGD Regression

In [10]:
#%%script false

# Linear regressor trained with stochastic gradient descent on title text.
from sklearn.linear_model import SGDRegressor
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# An SGDClassifier (hinge loss) pipeline was tried first and abandoned; the
# original note reads: classification is not possible because the feature is
# not unique. Regression is used instead.
# NOTE(review): the "svm" names are kept for compatibility, but no `loss` is
# passed to SGDRegressor, so its default loss applies -- confirm whether an
# SVM-style loss was intended.
text_reg_svm = Pipeline([
    # Bag-of-words counts; drop English stop words and terms seen in < 20 titles.
    ('vect', CountVectorizer(stop_words='english', min_df=20)),
    # Re-weight the counts with TF-IDF.
    ('tfidf', TfidfTransformer(use_idf=True)),
    # L2-regularised SGD regressor.
    ('reg-svm', SGDRegressor(penalty='l2', random_state=42, alpha=1e-3)),
])

# Fit on the training titles, then predict for the held-out titles.
sgd_model = text_reg_svm.fit(X_train, y_train)
predicted_svm = text_reg_svm.predict(X_test)


In [11]:
#%%script false 

# Report the mean and variance of the absolute prediction error for the SGD
# model. The target is avg_ans_min, so the figures are in minutes, not seconds.
get_error_rate(predicted_svm, y_test)

Predicted diff average:19.471370
predict diff variance:291.939576



### Decision Tree Regression

In [12]:
%%script false 

from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

text_dt_reg = Pipeline([('vect', CountVectorizer(stop_words='english',min_df=20)), ('tfidf', TfidfTransformer(use_idf=True)), ('reg-dt', DecisionTreeRegressor(random_state=42))])
dt_model = text_dt_reg.fit(X_train, y_train)
predicted_dt = text_dt_reg.predict(X_test)


In [13]:
%%script false
# Cacluate Average Error Rate in Seconds
print("predict diff average:{:5f}\npredict diff variance:{:7f}\n".format(np.mean(abs(predicted_dt - y_test)), np.var(abs(predicted_dt - y_test))))

### RandomForest Regression

In [14]:
#%%script false

# Random-forest regressor on TF-IDF title features.
from sklearn.ensemble import RandomForestRegressor
import warnings

warnings.filterwarnings('ignore')

text_rf_reg = Pipeline([
    # Bag-of-words counts; drop English stop words and terms seen in < 20 titles.
    ('vect', CountVectorizer(stop_words='english', min_df=20)),
    # Re-weight the counts with TF-IDF.
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Small forest (10 trees, depth <= 4) trained on all available cores.
    (
        'reg-rf',
        RandomForestRegressor(
            n_estimators=10,
            max_depth=4,
            n_jobs=-1,
            random_state=42,
            verbose=0,
        ),
    ),
])

# Fit on the training titles, then predict for the held-out titles.
rf_model = text_rf_reg.fit(X_train, y_train)
predicted_rf = text_rf_reg.predict(X_test)


In [15]:
#%%script true
# Report the mean and variance of the absolute prediction error for the
# random-forest model. The target is avg_ans_min, so the figures are in
# minutes, not seconds.
get_error_rate(predicted_rf,y_test)


Predicted diff average:19.844678
predict diff variance:297.943198



### Pickle을 사용해서 ML Engine 으로 전송하기 위한 Model 저장

In [20]:
import pickle

# Serialize the fitted random-forest pipeline to disk.
# The context manager closes the file as soon as the dump finishes, so the
# pickle is fully flushed and no file handle is left open in the kernel.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)


### Parameter Tuning

In [21]:
%%script false 

from sklearn.model_selection import GridSearchCV
parameters = {
#               'vect__ngram_range': [(1, 1), (1, 2)],
#               'vect__min_df':[5,20],
#               'tfidf__use_idf': (True, False),
              'reg-svm__alpha': (1e-2, 1e-3),
             }

gs_clf = GridSearchCV(text_reg_svm, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [18]:
%%script false 
gs_clf.best_score_    
gs_clf.best_params_

### Real World Testing

In [22]:
# Real-world check: Stack Overflow questions posted after the dataset was created.
# 'response' holds the observed answer time for each title; it is compared
# directly with the model output, so it is presumably in minutes like
# avg_ans_min -- TODO confirm.
test_query = {'title':['JavaScript why this mod operation return undefined [on hold]',
                       'BigQuery authorization',
                       'Replace function not working for long query in ORACLE/PLSQL?'],
              'response':[6,240,40]}
df = pd.DataFrame(data=test_query)
test_predict = text_rf_reg.predict(df['title'])

# Walk titles, observed responses and predictions in lockstep instead of
# keeping a manual index counter next to iterrows().
row_format = "{:70s} | real response:{:10.3f} | predict :{:10.3f} | diff :{:10.3f} \n"
for title, response, predicted in zip(df['title'], df['response'], test_predict):
    print(row_format.format(title, response, predicted, abs(predicted - response)))

JavaScript why this mod operation return undefined [on hold]           | real response:     6.000 | predict :    18.778 | diff :    12.778 

BigQuery authorization                                                 | real response:   240.000 | predict :    25.077 | diff :   214.923 

Replace function not working for long query in ORACLE/PLSQL?           | real response:    40.000 | predict :    25.077 | diff :    14.923 

