# Predict response time on Stackoverflow questions based on Bigquery stackoverflow public dataset

### Fetch Data from bigquery

In [33]:
# Imports the Google Cloud client library
from google.cloud import bigquery

# Instantiates a client
bigquery_client = bigquery.Client(project='hackertone-216701')


In [76]:
# Adjust WHERE clause based on trainning volume, avg_ans_sec <= 7200 took 5min 
QUERY = """
    SELECT title,avg_ans_sec FROM `stackoverflow_summary.post_accepted_answers` WHERE avg_ans_sec <= 3*3600 """

# Or, querying dataset based on input words
# QUERY = """
#     SELECT title,avg_ans_sec FROM `stackoverflow_summary.post_accepted_answers` WHERE lower(title) like '%javascript%' """


In [77]:
import pandas as pd
from pandas import read_gbq as gbq

project_id = 'hackertone-216701'
post_accepted_answers = gbq(QUERY,project_id=project_id)

Requesting query... ok.
Job ID: 4e7a225e-6f56-452f-bc8f-c0f64cd6e7b0
Query running...
  Elapsed 7.04 s. Waiting...
  Elapsed 8.19 s. Waiting...
  Elapsed 9.33 s. Waiting...
  Elapsed 10.45 s. Waiting...
  Elapsed 11.59 s. Waiting...
  Elapsed 12.74 s. Waiting...
  Elapsed 13.87 s. Waiting...
  Elapsed 15.0 s. Waiting...
  Elapsed 16.12 s. Waiting...
  Elapsed 17.25 s. Waiting...
  Elapsed 18.39 s. Waiting...
  Elapsed 19.5 s. Waiting...
  Elapsed 20.62 s. Waiting...
  Elapsed 21.76 s. Waiting...
  Elapsed 22.91 s. Waiting...
  Elapsed 24.05 s. Waiting...
  Elapsed 25.18 s. Waiting...
  Elapsed 26.32 s. Waiting...
  Elapsed 27.46 s. Waiting...
  Elapsed 28.58 s. Waiting...
  Elapsed 29.74 s. Waiting...
  Elapsed 30.88 s. Waiting...
  Elapsed 32.02 s. Waiting...
  Elapsed 33.16 s. Waiting...
  Elapsed 34.33 s. Waiting...
  Elapsed 35.46 s. Waiting...
  Elapsed 36.58 s. Waiting...
  Elapsed 37.72 s. Waiting...
Query done.
Processed: 399.0 MB Billed: 400.0 MB
Standard price: $0.00 USD

Ret

In [36]:
# Normalize or Scale avg_ans_sec for performance improvement 
# If scale the avg_ans_sec, not easy to reverse values to original, if we reduce avg_ans_sec under 10 thousands prediction is not bad 
# from sklearn.preprocessing import scale

# post_accepted_answers['avg_ans_sec_norm'] = scale(post_accepted_answers.avg_ans_sec.values)
# post_accepted_answers.dtypes


### Split into Training and Test Set

In [78]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split into train and test set 
X_train , X_test, y_train, y_test = train_test_split(post_accepted_answers.title, post_accepted_answers.avg_ans_sec, test_size=0.30, random_state=42) 

In [38]:
# Vectorize title to words
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer(lowercase=True, min_df=5)
# X_train_counts = count_vect.fit_transform(X_train)
# X_train_counts.shape

In [39]:
# Weighting title based on TF-IDF 
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [79]:
## Use Pipeline for less code 
from sklearn.pipeline import Pipeline
import numpy as np

### SGD Regression

In [80]:
## Use Supper Vector Machines Algorithm
# from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
import warnings
warnings.filterwarnings('ignore')

# Can not implement classification cause for not unique feature
# text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf-svm', SGDClassifier(loss='hinge',penalty='l2',random_state=42))])

text_reg_svm = Pipeline([('vect', CountVectorizer(stop_words='english',min_df=20)), ('tfidf', TfidfTransformer(use_idf=True)), ('reg-svm', SGDRegressor(penalty='l2',random_state=42,alpha=1e-3))])
_ = text_reg_svm.fit(X_train, y_train)
predicted_svm = text_reg_svm.predict(X_test)


In [82]:
# Cacluate Average Error Rate in Seconds
print("predict diff average:{}\npredict diff variance:{}\n".format(np.mean(abs(predicted_svm - y_test)), np.var(abs(predicted_svm - y_test))))

predict diff average:1595.9731167385085
predict diff variance:2318736.200456321



### Parameter Tunning

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {
#               'vect__ngram_range': [(1, 1), (1, 2)],
#               'vect__min_df':[5,20],
#               'tfidf__use_idf': (True, False),
              'reg-svm__alpha': (1e-2, 1e-3),
             }

gs_clf = GridSearchCV(text_reg_svm, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [63]:
gs_clf.best_score_    
gs_clf.best_params_

{'vect__min_df': 20}

### Real World Testing

In [74]:
# Real world testing , stackoverflow queries after dataset being created.
test_query = {'title':['JavaScript why this mod operation return undefined [on hold]',
                       'BigQuery authorization',
                       'Replace function not working for long query in ORACLE/PLSQL?'],
              'response':[360,14700,2400]}
df = pd.DataFrame(data=test_query)
test_predict = text_reg_svm.predict(df.title)

i=0
for _,row in df.iterrows():
  print("{:70s} | real response:{:10.3f} | predict :{:10.3f} | diff :{:10.3f} \n".format(row.title, row.response,test_predict[i],test_predict[i] - row.response))
  i=i+1

JavaScript why this mod operation return undefined [on hold]           | real response:   360.000 | predict :   835.059 | diff :   475.059 

BigQuery authorization                                                 | real response: 14700.000 | predict :  1080.060 | diff :-13619.940 

Replace function not working for long query in ORACLE/PLSQL?           | real response:  2400.000 | predict :   939.838 | diff : -1460.162 

