In [79]:
!pip install --upgrade google-cloud-bigquery

Requirement already up-to-date: google-cloud-bigquery in /usr/local/envs/py3env/lib/python3.5/site-packages (1.5.1)


## Predict response time for Stack Overflow questions using the BigQuery Stack Overflow public dataset

In [80]:
# Import the Google Cloud BigQuery client library.
from google.cloud import bigquery

# Create a BigQuery client bound to the project ID passed below.
# NOTE(review): `bigquery_client` is not referenced by any other cell visible in this
# notebook (the data load goes through pandas `read_gbq`) — confirm it is still needed.
bigquery_client = bigquery.Client(project='hackertone-216701')


In [81]:
# Upper bound (in seconds) on avg_ans_sec for the rows pulled into the training set.
# Adjust it to control the training volume; the run with 7200 took about 5 min.
MAX_AVG_ANS_SEC = 7200

# The template keeps the original whitespace, so the default threshold yields
# exactly the same query text as the previously hard-coded statement.
QUERY = """
    SELECT title,avg_ans_sec FROM `stackoverflow_summary.post_accepted_answers` WHERE avg_ans_sec <= {} """.format(MAX_AVG_ANS_SEC)


In [82]:
import pandas as pd
from pandas import read_gbq as gbq

# Run QUERY against BigQuery and keep the result as `post_accepted_answers`.
# This is the slow step of the notebook: the recorded run retrieved 5,500,222 rows
# and took 296.62 s in total.
# NOTE(review): this project ID repeats the literal passed to bigquery.Client in the
# client cell — keep the two in sync.
project_id = 'hackertone-216701'
post_accepted_answers = gbq(QUERY,project_id=project_id)

Requesting query... ok.
Job ID: f397b589-5e58-42d8-94ec-86c1d87b2532
Query running...
  Elapsed 7.05 s. Waiting...
  Elapsed 8.18 s. Waiting...
  Elapsed 9.32 s. Waiting...
  Elapsed 10.44 s. Waiting...
  Elapsed 11.57 s. Waiting...
  Elapsed 12.71 s. Waiting...
  Elapsed 13.83 s. Waiting...
  Elapsed 14.96 s. Waiting...
  Elapsed 16.08 s. Waiting...
  Elapsed 17.21 s. Waiting...
  Elapsed 18.33 s. Waiting...
  Elapsed 19.47 s. Waiting...
  Elapsed 20.62 s. Waiting...
  Elapsed 21.76 s. Waiting...
  Elapsed 22.88 s. Waiting...
  Elapsed 23.99 s. Waiting...
  Elapsed 25.12 s. Waiting...
  Elapsed 26.24 s. Waiting...
  Elapsed 27.36 s. Waiting...
  Elapsed 28.49 s. Waiting...
  Elapsed 29.61 s. Waiting...
  Elapsed 30.76 s. Waiting...
  Elapsed 31.89 s. Waiting...
Query done.
Processed: 399.0 MB Billed: 400.0 MB
Standard price: $0.00 USD

Retrieving results...
Got 5500222 rows.

Total time taken 296.62 s.
Finished at 2018-10-01 06:34:34.


In [83]:
# Normalizing or scaling avg_ans_sec might improve performance.
# However, scaled values are not easy to convert back to the original seconds, and when avg_ans_sec is limited to under ten thousand the predictions are not bad without scaling.
# from sklearn.preprocessing import scale

# post_accepted_answers['avg_ans_sec_norm'] = scale(post_accepted_answers.avg_ans_sec.values)
# post_accepted_answers.dtypes


In [84]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation: titles are the features, avg_ans_sec is the target.
# The fixed random_state keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    post_accepted_answers['title'],
    post_accepted_answers['avg_ans_sec'],
    test_size=0.33,
    random_state=42,
)

In [101]:
# Vectorize the training titles into a word-count matrix (lowercased, min_df=5).
# NOTE(review): the regression pipelines further down construct their own
# CountVectorizer(stop_words='english'), so `count_vect` / `X_train_counts` only feed
# the stand-alone TF-IDF cell — this cell is exploratory.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(lowercase=True, min_df=5)
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(3685148, 44323)

In [100]:
# Re-weight the word-count matrix of the training titles with TF-IDF.
# NOTE(review): the recorded output of this cell has 213423 columns while the recorded
# count matrix has 44323, and this cell's execution count (100) precedes the vectorizer
# cell's (101) — presumably this output predates the current vectorizer settings;
# restart the kernel and run top to bottom to confirm.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3685148, 213423)

In [87]:
## Use Pipeline for less code 
from sklearn.pipeline import Pipeline
import numpy as np

### SGD Regression

In [102]:
## Regress answer time on title text: word counts -> TF-IDF -> SGDRegressor
# NOTE(review): the original header called this a Support Vector Machine, but no `loss`
# is passed to SGDRegressor, so its default loss applies — confirm against the
# scikit-learn docs that this is the intended model.
# from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor

# Author's note: classification could not be used here ("not unique feature").
# The attempted classifier pipeline is kept for reference:
# text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf-svm', SGDClassifier(loss='hinge',penalty='l2',random_state=42))])

text_reg_svm = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('reg-svm', SGDRegressor(penalty='l2', random_state=42)),
])
_ = text_reg_svm.fit(X_train, y_train)

# Predictions for the held-out titles, used by the error summary cell.
predicted_svm = text_reg_svm.predict(X_test)




In [96]:
# Mean and variance of the signed prediction error (predicted - actual) on the test set.
# Over- and under-predictions cancel in the mean, so read it as bias rather than as the
# typical size of an error; the variance carries the spread.
print(
    "predict diff average:{}\npredict diff variance:{}\n".format(
        np.mean(predicted_svm - y_test),
        np.var(predicted_svm - y_test),
    )
)

predict diff average:-4.269388626194557
predict diff variance:2379537.402369696



### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression is a classifier, so it cannot be fitted on the raw avg_ans_sec
# target: depending on the column dtype it either rejects a continuous target or
# treats every distinct second value as its own class. The target is therefore
# bucketed into fixed-width response-time classes before fitting.
LR_BUCKET_SEC = 600  # bucket width in seconds (10 minutes); tune as needed

# Class label = index of the bucket the answer time falls into.
y_train_bucket = (y_train // LR_BUCKET_SEC).astype(int)

text_log_reg = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('log-reg', LogisticRegression(random_state=42)),
])
_ = text_log_reg.fit(X_train, y_train_bucket)

# Map each predicted bucket index back to seconds (bucket midpoint) so that
# `predicted_lr` stays directly comparable with `y_test`.
predicted_lr = text_log_reg.predict(X_test) * LR_BUCKET_SEC + LR_BUCKET_SEC / 2


In [None]:
# Mean and variance of the signed prediction error (predicted - actual) on the test set
# for the logistic-regression pipeline. Over- and under-predictions cancel in the mean,
# so read it as bias rather than as the typical size of an error.
print(
    "predict diff average:{}\npredict diff variance:{}\n".format(
        np.mean(predicted_lr - y_test),
        np.var(predicted_lr - y_test),
    )
)

### Real World Testing

In [97]:
# Spot check on three Stack Overflow questions posted after the dataset was created.
# 'response' holds the observed answer time for each title.
test_query = {
    'title': [
        'JavaScript why this mod operation return undefined [on hold]',
        'BigQuery authorization',
        'Replace function not working for long query in ORACLE/PLSQL?',
    ],
    'response': [360, 14700, 2400],
}
df = pd.DataFrame(data=test_query)
test_predict = text_reg_svm.predict(df['title'])

# Print one line per question: observed value, prediction, and signed difference.
for (_, row), predicted in zip(df.iterrows(), test_predict):
    print("{:70s} | real response:{:10.3f} | predict :{:10.3f} | diff :{:10.3f} \n".format(row.title, row.response, predicted, predicted - row.response))

JavaScript why this mod operation return undefined [on hold]           | real response:   360.000 | predict :  1066.657 | diff :   706.657 

BigQuery authorization                                                 | real response: 14700.000 | predict :  1876.356 | diff :-12823.644 

Replace function not working for long query in ORACLE/PLSQL?           | real response:  2400.000 | predict :  1406.725 | diff :  -993.275 



In [91]:
# BigQuery authorization