## Quick Approach to Question/Answer Classification.

This is a first attempt to build a classification model that would distinguish between questions and answers on StackOverflow based on the textual data only.

In [None]:
import numpy as np
import pandas as pd
import os
import bq_helper
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [78]:
from sklearn.model_selection import train_test_split

In [90]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [82]:
from keras.models import Sequential
from keras.layers import Dense

In [112]:
from IPython.display import SVG
from keras.utils import model_to_dot

ImportError: cannot import name 'model_to_dot'

In [None]:
# create a helper object for this dataset
stack_overflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                              dataset_name="stackoverflow")

In [None]:
stack_overflow.list_tables()

In [None]:
stack_overflow.head('posts_answers')

In [None]:
# load existing dataset with all questions from 2018
df_all_question_2018 = pd.read_csv('../input/stackoverflow/all_stackoverflow_questions_with_time_to_answer_2018.csv')

In [None]:
# set id as an index
df_all_question_2018.set_index('id', inplace=True)

In [66]:
df_all_question_2018.head()

Unnamed: 0_level_0,title,body,answer_count,creation_date,owner_user_id,tags,score,first_ans_date,tta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
49972225,trying to write structure to file but keep get...,<p>I attempted to create a program that opens ...,0,2018-04-23 01:18:36.960000+00:00,9399752.0,c++|dev-c++,0,,
52063301,Generated apk crashes out,<p>I've just finished my last android project ...,2,2018-08-28 17:19:36.097000+00:00,10245431.0,android,0,2018-08-28 18:13:13.477000+00:00,3217.0
48815044,mapping list source to target,<p>I have an object organization that contains...,1,2018-02-15 19:40:27.830000+00:00,3469948.0,spring-boot|mapstruct,0,2018-02-15 22:44:57.220000+00:00,11069.0
51941863,Passing YAML key values to Python,<p>I have a YAML file i.e. <code>test.yml</cod...,0,2018-08-21 05:03:52.520000+00:00,3150734.0,python|yaml,0,,
50679866,implementation of BST algorithm,<p>I want to transform this algorithm in pytho...,1,2018-06-04 11:46:33.033000+00:00,8346789.0,python|binary-search-tree,0,2018-06-04 14:21:03.790000+00:00,9270.0


In [None]:
"""
This is a query to extract answers data that correspond to
the first answers to 2018 questions.
"""
qu_earliest_ans_body = """
WITH answers_table AS
(
SELECT 
    a.parent_id AS que_id,
    a.creation_date AS ans_date,
    a.body AS ans_body,
    a.id AS ans_id
FROM
    `bigquery-public-data.stackoverflow.posts_answers` AS a
INNER JOIN `bigquery-public-data.stackoverflow.posts_questions` AS q
ON q.id=a.parent_id
WHERE
    a.parent_id IN (SELECT id FROM `bigquery-public-data.stackoverflow.posts_questions` 
    WHERE EXTRACT(YEAR FROM creation_date)=2018)
)
SELECT 
       que_id,
       ans_date,
       ans_body,
       ans_id
FROM
answers_table
WHERE
CONCAT(CAST(que_id AS STRING), CAST(ans_date AS STRING))  IN 
    (SELECT CONCAT(CAST(que_id AS STRING), CAST(MIN(ans_date) AS STRING)) FROM answers_table GROUP BY que_id);
"""

In [None]:
df_answers = stack_overflow.query_to_pandas_safe(qu_earliest_ans_body, max_gb_scanned=21)

In [50]:
df_answers.head()

Unnamed: 0,que_id,ans_date,ans_body,ans_id
0,53173885,2018-11-08 10:59:02.707000+00:00,<p>I did a mistake that I forgot to add the im...,53206347
1,51273342,2018-07-10 20:37:38.567000+00:00,"<p>Copying the files (and the database, of cou...",51273542
2,49294634,2018-03-15 13:40:59.967000+00:00,<p>You shouldn't use Loop controller to loop t...,49301004
3,53802757,2018-12-16 14:00:57.277000+00:00,<p>Based on your code I think you are trying t...,53802868
4,52348009,2018-09-15 19:32:05.350000+00:00,<p>Forget to add default storeview to webserve...,52348162


In [67]:
len(df_answers), len(df_answers.que_id.unique())

(1540106, 1540105)

In [53]:
df_answers.que_id.value_counts().head()

51490721    2
50333695    1
48145435    1
53570424    1
52023867    1
Name: que_id, dtype: int64

In [54]:
# one collision on answer date is observed
df_answers[df_answers.que_id==51490721]

Unnamed: 0,que_id,ans_date,ans_body,ans_id
7776,51490721,2018-07-24 05:08:15.377000+00:00,<p>Please use PATCH in routes for example :</p...,51490841
10417,51490721,2018-07-24 05:08:15.377000+00:00,"<p>your method is post , but you send patch re...",51490842


In [None]:
df_all_question_2018.head()

In [None]:
# sanity check
(df_answers.ans_date.isin(df_all_question_2018.first_ans_date)).sum()

### Building Classification Model

#### Constructing features and labels

We will vectorize the texts based on TF-iDF vectorizer:

In [71]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=1000, max_df=1.e6, token_pattern=r'(?u)\b[A-Za-z]+\b')

In [72]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1000000.0, max_features=None, min_df=1000,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b[A-Za-z]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [57]:
len(df_all_question_2018), len(df_answers)

(2168086, 1540106)

In [73]:
X_que_ans = vectorizer.fit_transform(np.concatenate([df_all_question_2018.body, df_answers.ans_body]))

In [74]:
X_que_ans.shape

(3708192, 10083)

In [75]:
print(vectorizer.get_feature_names()[:1000])

['aa', 'aaa', 'aaaa', 'aac', 'aad', 'aar', 'ab', 'abc', 'abcd', 'abi', 'ability', 'able', 'abort', 'aborted', 'aborting', 'abs', 'absence', 'absent', 'absolute', 'absolutely', 'abspath', 'abstract', 'abstractapplicationcontext', 'abstractautowirecapablebeanfactory', 'abstractbeanfactory', 'abstraction', 'abstractprotocol', 'ac', 'acc', 'acceleration', 'accent', 'accept', 'acceptable', 'accepted', 'accepting', 'accepts', 'access', 'accesscontroller', 'accessed', 'accesses', 'accessibility', 'accessible', 'accessing', 'accessor', 'accesstoken', 'accidentally', 'accommodate', 'accomplish', 'accomplished', 'according', 'accordingly', 'accordion', 'account', 'accountid', 'accounting', 'accounts', 'accumulate', 'accumulator', 'accuracy', 'accurate', 'accurately', 'ace', 'acf', 'achieve', 'achieved', 'achieving', 'achive', 'ack', 'acl', 'acme', 'acquire', 'acquired', 'act', 'acting', 'action', 'actionbar', 'actionbarsize', 'actionbutton', 'actionevent', 'actionlink', 'actionlistener', 'action

In [76]:
# 0 will stand for questions , and 1 will stand for answers
y_que_ans = np.concatenate([np.zeros(len(df_all_question_2018.body)), np.ones(len(df_answers.ans_body))])

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_que_ans, y_que_ans, test_size=0.33, random_state=42)

In [101]:
y_que_ans.shape[0]

3708192

In [102]:
X_train_ix, X_test_ix, y_train_ix, y_test_ix = train_test_split(np.arange(X_que_ans.shape[0]), np.arange(X_que_ans.shape[0]), 
                                                                test_size=0.33, random_state=42)

In [104]:
#X_test_ix

#### Neural Net approach with Keras

Let's build a neural net with 2 hidden layers, 1 input layer and 1 output layer. Parameter `input_dim` should corresond to number of dimensions in our TFiDF matrix:

In [83]:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=10083))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.


In [84]:
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['binary_accuracy'])

In [85]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                645376    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 653,761
Trainable params: 653,761
Non-trainable params: 0
_________________________________________________________________


In [86]:
model.fit(X_train, y_train, epochs=5, batch_size=32)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5d7000c278>

In [87]:
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)



In [88]:
loss_and_metrics

[0.27677469347346545, 0.8821136483990221]

In [89]:
y_test_pred = model.predict_classes(X_test)

In [91]:
accuracy_score(y_test, y_test_pred), precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred)

(0.8821136483986324, 0.811809007844284, 0.9327756851232898)

In [113]:
accuracy_score(y_test, y_test_pred), precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred)

(0.8821136483986324, 0.811809007844284, 0.9327756851232898)

In [92]:
query_text = """I am trying to figure out what's the best way to solve this problem. And I'm stuck."""

In [96]:
answer_text = """I have found this solution for you. It should work!"""

In [97]:
X_ans = vectorizer.transform([answer_text])

In [93]:
X_ = vectorizer.transform([query_text])

In [94]:
X_.shape

(1, 10083)

In [95]:
model.predict(X_)

array([[0.00055787]], dtype=float32)

In [98]:
model.predict(X_ans)

array([[0.9997286]], dtype=float32)

In [105]:
body_test = np.concatenate([df_all_question_2018.body, df_answers.ans_body])[X_test_ix]

In [107]:
body_test.shape, X_test.shape

((1223704,), (1223704, 10083))

In [108]:
X_body_test = vectorizer.transform(body_test)

In [117]:
X_body_test != X_test

<1223704x10083 sparse matrix of type '<class 'numpy.bool_'>'
	with 28744174 stored elements in Compressed Sparse Row format>

In [119]:
X_body_test.shape, X_test.shape

((1223704, 10083), (1223704, 10083))

In [126]:
(X_body_test[-1].todense() == X_test[-1].todense()).mean()

1.0

In [109]:
y_body_test = y_que_ans[y_test_ix]

In [110]:
y_body_test_pred = model.predict_classes(X_body_test)

In [129]:
accuracy_score(y_body_test, y_body_test_pred), precision_score(y_body_test, y_body_test_pred), recall_score(y_body_test, y_body_test_pred)

(0.8821136483986324, 0.811809007844284, 0.9327756851232898)

In [115]:
y_body_test_pred, y_test_pred

(array([[1],
        [0],
        [1],
        ...,
        [1],
        [0],
        [1]], dtype=int32), array([[1],
        [0],
        [1],
        ...,
        [1],
        [0],
        [1]], dtype=int32))

### Error Analysis