In [1]:
# Extend the module search path with the parent directory before importing
# download_utils (presumably that is where download_utils.py lives — confirm).
import sys
sys.path.append("..")
from download_utils import download_project_resources
# Make sure the project data files are available; per the recorded output,
# files that already exist are reported as "already downloaded".
download_project_resources()

File data\dialogues.tsv is already downloaded.
File data\tagged_posts.tsv is already downloaded.


In [2]:
from utils import *
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import numpy as np
import pandas as pd
import pickle
import re
import utils


In [4]:
from utils import RESOURCE_PATH

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
from sklearn.multiclass import OneVsRestClassifier

In [8]:
from sklearn import preprocessing

## Intent and language recognition

In [9]:
def tfidf_features(X_train, X_test, vectorizer_path):
    """Fit a TF-IDF vectorizer on the training texts, transform both splits, and pickle it.

    Args:
        X_train: iterable of raw text documents used to fit the vectorizer.
        X_test: iterable of raw text documents transformed with the fitted
            vectorizer (it is never used for fitting).
        vectorizer_path: file path the fitted vectorizer is pickled to
            (opened in binary write mode, overwriting any existing file).

    Returns:
        A tuple ``(train_matrix, test_matrix)`` holding the results of
        ``fit_transform(X_train)`` and ``transform(X_test)``.
    """
    # The token pattern must be a raw string: '\S' is not a valid Python string
    # escape, so the non-raw literal triggers an invalid-escape warning on
    # newer interpreters. r'(\S+)' is the exact same regex (whitespace-delimited
    # tokens), just spelled safely.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                                       token_pattern=r'(\S+)')

    # Fit on the training data only, then reuse the learned vocabulary/IDF
    # weights for the test data.
    train_matrix = tfidf_vectorizer.fit_transform(X_train)
    test_matrix = tfidf_vectorizer.transform(X_test)

    # Persist the fitted vectorizer so it can be reloaded later.
    with open(vectorizer_path, 'wb') as vectorizer_file:
        pickle.dump(tfidf_vectorizer, vectorizer_file)

    return train_matrix, test_matrix

In [10]:
# Draw the same number of rows from each corpus; the fixed random_state keeps
# the sample identical from run to run.
sample_size = 200000
diag_df = (
    pd.read_csv('data/dialogues.tsv', sep='\t')
    .sample(n=sample_size, random_state=0)
)
so_df = (
    pd.read_csv('data/tagged_posts.tsv', sep='\t')
    .sample(n=sample_size, random_state=0)
)

In [11]:
# Preview the first rows of the sampled dialogue frame.
diag_df.head()

Unnamed: 0,text,tag
82925,"Donna, you are a muffin.",dialogue
48774,He was here last night till about two o'clock....,dialogue
55394,"All right, then make an appointment with her s...",dialogue
90806,"Hey, what is this-an interview? We're supposed...",dialogue
107758,Yeah. He's just a friend of mine I was trying ...,dialogue


In [12]:
# Preview the first rows of the sampled tagged-posts frame.
so_df.head()

Unnamed: 0,post_id,title,tag
2168983,43837842,Efficient Algorithm to compose valid expressio...,python
1084095,15747223,Why does this basic thread program fail with C...,c_cpp
1049020,15189594,Link to scroll to top not working,javascript
200466,3273927,Is it possible to implement ping on windows ph...,c#
1200249,17684551,GLSL normal mapping issue,c_cpp


In [13]:
#diag_df['text']=  diag_df['text'].apply(text_prepare)
#so_df['title'] = so_df['title'].apply(text_prepare)

In [14]:
# Stack both corpora into one array of texts; the labels follow the same
# order (all dialogue rows first, then all StackOverflow titles).
x = np.concatenate((diag_df['text'].values, so_df['title'].values))
y = ['dialogue'] * len(diag_df) + ['stackoverflow'] * len(so_df)

# Hold out 10% of the examples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=0
)
print('Train size= {}, test size= {}' .format(len(x_train), len(x_test)))

# Vectorize both splits; tfidf_features also pickles the fitted vectorizer
# to the TFIDF_VECTORIZER resource path.
X_train_tfidf, X_test_tfidf = tfidf_features(
    x_train, x_test, RESOURCE_PATH['TFIDF_VECTORIZER']
)

Train size= 360000, test size= 40000


In [15]:
# L2-regularised logistic regression that separates dialogue text from
# StackOverflow titles using the TF-IDF features.
intent_recognizer = LogisticRegression(
    penalty='l2',
    C=10,
    solver='newton-cg',
    n_jobs=-1,
)
intent_recognizer.fit(X_train_tfidf, y_train)

In [16]:
# Score the intent recognizer on the held-out split.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.9956


In [17]:
# Persist the trained intent recognizer to the INTENT_RECOGNIZER resource path.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open(...) previously left the handle dangling.
with open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb') as intent_recognizer_file:
    pickle.dump(intent_recognizer, intent_recognizer_file)

## Programming language classification

In [18]:
# Rebind x / y for the second task: post titles are the inputs and the
# post tag is the target.
x, y = so_df['title'].values, so_df['tag'].values

In [19]:
# Hold out 20% of the tagged posts for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
print('Train size= {}. test_size= {}'.format(len(x_train), len(x_test)))

Train size= 160000. test_size= 40000


In [20]:
# Reload the TF-IDF vectorizer pickled earlier in this notebook so the tag
# classifier uses the same features as the intent recognizer. The context
# manager closes the file handle, which the bare open(...) call never did.
# NOTE: only unpickle files you trust; this one is produced by this notebook.
with open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

x_train_tfidf = vectorizer.transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [21]:
# Base binary model; OneVsRestClassifier fits one copy of it per tag.
lr = LogisticRegression(
    penalty='l2',
    C=5,
    solver='newton-cg',
    n_jobs=-1,
)
tag_classifier = OneVsRestClassifier(lr)
tag_classifier.fit(x_train_tfidf, y_train)

In [22]:
# Score the tag classifier on the held-out split.
y_test_pred = tag_classifier.predict(x_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Test accuracy = {}".format(test_accuracy))

Test accuracy = 0.7718


In [23]:
# Persist the trained tag classifier to the TAG_CLASSIFIER resource path.
# Using a context manager ensures the file is flushed and closed; the bare
# open(...) previously leaked the handle.
with open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb') as tag_classifier_file:
    pickle.dump(tag_classifier, tag_classifier_file)

## Ranking questions with embeddings

In [24]:
# Load the word embeddings and their dimensionality via the load_embeddings helper.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# 'word_embeddings.tsv'. The file has to exist relative to the working
# directory (or the path has to be corrected) before the embedding cells
# further down, which use starspace_embeddings and embeddings_dim, can run.
starspace_embeddings, embeddings_dim = load_embeddings('word_embeddings.tsv')

FileNotFoundError: [Errno 2] No such file or directory: 'word_embeddings.tsv'

In [None]:
# Read the full tagged-posts table (no sampling here, unlike so_df above).
posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')

In [None]:
# Exploratory look at per-tag counts: groupby(...).count() yields a DataFrame
# with the number of non-null values in each column for every tag.
counts_by_tag = posts_df.groupby(posts_df['tag']).count()
# NOTE(review): on a DataFrame, .items() iterates (column name, Series) pairs,
# not (tag, count) pairs. The following cell rebinds counts_by_tag to a
# tag -> count dict, which is what the embedding loop iterates over.
counts_by_tag.items()

In [None]:
# Map each tag to the number of posts carrying it (tag -> count dict).
counts_by_tag = posts_df['tag'].value_counts().to_dict()

In [None]:
import os

# Make sure the output folder exists before writing one pickle per tag.
os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)

for tag, count in counts_by_tag.items():
    # All posts carrying the current tag, and their ids in the same row order
    # as the vectors computed below.
    tag_posts = posts_df[posts_df['tag'] == tag]
    tag_post_ids = tag_posts['post_id'].values

    # One embedding row per post title.
    tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)

    # Dump post ids and vectors to '<tag>.pkl' inside the embeddings folder.
    # The context manager closes each file as soon as it is written; the bare
    # open(...) previously leaked one file handle per tag.
    filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))
    with open(filename, 'wb') as embeddings_file:
        pickle.dump((tag_post_ids, tag_vectors), embeddings_file)