## Load Required Libraries

In [1]:
import pandas as pd
import os
import json
import re
import numpy as np
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
import gensim
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

## Data Preparation

### 01-Read Raw Data

In [2]:
# Data directory. Override it with the STACKOVERFLOW_DATA_DIR environment
# variable; the default is the original author's folder, so existing runs
# behave exactly as before.
# os.path.join(..., '') guarantees a trailing separator because later cells
# build file names with plain string concatenation (path + 'file').
path = os.path.join(
    os.environ.get('STACKOVERFLOW_DATA_DIR',
                   '/Users/jun/Desktop/Text_Mining/Submission/'),
    '')
data = pd.read_csv(path + 'StackOverflow.csv')
len(data)

159207

In [3]:
# Preview the first five raw rows
data.head(n=5)

Unnamed: 0,Question,Answer,Class
0,Should I use nested classes in this case?,I would be a bit reluctant to use nested class...,c++
1,How do I connect to a database and loop over a...,Very roughly and from memory since I don't hav...,c#
2,"How to get the value of built, encoded ViewState?","Rex, I suspect a good place to start looking i...",c#
3,How do I delete a file which is locked by anot...,"You can use this program, Handle, to find whic...",c#
4,.NET Unit Testing packages?,"I like MbUnit, er, Gallio. Most importantly t...",c#


### 02-Data Preprocessing

In [4]:
def pre_process(questions):
    """Clean, tokenize and filter a collection of question strings.

    Each question is lower-cased, every character outside ``a-z ( ) + #``
    is replaced by a space, the text is tokenized with
    ``nltk.word_tokenize`` and tokens are kept only when they are not
    English stop words and their stripped length is strictly between
    3 and 15 characters.

    Parameters
    ----------
    questions : iterable of str
        Raw question texts (a list or a pandas Series).

    Returns
    -------
    pandas.Series
        One list of kept tokens per input question, in input order.
    """
    # A set gives O(1) membership tests; the stop-word list is probed once
    # per token, which adds up over the whole corpus.
    stop_words = set(stopwords.words("english"))

    # The original pattern '[^a-z(c++)(c#)]' is a character class, not an
    # alternation, so it never matched "c++"/"c#" as words: it simply kept
    # the characters a-z, '(', ')', '+' and '#'. The pattern below is the
    # exact equivalent, written without the misleading grouping.
    # Note that the length filter further down (> 3 characters) drops short
    # tokens such as "c#" and "c++" regardless.
    cleaned = [re.sub(r'[^a-z()+#]', ' ', x.lower()) for x in questions]

    # Tokenization
    tokenized = [nltk.word_tokenize(text) for text in cleaned]

    # Remove stop words and tokens that are too short or too long
    kept = [[t for t in tokens if (t not in stop_words) and (3 < len(t.strip()) < 15)]
            for tokens in tokenized]

    return pd.Series(kept)

In [5]:
# Run the preprocessing pipeline over every question title in the raw data
questions = data.loc[:, 'Question']
questions_pp = pre_process(questions)

In [6]:
# Combine the raw columns with the token lists produced by pre_process
token_columns = {
    'Question': data['Question'].tolist(),
    'Question_Tokens': questions_pp,
    'Answer': data['Answer'].tolist(),
    'Class': data['Class'].tolist(),
}
data_tokens = pd.DataFrame(token_columns)
data_tokens.head()

Unnamed: 0,Answer,Class,Question,Question_Tokens
0,I would be a bit reluctant to use nested class...,c++,Should I use nested classes in this case?,"[nested, classes, case]"
1,Very roughly and from memory since I don't hav...,c#,How do I connect to a database and loop over a...,"[connect, database, loop, recordset]"
2,"Rex, I suspect a good place to start looking i...",c#,"How to get the value of built, encoded ViewState?","[value, built, encoded, viewstate]"
3,"You can use this program, Handle, to find whic...",c#,How do I delete a file which is locked by anot...,"[delete, file, locked, another, process]"
4,"I like MbUnit, er, Gallio. Most importantly t...",c#,.NET Unit Testing packages?,"[unit, testing, packages]"


### 03-Example

In [7]:
# Question text plus its character count, used by the worked example below
data_example = data_tokens[['Question']].copy()
length = data_example['Question'].map(len)
data_example['Question_Length'] = length
data_example.head()

Unnamed: 0,Question,Question_Length
0,Should I use nested classes in this case?,41
1,How do I connect to a database and loop over a...,63
2,"How to get the value of built, encoded ViewState?",49
3,How do I delete a file which is locked by anot...,64
4,.NET Unit Testing packages?,27


In [8]:
# Step 0: the untouched question text (row with index label 1)
raw_title = 'Raw Data'
example = data_example.loc[1, 'Question']
raw_result = example
raw_result

'How do I connect to a database and loop over a recordset in C#?'

In [9]:
# Step 1: lower-case the text and blank out every character that the regex
# character class does not keep (it keeps a-z, '(', ')', '+' and '#')
re_title = 'Remove non-English Words'
re_result = list(map(lambda x: re.sub('[^a-z(c++)(c#)]', ' ', x.lower()),
                     pd.Series(example)))
re_result

['how do i connect to a database and loop over a recordset in c# ']

In [10]:
# Step 2: tokenization - split each cleaned sentence into word tokens
tk_title = 'Tokenlization'
tk_result = list(map(nltk.word_tokenize, re_result))
print(tk_result)

[['how', 'do', 'i', 'connect', 'to', 'a', 'database', 'and', 'loop', 'over', 'a', 'recordset', 'in', 'c', '#']]


In [11]:
# Step 3: drop English stop words and any token whose stripped length is
# not strictly between 3 and 15 characters
rs_title = 'Removing Stop Words'
stop_words = stopwords.words("english")
rs_result = [
    [t for t in tokens
     if not (t in stop_words or len(t.strip()) <= 3 or len(t.strip()) >= 15)]
    for tokens in tk_result
]
rs_result

[['connect', 'database', 'loop', 'recordset']]

In [12]:
# Summarise the output of every preprocessing stage in one table.
# The dict is named `steps` (not `data`) so the raw `data` DataFrame loaded at
# the top of the notebook is not overwritten; rebinding it made the earlier
# cells that read data['Question'] fail when re-run.
steps = {'Step': [raw_title, re_title, tk_title, rs_title],
         'Results': [raw_result, re_result, tk_result, rs_result]}
cols = ['Step', 'Results']
# `DataFrame.ix` was removed in pandas 1.0; fixing the column order at
# construction time gives the same frame on every pandas version.
df = pd.DataFrame(steps, columns=cols)
pd.set_option('display.max_colwidth', 100)
df

Unnamed: 0,Step,Results
0,Raw Data,How do I connect to a database and loop over a recordset in C#?
1,Remove non-English Words,[how do i connect to a database and loop over a recordset in c# ]
2,Tokenlization,"[[how, do, i, connect, to, a, database, and, loop, over, a, recordset, in, c, #]]"
3,Removing Stop Words,"[[connect, database, loop, recordset]]"


### 04-Train Word2Vec

In [13]:
def train_model(train_data, min_count=2, **word2vec_kwargs):
    """Train a gensim Word2Vec model on tokenized sentences.

    Parameters
    ----------
    train_data : iterable of list of str
        Tokenized sentences, one token list per sentence.
    min_count : int, optional
        Passed to ``Word2Vec`` as ``min_count``. Defaults to 2, the value
        that was previously hard-coded.
    **word2vec_kwargs
        Any further keyword arguments are forwarded unchanged to
        ``gensim.models.Word2Vec`` (for example ``seed=...`` and
        ``workers=1`` when a repeatable run is wanted).

    Returns
    -------
    The trained ``gensim.models.Word2Vec`` model.
    """
    return gensim.models.Word2Vec(train_data, min_count=min_count, **word2vec_kwargs)

In [14]:
dict_language = {'0': 'python', '1': 'c++', '2': 'c#', '3': 'java', '4': 'ios', '5': 'android', '6': 'html', 
                 '7': 'jquery', '8': 'php', '9': 'javascript'}

# Results are collected by row position and written back as whole columns at
# the end. This avoids chained-indexing assignment
# (data_tokens['col'][i] = ...), which triggers SettingWithCopyWarning and is
# not guaranteed to modify the frame.
# Rows whose Class is not listed in dict_language keep the value None.
n_rows = len(data_tokens)
question_vectors_col = [None] * n_rows
average_pooling_col = [None] * n_rows
class_values = data_tokens['Class'].values
tokens_col = list(data_tokens['Question_Tokens'])

for value in dict_language.values():
    # Positions of the rows belonging to this language (computed once,
    # instead of scanning the whole frame again inside the loop)
    row_positions = np.flatnonzero(class_values == value)
    questions_data = [tokens_col[i] for i in row_positions]

    # Train model and save it in gensim's native format
    model_name = 'word2vec_model_' + value
    trained_model = train_model(questions_data)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Save the Word2Vec model as a pickle; the chat cell reloads this file.
    # `with` closes the handle even if pickling fails.
    word2vec_pickle_path = path + 'stackoverflow_word2vec_' + value + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

    # Look tokens up on the model's KeyedVectors (`.wv`). Indexing the
    # Word2Vec object itself is deprecated in gensim 3 and removed in
    # gensim 4. Reloading the file that was just written is unnecessary.
    word_vectors = trained_model.wv

    # Calculate the vectors for each question of this language
    for i in row_positions:
        # Tokens dropped from the vocabulary by min_count are skipped
        question_vectors = [word_vectors[token] for token in tokens_col[i]
                            if token in word_vectors]
        # Vectors for each token
        question_vectors_col[i] = question_vectors
        # Average pooling over all token vectors ([] when no token is known)
        average_pooling_col[i] = list(pd.DataFrame(question_vectors).mean())

data_tokens['Question_Vectors'] = pd.Series(question_vectors_col,
                                            index=data_tokens.index, dtype=object)
data_tokens['Average_Pooling'] = pd.Series(average_pooling_col,
                                           index=data_tokens.index, dtype=object)

Saved word2vec_model_python model successfully
Saved word2vec_model_c++ model successfully
Saved word2vec_model_c# model successfully
Saved word2vec_model_java model successfully
Saved word2vec_model_ios model successfully
Saved word2vec_model_android model successfully
Saved word2vec_model_html model successfully
Saved word2vec_model_jquery model successfully
Saved word2vec_model_php model successfully
Saved word2vec_model_javascript model successfully


In [15]:
# Collapse each token list into one space-separated string, then record the
# character length of that string
data_tokens['Question_Tokens'] = data_tokens['Question_Tokens'].apply(" ".join)
length = data_tokens['Question_Tokens'].map(len)
data_tokens = data_tokens.assign(Question_Length=length)
data_tokens.head()

Unnamed: 0,Answer,Class,Question,Question_Tokens,Question_Vectors,Average_Pooling,Question_Length
0,I would be a bit reluctant to use nested classes here. What if you created an abstract base cla...,c++,Should I use nested classes in this case?,nested classes case,"[[-0.016859842, -0.08772416, 0.08563595, 0.0010534023, 0.036454305, -0.13445053, -0.15765251, 0....","[-0.029333777104814846, -0.1482729290922483, 0.14302914589643478, 0.0034336356135706105, 0.06050...",19
1,Very roughly and from memory since I don't have code on this laptop: using (OleDBConnection conn...,c#,How do I connect to a database and loop over a recordset in C#?,connect database loop recordset,"[[-0.03547492, -0.038787197, 0.19807386, -0.06702984, -0.029457415, -0.124708235, -0.21098334, 0...","[-0.03985609917435795, -0.047758323344169185, 0.2415241110138595, -0.08139146049506962, -0.03629...",31
2,"Rex, I suspect a good place to start looking is solutions that compress the ViewState -- they're...",c#,"How to get the value of built, encoded ViewState?",value built encoded viewstate,"[[-0.025093978, -0.06378416, 0.41903126, -0.10726863, -0.08921247, -0.21602507, -0.41209874, 0.4...","[-0.02130800811573863, -0.03576449351385236, 0.19821946881711483, -0.05877491505816579, -0.03670...",29
3,"You can use this program, Handle, to find which process has the lock on your file. It's a comman...",c#,How do I delete a file which is locked by another process in C#?,delete file locked another process,"[[-0.046049472, -0.064302094, 0.29406315, -0.0937515, -0.04822064, -0.17352177, -0.2984529, 0.34...","[-0.05904793739318848, -0.07592802681028843, 0.3135317489504814, -0.10978017188608646, -0.054653...",34
4,"I like MbUnit, er, Gallio. Most importantly to me is having good tools support inside Visual St...",c#,.NET Unit Testing packages?,unit testing packages,"[[-0.07865145, -0.090446755, 0.3422031, -0.12995343, -0.029050283, -0.22178338, -0.3473875, 0.40...","[-0.046735359355807304, -0.05224954647322496, 0.2244433437784513, -0.08121263297895591, -0.02939...",21


In [16]:
# Export the enriched table as a JSON list of records.
# Going through DataFrame.to_json first turns the nested vector columns into
# plain JSON values before the file is written.
records_text = data_tokens.to_json(orient='records')
data_json = json.loads(records_text)

export_path = path + 'StackOverflow_Word2Vec.json'
with open(export_path, 'w') as outfile:
    outfile.write(json.dumps(data_json))

## Word2Vec

In [17]:
# Reload the exported table from disk so the chatbot section can run without
# repeating the training cells.
stackoverflow_path = path + 'StackOverflow_Word2Vec.json'

try:
    with open(stackoverflow_path) as file:
        reader = json.load(file)

    # data_tokens is only rebound after the whole file has parsed, so a
    # failure part-way through cannot leave a half-built frame behind.
    data_tokens = pd.DataFrame({
        'Class': [row['Class'] for row in reader],
        'Question': [row['Question'] for row in reader],
        # Tokens were stored as one space-separated string per question
        'Question_Tokens': [row['Question_Tokens'].split() for row in reader],
        'Answer': [row['Answer'] for row in reader],
        'Question_Length': [row['Question_Length'] for row in reader],
        'Question_Vectors': [row['Question_Vectors'] for row in reader],
        'Average_Pooling': [row['Average_Pooling'] for row in reader],
    })
except (OSError, ValueError, KeyError, AttributeError, TypeError) as error:
    # Best-effort fallback, as before: keep whatever data_tokens is already in
    # memory, but say so instead of hiding the failure. In that case
    # Question_Tokens holds space-joined strings rather than token lists.
    # json.JSONDecodeError is a subclass of ValueError.
    print('Could not reload %s (%s: %s); keeping the in-memory data_tokens.'
          % (stackoverflow_path, type(error).__name__, error))

data_tokens.head()

Unnamed: 0,Answer,Average_Pooling,Class,Question,Question_Length,Question_Tokens,Question_Vectors
0,I would be a bit reluctant to use nested classes here. What if you created an abstract base cla...,"[-0.0293337771, -0.1482729291, 0.1430291459, 0.0034336356, 0.0605062035, -0.2105257114, -0.25269...",c++,Should I use nested classes in this case?,19,"[nested, classes, case]","[[-0.0168598425, -0.0877241567, 0.0856359527, 0.0010534023, 0.0364543051, -0.134450525, -0.15765..."
1,Very roughly and from memory since I don't have code on this laptop: using (OleDBConnection conn...,"[-0.0398560992, -0.0477583233, 0.241524111, -0.0813914605, -0.0362919552, -0.1510911557, -0.2470...",c#,How do I connect to a database and loop over a recordset in C#?,31,"[connect, database, loop, recordset]","[[-0.0354749188, -0.0387871973, 0.198073864, -0.0670298412, -0.0294574145, -0.1247082353, -0.210..."
2,"Rex, I suspect a good place to start looking is solutions that compress the ViewState -- they're...","[-0.0213080081, -0.0357644935, 0.1982194688, -0.0587749151, -0.0367041468, -0.1091237972, -0.198...",c#,"How to get the value of built, encoded ViewState?",29,"[value, built, encoded, viewstate]","[[-0.0250939783, -0.0637841597, 0.4190312624, -0.1072686315, -0.0892124698, -0.2160250694, -0.41..."
3,"You can use this program, Handle, to find which process has the lock on your file. It's a comman...","[-0.0590479374, -0.0759280268, 0.313531749, -0.1097801719, -0.0546533419, -0.2038205348, -0.3234...",c#,How do I delete a file which is locked by another process in C#?,34,"[delete, file, locked, another, process]","[[-0.0460494719, -0.0643020943, 0.2940631509, -0.0937514976, -0.0482206382, -0.173521772, -0.298..."
4,"I like MbUnit, er, Gallio. Most importantly to me is having good tools support inside Visual St...","[-0.0467353594, -0.0522495465, 0.2244433438, -0.081212633, -0.0293975668, -0.144603507, -0.22993...",c#,.NET Unit Testing packages?,21,"[unit, testing, packages]","[[-0.0786514506, -0.0904467553, 0.3422031105, -0.1299534291, -0.0290502831, -0.2217833847, -0.34..."


In [18]:
# Greeting function
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day", "hey", "i need help")
GREETING_RESPONSES = ["Good day, How may i of help?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]


def greeting(sentence):
    """Return a random greeting reply if `sentence` is a greeting, else None.

    A sentence counts as a greeting when either

    * the whole sentence (lower-cased, punctuation stripped from each word,
      whitespace collapsed) equals an entry of GREETING_INPUTS, which is what
      lets multi-word entries such as "good day" or "i need help" match; or
    * any single word of the sentence equals an entry of GREETING_INPUTS
      (the original behaviour).

    Parameters
    ----------
    sentence : str
        The user's input.

    Returns
    -------
    str or None
        One element of GREETING_RESPONSES, or None when nothing matches.
    """
    # Strip surrounding punctuation so "Hello!" or "hi," still match
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    words = [word for word in words if word]
    if not words:
        return None

    if " ".join(words) in GREETING_INPUTS or any(word in GREETING_INPUTS for word in words):
        return random.choice(GREETING_RESPONSES)
    return None

In [23]:
def Talk_To_Javris(data_language, model, sentence=None, min_score=0.9):
    """Return the stored answer whose question is most similar to the user's.

    The user's sentence is preprocessed with `pre_process`, the vectors of
    its known tokens are averaged, and that average is compared by cosine
    similarity with every entry of ``data_language['Average_Pooling']``.

    Parameters
    ----------
    data_language : pandas.DataFrame
        Rows for the selected language. Must provide the columns
        'Average_Pooling' (a sequence of floats per row) and 'Answer'.
    model : gensim Word2Vec model or KeyedVectors
        Source of word vectors. When the object has a ``wv`` attribute that
        attribute is used for lookups, otherwise the object itself is.
    sentence : str, optional
        The user's question. When omitted, the notebook-level variable
        `sentence` is used, so existing two-argument calls keep working.
    min_score : float, optional
        Minimum cosine similarity a stored question must reach to be used
        as the reply. Defaults to 0.9, the previously hard-coded value.

    Returns
    -------
    (str, float)
        The reply text and its similarity score. When no token is known or
        no stored question reaches `min_score`, the reply is an apology and
        the score is the sentinel 999.
    """
    not_understood = "Apology, I do not understand. Can you rephrase?"

    if sentence is None:
        # Backward compatibility with callers that rely on the global
        sentence = globals().get('sentence', '')

    # pre_process returns one token list per input sentence; only one
    # sentence is passed, so take the first (and only) list.
    tokens = pre_process(pd.Series([sentence])).iloc[0]

    # Look up each token separately so a single out-of-vocabulary word no
    # longer discards the whole question.
    word_vectors = getattr(model, 'wv', model)
    token_vectors = [word_vectors[token] for token in tokens if token in word_vectors]
    if not token_vectors:
        return not_understood, 999

    # Average pooling of the token vectors
    question_ap = np.mean(np.asarray(token_vectors, dtype=float), axis=0)
    question_norm = np.linalg.norm(question_ap)
    if question_norm == 0:
        return not_understood, 999

    # Track the best-scoring stored question that reaches min_score
    best_position = None
    best_score = None
    for position, pooled in enumerate(data_language['Average_Pooling']):
        # Skip rows without a usable vector (None, empty, wrong dimension)
        if not isinstance(pooled, (list, tuple, np.ndarray)) or len(pooled) != len(question_ap):
            continue
        candidate = np.asarray(pooled, dtype=float)
        denominator = question_norm * np.linalg.norm(candidate)
        if denominator == 0:
            continue
        score = float(np.dot(question_ap, candidate) / denominator)
        # NaN scores compare False here and are therefore ignored
        if score >= min_score and (best_score is None or score > best_score):
            best_position = position
            best_score = score

    # Nothing similar enough: reply politely instead of raising IndexError
    if best_position is None:
        return not_understood, 999

    # Select the answer by column name; relying on column position returned
    # the question text whenever 'Answer' was not the first column.
    reply = str(data_language['Answer'].iloc[best_position])
    return reply, best_score

In [24]:
flag_language = True
flag_query = True
dict_language = {'0': 'python', '1': 'c++', '2': 'c#', '3': 'java', '4': 'ios', '5': 'android', '6': 'html', 
                 '7': 'jquery', '8': 'php', '9': 'javascript'}

separator = '......................................................................................'


def _jarvis_say(text, name='Jarvis'):
    """Print one bot line, with `name` rendered in bold white-on-black."""
    print('\x1b[1;37;40m' + name + '\x1b[0m' + ': ' + text)


def _ask_user():
    """Prompt the user between two separator lines and return the raw input."""
    print(separator)
    answer = input('\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':')
    print(separator)
    return answer


print(separator)
_jarvis_say('My name is Jarvis, a Programming Language Apprentice Bot.')
_jarvis_say('I will try my best to answer your query.')
_jarvis_say('If you want to exit, you can type < bye >.')

# ---- Stage 1: choose the language whose questions and model are used ----
while flag_language:
    print(separator)
    _jarvis_say('Please select which language you want to enquire, ' +
                'you can type:')
    _jarvis_say('< 0 > for python     < 1 > for c++      < 2 > for c#')
    _jarvis_say('< 3 > for java       < 4 > for ios      < 5 > for android')
    _jarvis_say('< 6 > for html       < 7 > for jquery   < 8 > for php')
    _jarvis_say('< 9 > for javascript')
    # `sentence` stays a notebook-level name because Talk_To_Javris may read it
    sentence = _ask_user()
    choice = sentence.strip().lower()

    if choice == 'bye':
        flag_language = False
        flag_query = False
    elif choice in dict_language:
        language = dict_language[choice]
        # 'Answer' is deliberately the first column: code that reads the reply
        # by column position (iloc[:, 0]) then gets the answer text.
        data_language = (
            data_tokens.loc[data_tokens['Class'] == language,
                            ['Answer', 'Average_Pooling', 'Class', 'Question',
                             'Question_Tokens', 'Question_Vectors']]
            .reset_index(drop=True)
        )

        # Read word2vec model.
        # NOTE: this load unpickles the file, so only load model files written
        # by this notebook; never load one from an untrusted source.
        word2vec_pickle_path = path + 'stackoverflow_word2vec_' + language + '.bin'
        model = gensim.models.KeyedVectors.load(word2vec_pickle_path)

        flag_language = False
        flag_query = True
    else:
        # Previously an invalid choice re-printed the menu with no explanation
        _jarvis_say('Sorry, I did not recognise that choice. Please type a number from 0 to 9.')

# ---- Stage 2: answer questions until the user says bye ----
if flag_query:
    # Only announce the start when a language was actually selected.
    # The original literal 'Let''s start!' was two adjacent string literals
    # and printed "Lets start!".
    print(separator)
    _jarvis_say("Let's start! Please input your question now.")

while flag_query:
    sentence = _ask_user()

    if sentence.strip().lower() == 'bye':
        flag_query = False
        continue

    # Call greeting() once: it picks a random reply on every call
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        _jarvis_say(greeting_reply, name='JARVIS')
    else:
        # `score` is kept for tracing; print it here when debugging
        reply, score = Talk_To_Javris(data_language, model)
        _jarvis_say(reply, name='JARVIS')

_jarvis_say('Bye! Hope that i am of help.', name='JARVIS')

......................................................................................
[1;37;40mJarvis[0m: My name is Jarvis, a Programming Language Apprentice Bot.
[1;37;40mJarvis[0m: I will try my best to answer your query.
[1;37;40mJarvis[0m: If you want to exit, you can type < bye >.
......................................................................................
[1;37;40mJarvis[0m: Please select which language you want to enquire, you can type:
[1;37;40mJarvis[0m: < 0 > for python     < 1 > for c++      < 2 > for c#
[1;37;40mJarvis[0m: < 3 > for java       < 4 > for ios      < 5 > for android
[1;37;40mJarvis[0m: < 6 > for html       < 7 > for jquery   < 8 > for php
[1;37;40mJarvis[0m: < 9 > for javascript
......................................................................................
USER  :2
......................................................................................
...........................................................................