# Importing the required libraries

In [185]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None 

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [186]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BOBBY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BOBBY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [187]:
from bs4 import BeautifulSoup
from tqdm import tqdm

tqdm.pandas()

In [188]:
from collections import Counter
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel

# Importing the datasets

In [189]:
#The whole dataset is in the data folder in 5 different files
df1 = pd.read_csv('data/original_data1.csv')
df2 = pd.read_csv('data/original_data2.csv')
df3 = pd.read_csv('data/original_data3.csv')
df4 = pd.read_csv('data/original_data4.csv')
df5 = pd.read_csv('data/original_data5.csv')

In [190]:
#All the different data from differnt datasets is concatenated to form a single dataset
df = pd.concat([df1,df2,df3,df4,df5], axis=0)

In [191]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [192]:
df.reset_index(drop=True, inplace=True)

# Understanding the dataset

For this model, we want to label the answers, so we will be needing only the 'id', 'tags' and 'answers' columns. Other columns are ignored. The required columns are copied into another dataframe, so the original dataframe can be used when required. 

In [399]:
data = df[['id','tags','answers']]

In [210]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
id         100000 non-null int64
tags       100000 non-null object
answers    100000 non-null object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [211]:
data.isnull().sum()

id         0
tags       0
answers    0
dtype: int64

In [212]:
min_ans_len = data['answers'].str.len().min()
max_ans_len = data['answers'].str.len().max()

In [213]:
data[data['answers'].str.len()==min_ans_len]

Unnamed: 0,id,tags,answers
43053,4350260,python|inheritance|decorator,"<p>No, it doesn't.</p>"


In [214]:
data[data['answers'].str.len()==max_ans_len]

Unnamed: 0,id,tags,answers
87503,64587303,python|numpy|image-processing|convolution,<p>I've implemented several very fast solution...


# Text Processing

In [400]:
data['tags'] = data['tags'].str.replace('|',' ')
data['tags'] = data['tags'].str.replace('python-3.x','python')
data['tags'] = data['tags'].str.replace('python-2.7','python')

In [369]:
#def parser(text):
#    soup = BeautifulSoup(text, 'html.parser')
#    while(soup.code):
#        soup.code.decompose()
#    return soup.text

In [370]:
#removing html tags
data['answers'] = data['answers'].apply(lambda text: BeautifulSoup(text, "lxml").text)

In [371]:
#converting all letters to lowercase
data['answers'] = data['answers'].str.lower()
data['tags'] = data['tags'].str.lower()

In [372]:
#removing all punctuations
data['answers'] = data['answers'].str.replace('[^\w\s]',' ')
data['tags'] = data['tags'].str.replace('[^\w\s]',' ')

In [373]:
data['ans_tokenized'] = data['answers']

In [391]:
stem_word_list = [('np', 'numpy'), ('plt', 'plot'), ('plt', 'plot'), ('correctly', 'correct'), ('containing', 'contain'), ('better', 'good'),
                  ('best', 'good'), ('strings', 'string'), ('arrays', 'array'), ('variables', 'variable'), ('containing', 'contain'), 
                  ('calls', 'call'), ('called', 'call'), ('calling', 'call'), ('returns', 'return'), ('results', 'result'), ('values', 'value'), 
                  ('val', 'value'), ('images', 'image'), ('img', 'image'), ('png', 'image'), ('tuples', 'tuple'), ('arguments', 'argument'), 
                  ('args', 'argument'), ('argv', 'argument'), ('parameter', 'parameter'), ('params', 'parameter'), ('param', 'parameter'), 
                  ('prints', 'print'), ('lists', 'list'), ('runs', 'run'), ('models', 'model'), ('headers', 'header'), ('installation', 'install'), 
                  ('installed','install'), ('tf','tensorflow'), ('found', 'find'), ('works', 'work'), ('lines', 'line'), ('pd', 'pandas'),
                 ('df', 'dataframe'), ('lib', 'library')]

In [392]:
for (word, repl_word) in stem_word_list:
    data['ans_tokenized'] = data['ans_tokenized'].str.replace(word, repl_word)

AttributeError: Can only use .str accessor with string values!

In [None]:
#tokenizing text
data['ans_tokenized'] = data['answers'].apply(lambda text: [word for word in nltk.word_tokenize(text) if word.isalpha()])

In [393]:
for i in range(0,data.shape[0]):
    data['tags'][i] = data['tags'][i].split(' ')

AttributeError: 'list' object has no attribute 'split'

In [394]:
#data['ans_tokenized'] = data['ans_tokenized'] + data['tags']

In [379]:
#lemmatizer = WordNetLemmatizer()
#def lemmatize_text(text):
#    return [lemmatizer.lemmatize(word) for word in text]

In [380]:
#data['ans_tokenized'].apply(lemmatize_text)

In [381]:
#function for filtering stop words
def filter_stop_words(words):
    words_filtered = []
    for word in words:
        if word not in stop_words:
            words_filtered.append(word)
    return words_filtered

In [382]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

data['ans_tokenized'] = data['ans_tokenized'].apply(filter_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BOBBY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [383]:
data

Unnamed: 0,id,tags,answers,ans_tokenized
0,62766758,"[python, pandas, shell, numpy, google, cloud, ...",container optimized os cos has several limit...,"[container, optimized, os, cos, several, limit..."
1,62766758,"[python, pandas, shell, numpy, google, cloud, ...",i m pretty sure that in case of python librari...,"[pretty, sure, case, python, libraries, native..."
2,62742938,"[python, modulenotfounderror]",the standard library has a module runpy for th...,"[standard, library, module, runpy, purpose, ma..."
3,62741826,"[python, pandas, performance, data, science]",hello and welcome to stackoverflow \nin pandas...,"[hello, welcome, stackoverflow, pandas, rule, ..."
4,62741826,"[python, pandas, performance, data, science]",i guess you want to groupby and exclude all th...,"[guess, want, groupby, exclude, elements, appe..."
...,...,...,...,...
99995,35898687,"[python, apache, spark, pyspark, apache, spark...",just for fun non udf solution \nfrom pyspark s...,"[fun, non, udf, solution, pyspark, sql, functi..."
99996,35898687,"[python, apache, spark, pyspark, apache, spark...",the following code does the requested task an...,"[following, code, requested, task, user, defin..."
99997,35937882,"[python, python, user, interface, tkinter]",at the end of your code just add window mainl...,"[end, code, add, window, mainloop, tells, tkin..."
99998,35937882,"[python, python, user, interface, tkinter]",you didn t do a \nwindow mainloop \r\n\nat t...,"[window, mainloop, end, check, tkinter, docs, ..."


In [388]:
count = Counter()
def count_tag(answer):
    for word in answer:
        count[word] += 1

In [389]:
data['ans_tokenized'].apply(count_tag)
len(count.values())

111816

In [341]:
word_list = []
for word, cou in count.items():
    if(cou>1000):
        word_list.append(word)

In [342]:
word_list

['container',
 'os',
 'several',
 'could',
 'reason',
 'program',
 'work',
 'system',
 'running',
 'many',
 'linux',
 'quite',
 'different',
 'support',
 'common',
 'features',
 'may',
 'include',
 'package',
 'manager',
 'install',
 'packages',
 'directly',
 'instance',
 'non',
 'third',
 'modules',
 'root',
 'always',
 'read',
 'build',
 'time',
 'default',
 'see',
 'details',
 'additional',
 'information',
 'pretty',
 'sure',
 'case',
 'python',
 'libraries',
 'message',
 'map',
 'object',
 'means',
 'file',
 'l',
 'e',
 'g',
 'r',
 'instead',
 'x',
 'way',
 'think',
 'pip',
 'flag',
 'setup',
 'check',
 'using',
 'command',
 'inside',
 'set',
 'remove',
 'another',
 'question',
 'standard',
 'library',
 'module',
 'make',
 'script',
 'containing',
 'import',
 'true',
 'equivalent',
 'sys',
 'path',
 'either',
 'via',
 'directory',
 'hello',
 'pandas',
 'raw',
 'loops',
 'functions',
 'apply',
 'function',
 'sub',
 'dataframe',
 'rows',
 'use',
 'groupby',
 'bit',
 'side',
 'example

Some of the words are not useful, which are therefore added to the list of 'stop_words' and again the answer text is filtered.

In [384]:
stop_words.update(['several', 'could', 'many', 'may', 'non', 'quite', 'v', 'l', 'e', 'g', 'rw', 'r', 'xr', 'x', 'rwxr', 'think',
                   'sane', 'hello', 'welcome', 'still', 'normal', 'much', 'c', 'p', 'n', 'k', 'b', 'h', 'lot', 'us', 'kind', 
                   'q', 'w', 'z', 'en', 'ax', 'j', 'python', 'code', 'using', 'need', 'get', 'py', 'u', 'keep', 'general', 
                   'also', 'really', 'normal', 'able', 'maybe', 'things', 'everything', 'happens', 'actually', 'lst', 'already',
                  'within', 'li', 'python', 'based', 'user', 'np', 'pd', 'tf', 'id'])

In [385]:
data['ans_tokenized'] = data['ans_tokenized'].apply(filter_stop_words)

In [386]:
answers = data['ans_tokenized']

In [387]:
answers

0        [container, optimized, os, cos, limitations, r...
1        [pretty, sure, case, libraries, natives, messa...
2        [standard, library, module, runpy, purpose, ma...
3        [stackoverflow, pandas, rule, thumb, raw, loop...
4        [guess, want, groupby, exclude, elements, appe...
                               ...                        
99995    [fun, udf, solution, pyspark, sql, functions, ...
99996    [following, requested, task, defined, function...
99997    [end, add, window, mainloop, tells, tkinter, f...
99998    [window, mainloop, end, check, tkinter, docs, ...
99999    [whenever, faced, complex, xml, consider, xslt...
Name: ans_tokenized, Length: 100000, dtype: object

# Building the model

In [390]:
#Create dictionary and corpus
id2word = corpora.Dictionary(answers)
corpus = [id2word.doc2bow(answer) for answer in answers]

In [None]:
#Building an lda model
ldamodel = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, passes=5)

In [None]:
pprint(ldamodel.show_topics(formatted=False))

In [None]:
ldamodel.log_perplexity(corpus)

In [None]:
coherence_lda = CoherenceModel(model=ldamodel, texts=answers, dictionary=id2word, coherence='c_v')
score = coherence_lda.get_coherence()

In [None]:
print("Coherence score of lda model: ",score)

In [348]:
#Building LDA Mallet model
import os
from gensim.models.wrappers import LdaMallet
os.environ['MALLET_HOME'] = 'C:\\Users\\BOBBY\\Downloads\\mallet-2.0.8'
mallet_path = 'C:\\Users\\BOBBY\\Downloads\\mallet-2.0.8\\bin\\mallet'

In [395]:
ldamallet_model = LdaMallet(mallet_path, corpus=corpus, num_topics=8, id2word=id2word)

In [396]:
pprint(ldamallet_model.show_topics(formatted=False))

[(0,
  [('def', 0.04859744939563604),
   ('return', 0.04838610253492947),
   ('function', 0.037508784103898116),
   ('class', 0.02836627115449984),
   ('object', 0.024669462315974125),
   ('method', 0.01960066010669494),
   ('call', 0.015262765790692636),
   ('test', 0.013760441855836783),
   ('type', 0.013707605140660143),
   ('variable', 0.011470850864848966)]),
 (1,
  [('print', 0.057633383173792484),
   ('list', 0.05469433754151231),
   ('string', 0.027473910831821848),
   ('key', 0.023729881838719016),
   ('append', 0.020684646912032447),
   ('input', 0.017896321568587153),
   ('number', 0.0178860451852575),
   ('len', 0.017397916977099078),
   ('split', 0.016043147108140095),
   ('item', 0.0160106052275962)]),
 (2,
  [('time', 0.02124062315766648),
   ('problem', 0.00985119027451193),
   ('process', 0.008411205453646042),
   ('work', 0.008101185192212562),
   ('run', 0.007711542240684145),
   ('question', 0.007005102628565233),
   ('answer', 0.0069898557304619476),
   ('start', 0

In [397]:
coherence_ldamallet = CoherenceModel(model=ldamallet_model, texts=answers, dictionary=id2word, coherence='c_v')
score = coherence_ldamallet.get_coherence()

In [398]:
print("Coherence score of lda mallet model: ",score)

Coherence score of lda mallet model:  0.5901525301797624


By comparing the scores, it is clearly evident that LDA Mallet model works better that LDA multicore model. We can use LDA mallet model and find optimal number of topics.

# Finding optimal number of topics

In [309]:
def coh_value(id2word, corpus, answers, start, stop, step):
    coh_values_list = []
    model_list = []
    for num_topics in range(start, stop, step):
        model = LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=answers, dictionary=id2word, coherence='c_v')
        coh_values_list.append(coherencemodel.get_coherence())
        print(num_topics)
    return model_list, coh_values_list

In [310]:
start = 2
stop = 20
step = 2

In [311]:
model_list, coh_values_list = coh_value(id2word, corpus, answers, start, stop, step)

2
4
6
8
10
12
14
16
18


In [None]:
topic_num = range(start, stop, step)
plt.plot(topic_num, coh_values_list)
plt.title("Coherence value score for the number of topics")
plt.xlabel("No. of Topics")
plt.ylabel("Coherence score")
#plt.legend(("coherence_values"), loc='best')
plt.show()

In [315]:
for i in range(start, stop, step):
    print("No. of topics:  ", i, ", Coherence Value: ", coh_values_list[(i-start)//2])

No. of topics:   2 , Coherence Value:  0.4095438810466185
No. of topics:   4 , Coherence Value:  0.5342842390038446
No. of topics:   6 , Coherence Value:  0.5682253615521279
No. of topics:   8 , Coherence Value:  0.5837369851009455
No. of topics:   10 , Coherence Value:  0.6068811678520329
No. of topics:   12 , Coherence Value:  0.6149607929334713
No. of topics:   14 , Coherence Value:  0.6138146710755172
No. of topics:   16 , Coherence Value:  0.6174509538255538
No. of topics:   18 , Coherence Value:  0.6161918846495307


In [402]:
final_model = model_list[3]
model_topics = final_model.show_topics(formatted=False)
pprint(final_model.print_topics(num_words=10))

[(0,
  '0.049*"def" + 0.048*"return" + 0.038*"function" + 0.028*"class" + '
  '0.025*"object" + 0.020*"method" + 0.015*"call" + 0.014*"test" + '
  '0.014*"type" + 0.011*"variable"'),
 (1,
  '0.058*"print" + 0.055*"list" + 0.027*"string" + 0.024*"key" + '
  '0.021*"append" + 0.018*"input" + 0.018*"number" + 0.017*"len" + '
  '0.016*"split" + 0.016*"item"'),
 (2,
  '0.021*"time" + 0.010*"problem" + 0.008*"process" + 0.008*"work" + '
  '0.008*"run" + 0.007*"question" + 0.007*"answer" + 0.007*"start" + '
  '0.007*"server" + 0.007*"message"'),
 (3,
  '0.046*"file" + 0.035*"line" + 0.023*"path" + 0.020*"open" + 0.019*"import" '
  '+ 0.017*"os" + 0.014*"write" + 0.013*"read" + 0.013*"install" + '
  '0.013*"csv"'),
 (4,
  '0.025*"text" + 0.023*"import" + 0.023*"def" + 0.012*"root" + 0.010*"add" + '
  '0.010*"match" + 0.010*"set" + 0.009*"end" + 0.009*"event" + 0.008*"true"'),
 (5,
  '0.064*"df" + 0.035*"true" + 0.029*"data" + 0.029*"index" + 0.027*"false" + '
  '0.024*"pd" + 0.020*"values" + 0

In [403]:
topics = final_model[corpus]

In [404]:
topics

[[(0, 0.06276371308016879),
  (1, 0.040260196905766536),
  (2, 0.324367088607595),
  (3, 0.314521800281294),
  (4, 0.05854430379746836),
  (5, 0.039556962025316465),
  (6, 0.07401547116736991),
  (7, 0.08597046413502112)],
 [(0, 0.11531007751937984),
  (1, 0.10109819121447028),
  (2, 0.1605297157622739),
  (3, 0.23417312661498707),
  (4, 0.14114987080103358),
  (5, 0.08301033591731266),
  (6, 0.07396640826873385),
  (7, 0.09076227390180878)],
 [(0, 0.10555555555555556),
  (1, 0.08925925925925926),
  (2, 0.1411111111111111),
  (3, 0.27296296296296296),
  (4, 0.11),
  (5, 0.10703703703703704),
  (6, 0.0862962962962963),
  (7, 0.08777777777777777)],
 [(0, 0.1787749287749288),
  (1, 0.12559354226020894),
  (2, 0.1103988603988604),
  (3, 0.0600664767331434),
  (4, 0.09710351377018044),
  (5, 0.2851377018043685),
  (6, 0.08000949667616335),
  (7, 0.06291547958214624)],
 [(0, 0.09702093397745572),
  (1, 0.12600644122383253),
  (2, 0.10668276972624799),
  (3, 0.09057971014492754),
  (4, 0.0970

In [440]:
topic_df = pd.DataFrame()

In [None]:
for i, topic_vector in enumerate(topics):
    print(i)
    topic_keyword_list = []
    topic_vector = sorted(topic_vector, key = lambda topic: (topic[1]), reverse=True)
    topic_num = topic_vector[0][0]
    prob = topic_vector[0][1]
    prob = round(prob,2) * 100
    main_topic = final_model.show_topic(topic_num)
    topic_keywords = ", ".join([word for word, prop in main_topic])
    print(topic_keywords)
    break
    topic_df = topic_df.append(pd.Series([df.iloc[i]['id'], df.iloc[i]['answers'], topic_keyword_list]), ignore_index=True)          

In [None]:
topic_df

In [None]:
topic_df.to_csv('data/keyword_answer.csv', index=False)