# Importing the required libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None 

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BOBBY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BOBBY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from bs4 import BeautifulSoup
from tqdm import tqdm

tqdm.pandas()

  from pandas import Panel


In [4]:
from collections import Counter
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel

In [5]:
import os
from gensim.models.wrappers import LdaMallet
os.environ['MALLET_HOME'] = 'C:\\Users\\BOBBY\\Downloads\\mallet-2.0.8'
mallet_path = 'C:\\Users\\BOBBY\\Downloads\\mallet-2.0.8\\bin\\mallet'

# Importing the datasets

In [6]:
#The whole dataset is in the data folder in 5 different files
df1 = pd.read_csv('data/original_data1.csv')
df2 = pd.read_csv('data/original_data2.csv')
df3 = pd.read_csv('data/original_data3.csv')
df4 = pd.read_csv('data/original_data4.csv')
df5 = pd.read_csv('data/original_data5.csv')

In [7]:
#All the different data from differnt datasets is concatenated to form a single dataset
df = pd.concat([df1,df2,df3,df4,df5], axis=0)

In [8]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [9]:
df.reset_index(drop=True, inplace=True)

# Understanding the dataset

For this model, we want to label the answers, so we will be needing only the 'id', 'tags' and 'answers' columns. Other columns are ignored. The required columns are copied into another dataframe, so the original dataframe can be used when required. 

In [10]:
data = df[['id','tags','answers']]

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
id         100000 non-null int64
tags       100000 non-null object
answers    100000 non-null object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [12]:
data.isnull().sum()

id         0
tags       0
answers    0
dtype: int64

In [13]:
min_ans_len = data['answers'].str.len().min()
max_ans_len = data['answers'].str.len().max()

In [14]:
data[data['answers'].str.len()==min_ans_len]

Unnamed: 0,id,tags,answers
43053,4350260,python|inheritance|decorator,"<p>No, it doesn't.</p>"


In [15]:
data[data['answers'].str.len()==max_ans_len]

Unnamed: 0,id,tags,answers
87503,64587303,python|numpy|image-processing|convolution,<p>I've implemented several very fast solution...


# Text Processing

In [16]:
data['tags'] = data['tags'].str.replace('|',' ')
data['tags'] = data['tags'].str.replace('python-3.x','python')
data['tags'] = data['tags'].str.replace('python-2.7','python')

In [17]:
#def parser(text):
#    soup = BeautifulSoup(text, 'html.parser')
#    while(soup.code):
#        soup.code.decompose()
#    return soup.text

In [18]:
#removing html tags
data['answers'] = data['answers'].apply(lambda text: BeautifulSoup(text, "lxml").text)

In [19]:
#converting all letters to lowercase
data['answers'] = data['answers'].str.lower()
data['tags'] = data['tags'].str.lower()

In [20]:
#removing all punctuations
data['answers'] = data['answers'].str.replace('[^\w\s]',' ')
data['tags'] = data['tags'].str.replace('[^\w\s]',' ')

In [21]:
stem_word_list = [('np ', 'numpy '), ('plt', 'plot'), ('correctly', 'correct'), ('containing', 'contain'), ('better', 'good'),
                  ('best', 'good'), ('strings', 'string'), ('arrays', 'array'), ('variables', 'variable'), ('calls', 'call'), 
                  ('called', 'call'), ('calling', 'call'), ('returns', 'return'), ('results', 'result'), ('values', 'value'), 
                  ('val ', 'value '), ('images', 'image'), ('img', 'image'), ('png', 'image'), ('tuples', 'tuple'), ('arguments', 'argument'), 
                  ('args', 'argument'), ('argv', 'argument'), ('parameters', 'parameter'), ('params', 'parameter'), ('param', 'parameter'), 
                  ('prints', 'print'), ('lists', 'list'), ('runs', 'run'), ('models', 'model'), ('headers', 'header'), ('installation', 'install'), 
                  ('installed','install'), ('tf','tensorflow'), ('found', 'find'), ('lines', 'line'), ('pd', 'pandas'), 
                  ('df', 'dataframe'), ('lib ', 'library '), ('https', 'http'), ('works', 'work')]

In [22]:
for (word, repl_word) in stem_word_list:
    data['answers'] = data['answers'].str.replace(word, repl_word)

In [23]:
#tokenizing text
data['ans_tokenized'] = data['answers'].apply(lambda text: [word for word in nltk.word_tokenize(text) if word.isalpha()])

In [24]:
for i in range(0,data.shape[0]):
    data['tags'][i] = data['tags'][i].split(' ')

In [25]:
#data['ans_tokenized'] = data['ans_tokenized'] + data['tags']

In [26]:
#lemmatizer = WordNetLemmatizer()
#def lemmatize_text(text):
#    return [lemmatizer.lemmatize(word) for word in text]

In [27]:
#data['ans_tokenized'].apply(lemmatize_text)

In [28]:
#function for filtering stop words
def filter_stop_words(words):
    words_filtered = []
    for word in words:
        if word not in stop_words:
            words_filtered.append(word)
    return words_filtered

In [29]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

data['ans_tokenized'] = data['ans_tokenized'].apply(filter_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BOBBY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
data

Unnamed: 0,id,tags,answers,ans_tokenized
0,62766758,"[python, pandas, shell, numpy, google, cloud, ...",container optimized os cos has several limit...,"[container, optimized, os, cos, several, limit..."
1,62766758,"[python, pandas, shell, numpy, google, cloud, ...",i m pretty sure that in case of python librari...,"[pretty, sure, case, python, libraries, native..."
2,62742938,"[python, modulenotfounderror]",the standard library has a module runpy for th...,"[standard, library, module, runpy, purpose, ma..."
3,62741826,"[python, pandas, performance, data, science]",hello and welcome to stackoverflow \nin pandas...,"[hello, welcome, stackoverflow, pandas, rule, ..."
4,62741826,"[python, pandas, performance, data, science]",i guess you want to groupby and exclude all th...,"[guess, want, groupby, exclude, elements, appe..."
...,...,...,...,...
99995,35898687,"[python, apache, spark, pyspark, apache, spark...",just for fun non udataframe solution \nfrom py...,"[fun, non, udataframe, solution, pyspark, sql,..."
99996,35898687,"[python, apache, spark, pyspark, apache, spark...",the following code does the requested task an...,"[following, code, requested, task, user, defin..."
99997,35937882,"[python, python, user, interface, tkinter]",at the end of your code just add window mainl...,"[end, code, add, window, mainloop, tells, tkin..."
99998,35937882,"[python, python, user, interface, tkinter]",you didn t do a \nwindow mainloop \r\n\nat t...,"[window, mainloop, end, check, tkinter, docs, ..."


In [31]:
count = Counter()
def count_tag(answer):
    for word in answer:
        count[word] += 1

In [32]:
data['ans_tokenized'].apply(count_tag)
len(count.values())

111579

In [33]:
word_list = []
for word, cou in count.items():
    if(cou>1000):
        word_list.append(word)

Some of the words are not useful, which are therefore added to the list of 'stop_words' and again the answer text is filtered.

In [34]:
stop_words.update(['several', 'could', 'many', 'may', 'non', 'quite', 'v', 'l', 'e', 'g', 'rw', 'r', 'xr', 'x', 'rwxr', 'think',
                   'sane', 'hello', 'welcome', 'still', 'normal', 'much', 'c', 'p', 'n', 'k', 'b', 'h', 'lot', 'us', 'kind', 
                   'q', 'w', 'z', 'en', 'ax', 'j', 'python', 'code', 'using', 'need', 'get', 'py', 'u', 'keep', 'general', 
                   'also', 'really', 'normal', 'able', 'maybe', 'things', 'everything', 'happens', 'actually', 'lst', 'already',
                  'within', 'li', 'python', 'based', 'user', 'np', 'pd', 'tf', 'id', 'work', 'write', 'df', 'np', 'pd', 'make',
                  'said'])

In [35]:
data['ans_tokenized'] = data['ans_tokenized'].apply(filter_stop_words)

In [36]:
answers = data['ans_tokenized']

In [37]:
answers

0        [container, optimized, os, cos, limitations, r...
1        [pretty, sure, case, libraries, natives, messa...
2        [standard, library, module, runpy, purpose, sc...
3        [stackoverflow, pandas, rule, thumb, raw, loop...
4        [guess, want, groupby, exclude, elements, appe...
                               ...                        
99995    [fun, udataframe, solution, pyspark, sql, func...
99996    [following, requested, task, defined, function...
99997    [end, add, window, mainloop, tells, tkinter, f...
99998    [window, mainloop, end, check, tkinter, docs, ...
99999    [whenever, faced, complex, xml, consider, xslt...
Name: ans_tokenized, Length: 100000, dtype: object

# Building the model

In [38]:
#Create dictionary and corpus
id2word = corpora.Dictionary(answers)
corpus = [id2word.doc2bow(answer) for answer in answers]

In [41]:
#Building an lda model
ldamodel = LdaMulticore(corpus=corpus, num_topics=8, id2word=id2word, passes=5)

In [42]:
pprint(ldamodel.show_topics(formatted=False))

[(0,
  [('image', 0.009205409),
   ('word', 0.007546723),
   ('use', 0.00724322),
   ('data', 0.006802537),
   ('print', 0.006562065),
   ('one', 0.0057311),
   ('number', 0.005531396),
   ('result', 0.0053765476),
   ('words', 0.0051837894),
   ('import', 0.005171616)]),
 (1,
  [('self', 0.0999506),
   ('def', 0.03219435),
   ('class', 0.0297118),
   ('return', 0.016687993),
   ('object', 0.011091377),
   ('none', 0.010847483),
   ('method', 0.008263498),
   ('print', 0.008164265),
   ('type', 0.00771539),
   ('import', 0.0074060527)]),
 (2,
  [('self', 0.017499989),
   ('import', 0.015122021),
   ('plt', 0.012790157),
   ('text', 0.012280679),
   ('def', 0.0104331635),
   ('root', 0.008935197),
   ('data', 0.008158805),
   ('tk', 0.0074657574),
   ('use', 0.0069882334),
   ('matplotlib', 0.0069820853)]),
 (3,
  [('print', 0.02017311),
   ('list', 0.018562352),
   ('use', 0.01204622),
   ('return', 0.011777863),
   ('string', 0.011535755),
   ('key', 0.011410094),
   ('function', 0.01

In [43]:
ldamodel.log_perplexity(corpus)

-7.989066439690989

In [44]:
coherence_lda = CoherenceModel(model=ldamodel, texts=answers, dictionary=id2word, coherence='c_v')
score = coherence_lda.get_coherence()

In [45]:
print("Coherence score of lda model: ",score)

Coherence score of lda model:  0.5019330426989694


In [46]:
ldamallet_model = LdaMallet(mallet_path, corpus=corpus, num_topics=8, id2word=id2word)

In [47]:
pprint(ldamallet_model.show_topics(formatted=False))

[(0,
  [('key', 0.022075654432381596),
   ('good', 0.01712811585471575),
   ('case', 0.012001940485927098),
   ('dict', 0.011210334313500563),
   ('set', 0.010418728141074028),
   ('answer', 0.010056200535560104),
   ('question', 0.009112227688352357),
   ('keys', 0.008143736065892457),
   ('problem', 0.007522009979141528),
   ('order', 0.007490485839531621)]),
 (1,
  [('file', 0.048101612643450475),
   ('path', 0.024207238845620538),
   ('open', 0.020702780055871402),
   ('install', 0.019708646738335594),
   ('os', 0.01783515899653627),
   ('line', 0.01534334624904428),
   ('run', 0.01382530282189463),
   ('version', 0.012877451316162164),
   ('library', 0.012518304456568222),
   ('read', 0.012434997195322204)]),
 (2,
  [('import', 0.02876977793520725),
   ('image', 0.02782039585731673),
   ('text', 0.023238237238430822),
   ('def', 0.02179871864310442),
   ('csv', 0.01385562390465815),
   ('row', 0.013563189511902286),
   ('root', 0.011911553011971275),
   ('event', 0.009539127515811

In [48]:
coherence_ldamallet = CoherenceModel(model=ldamallet_model, texts=answers, dictionary=id2word, coherence='c_v')
score = coherence_ldamallet.get_coherence()

In [49]:
print("Coherence score of lda mallet model: ",score)

Coherence score of lda mallet model:  0.5616661274153613


By comparing the scores, it is clearly evident that LDA Mallet model works better that LDA multicore model. We can use LDA mallet model and find optimal number of topics.

# Finding optimal number of topics

In [309]:
def coh_value(id2word, corpus, answers, start, stop, step):
    coh_values_list = []
    model_list = []
    for num_topics in range(start, stop, step):
        model = LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=answers, dictionary=id2word, coherence='c_v')
        coh_values_list.append(coherencemodel.get_coherence())
    return model_list, coh_values_list

In [310]:
start = 2
stop = 20
step = 2

In [311]:
model_list, coh_values_list = coh_value(id2word, corpus, answers, start, stop, step)

2
4
6
8
10
12
14
16
18


In [None]:
topic_num = range(start, stop, step)
plt.plot(topic_num, coh_values_list)
plt.title("Coherence value score for the number of topics")
plt.xlabel("No. of Topics")
plt.ylabel("Coherence score")
#plt.legend(("coherence_values"), loc='best')
plt.show()

In [315]:
for i in range(start, stop, step):
    print("No. of topics:  ", i, ", Coherence Value: ", coh_values_list[(i-start)//2])

No. of topics:   2 , Coherence Value:  0.4095438810466185
No. of topics:   4 , Coherence Value:  0.5342842390038446
No. of topics:   6 , Coherence Value:  0.5682253615521279
No. of topics:   8 , Coherence Value:  0.5837369851009455
No. of topics:   10 , Coherence Value:  0.6068811678520329
No. of topics:   12 , Coherence Value:  0.6149607929334713
No. of topics:   14 , Coherence Value:  0.6138146710755172
No. of topics:   16 , Coherence Value:  0.6174509538255538
No. of topics:   18 , Coherence Value:  0.6161918846495307


In [50]:
final_model = final_model[3]
model_topics = final_model.show_topics(formatted=False)
pprint(final_model.print_topics(num_words=10))

[(0,
  '0.022*"key" + 0.017*"good" + 0.012*"case" + 0.011*"dict" + 0.010*"set" + '
  '0.010*"answer" + 0.009*"question" + 0.008*"keys" + 0.008*"problem" + '
  '0.007*"order"'),
 (1,
  '0.048*"file" + 0.024*"path" + 0.021*"open" + 0.020*"install" + 0.018*"os" + '
  '0.015*"line" + 0.014*"run" + 0.013*"version" + 0.013*"library" + '
  '0.012*"read"'),
 (2,
  '0.029*"import" + 0.028*"image" + 0.023*"text" + 0.022*"def" + 0.014*"csv" + '
  '0.014*"row" + 0.012*"root" + 0.010*"event" + 0.009*"add" + 0.009*"label"'),
 (3,
  '0.032*"time" + 0.030*"import" + 0.026*"true" + 0.023*"false" + 0.022*"data" '
  '+ 0.022*"plot" + 0.017*"print" + 0.016*"start" + 0.010*"process" + '
  '0.010*"result"'),
 (4,
  '0.065*"print" + 0.061*"list" + 0.036*"string" + 0.030*"line" + '
  '0.020*"append" + 0.019*"input" + 0.018*"number" + 0.018*"len" + '
  '0.018*"result" + 0.017*"int"'),
 (5,
  '0.056*"return" + 0.048*"def" + 0.038*"function" + 0.034*"class" + '
  '0.027*"call" + 0.025*"object" + 0.025*"argument"

# Creating answer dataset with topics

In [70]:
topics = final_model[corpus]

In [71]:
topics

[[(0, 0.21886058032554848),
  (1, 0.4205590941259731),
  (2, 0.0589171974522293),
  (3, 0.09288747346072186),
  (4, 0.03980891719745223),
  (5, 0.060332625619249826),
  (6, 0.040516631280962494),
  (7, 0.0681174805378627)],
 [(0, 0.13727390180878551),
  (1, 0.23029715762273897),
  (2, 0.09980620155038757),
  (3, 0.16182170542635657),
  (4, 0.09463824289405683),
  (5, 0.12435400516795862),
  (6, 0.07655038759689921),
  (7, 0.07525839793281652)],
 [(0, 0.15653153153153154),
  (1, 0.27214714714714716),
  (2, 0.09496996996996997),
  (3, 0.11599099099099099),
  (4, 0.08596096096096097),
  (5, 0.09196696696696696),
  (6, 0.09196696696696696),
  (7, 0.09046546546546547)],
 [(0, 0.1579861111111111),
  (1, 0.06969246031746032),
  (2, 0.08060515873015874),
  (3, 0.06274801587301587),
  (4, 0.1212797619047619),
  (5, 0.16493055555555555),
  (6, 0.2839781746031746),
  (7, 0.058779761904761904)],
 [(0, 0.12295751633986929),
  (1, 0.09354575163398694),
  (2, 0.09191176470588237),
  (3, 0.10008169934

In [72]:
topic_df = pd.DataFrame()

In [77]:
for i, topic_vector in enumerate(topics):
    topic_list = []
    topic_vector = sorted(topic_vector, key = lambda topic: (topic[1]), reverse=True)
    for j, (topic_num, prob) in enumerate(topic_vector):
        prob = round(prob,2)*100
        if(prob > 14):
            topic = "Topic " + str(j)
            topic_list.append(topic)
        else:
            break
    topics_col = ", ".join([topic for topic in topic_list])
    if(i%1000==0): print(i)
    topic_df = topic_df.append(pd.Series([df.iloc[i]['id'], df.iloc[i]['answers'], topics_col]), ignore_index=True) 

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [78]:
topic_df

Unnamed: 0,0,1,2
0,62766758.0,<p>Container-Optimized OS (COS) has several li...,"Topic 0, Topic 1"
1,62766758.0,<p>Container-Optimized OS (COS) has several li...,"Topic 0, Topic 1"
2,62766758.0,<p>I'm pretty sure that in case of Python libr...,"Topic 0, Topic 1, Topic 2"
3,62742938.0,"<p>The standard library has a module <a href=""...","Topic 0, Topic 1"
4,62741826.0,<p>Hello and welcome to StackOverflow.</p>\r\n...,"Topic 0, Topic 1, Topic 2"
...,...,...,...
105110,35898687.0,<p>Just for fun non-UDF solution:</p>\r\n\r\n<...,"Topic 0, Topic 1"
105111,35898687.0,<p>The following code does the requested task....,"Topic 0, Topic 1, Topic 2"
105112,35937882.0,"<p>At the end of your code, just add <code>win...","Topic 0, Topic 1"
105113,35937882.0,<p>You didn't do a </p>\r\n\r\n<pre><code>wind...,"Topic 0, Topic 1"


In [79]:
topic_df.columns = ['id', 'answer', 'topic list']

In [80]:
topic_df.to_csv('data/keyword_answer.csv', index=False)

# Creating dataset for topics and its keywords

In [81]:
keyword_df = pd.DataFrame()

In [82]:
for topic_num in range(0,8):
    topic_keyword_list = final_model.show_topic(topic_num)
    keyword_list = []
    for word in topic_keyword_list:
        keyword_list.append(word[0])
    keyword_df = keyword_df.append(pd.Series(["Topic "+str(topic_num), keyword_list]), ignore_index=True)

In [83]:
keyword_df.columns = ['Topic', 'Keywords']

In [84]:
keyword_df

Unnamed: 0,Topic,Keywords
0,Topic 0,"[key, good, case, dict, set, answer, question,..."
1,Topic 1,"[file, path, open, install, os, line, run, ver..."
2,Topic 2,"[import, image, text, def, csv, row, root, eve..."
3,Topic 3,"[time, import, true, false, data, plot, print,..."
4,Topic 4,"[print, list, string, line, append, input, num..."
5,Topic 5,"[return, def, function, class, call, object, a..."
6,Topic 6,"[dataframe, numpy, pandas, array, index, data,..."
7,Topic 7,"[http, data, html, import, request, url, json,..."


In [85]:
keyword_df.to_csv('data/keywords.csv', index=False)