In [1]:
# Load IPython's autoreload extension and set it to mode 2, so imported
# modules are reloaded before each cell runs and edits are picked up live.
%load_ext autoreload
%autoreload 2

## Initialization

In [None]:
import html
import json
import os
import pickle
import re
import string
import unicodedata
import warnings

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize
from openpyxl import load_workbook
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [14]:
def convert(o):
    """Convert a NumPy value into the equivalent built-in Python object.

    The function has the shape of a ``json.dumps(..., default=convert)`` hook
    (return a plain value or raise ``TypeError``); no caller is visible in this
    part of the notebook.

    Args:
        o: Object to convert.

    Returns:
        ``bool`` for NumPy booleans, ``int`` for any NumPy integer (not only
        ``np.int64``), ``float`` for any NumPy float, and a nested ``list``
        for ``np.ndarray``.

    Raises:
        TypeError: If ``o`` is none of the supported NumPy types.
    """
    if isinstance(o, np.bool_):
        return bool(o)
    # np.integer covers int8..int64 and the unsigned variants.
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f'Cannot convert object of type {type(o).__name__}')

def preprocess_text(text):
    """Clean raw text and return its stemmed, stop-word-free tokens.

    Steps, in order: unescape HTML entities, fold to ASCII (NFKD, dropping
    anything that has no ASCII form), lowercase, remove fenced code blocks,
    ``<code>`` elements, anchor elements and any remaining tags, drop
    punctuation, turn underscores into spaces, drop digits, discard tokens
    found in the module-level ``stop_words_set``, then lemmatize every token
    as a verb with WordNet and stem it with the module-level ``stemmer``.

    ``stop_words_set`` and ``stemmer`` are created in later cells and must
    exist before this function is called.

    Args:
        text: Raw document text. ``None`` and NaN (pandas' marker for an
            empty cell) yield an empty list; any other non-string value is
            converted with ``str()``.

    Returns:
        list[str]: The processed tokens, in document order.
    """
    # Missing values have no tokens; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text)
    # Unescape before the ASCII fold: otherwise entities such as "&eacute;"
    # are turned into non-ASCII characters *after* the filter and leak through.
    text = html.unescape(text)
    # remove non ascii
    text = unicodedata.normalize('NFKD', text).encode(
        'ascii', 'ignore').decode('ascii')
    text = text.lower()
    # removing fenced code segments (``` ... ```), which may span lines
    text = re.sub(r"```[^\S\r\n]*[a-z]*\n.*?\n```", '', text, flags=re.DOTALL)
    # removing <code>...</code>, which may span lines
    text = re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
    # removing whole anchor elements; \b keeps <abbr>, <article>, ... from
    # being mistaken for the start of an anchor
    text = re.sub(r'<a\b[^>]*>.*?</a>', '', text)
    text = re.sub(r'<.*?>', '', text)  # removing html markup
    text = re.sub(r'[^\w\s]', '', text)  # removing punctuation
    text = re.sub('_', ' ', text)  # replacing underscore with space
    text = re.sub(r'[\d]', '', text)  # removing digits
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    # remove stopwords, then lemmatize (as verb) and stem what is left
    return [
        stemmer.stem(lemmatizer.lemmatize(word, pos='v'))
        for word in text.split()
        if word not in stop_words_set
    ]

def create_dir(parent_dir, dir_name):
    """Create ``dir_name`` inside ``parent_dir`` (with parents) and return its path.

    An already existing directory is not an error, so re-running a cell does
    not print a spurious "File exists" message. Any other ``OSError`` (for
    example a permission problem, or a regular file occupying the path) is
    printed rather than raised, and the path is still returned.

    Args:
        parent_dir: Directory under which the new directory is created.
        dir_name: Name of the directory to create.

    Returns:
        str: ``os.path.join(parent_dir, dir_name)``.
    """
    target = os.path.join(parent_dir, dir_name)
    try:
        os.makedirs(target, exist_ok=True)
    except OSError as error:
        # Best effort: report the failure but let the notebook continue.
        print(error)
    return target

In [15]:
# Shared Porter stemmer used by preprocess_text().
stemmer = PorterStemmer()
pyLDAvis.enable_notebook()
# Respect a MALLET_HOME that is already set (non-empty) in the environment and
# fall back to the original install location otherwise, so the notebook is
# not tied to one user's home directory.
MALLET_HOME = os.environ.get('MALLET_HOME') or '/home/ajoy.das/bin/Mallet'
os.environ['MALLET_HOME'] = MALLET_HOME
mallet_path = os.path.join(MALLET_HOME, 'bin', 'mallet')
# Root folder for all generated artefacts; results go into a sub-folder.
TOPIC_DIR = '/tmp/issues'
RES_DIR = create_dir(TOPIC_DIR, 'Topic Modeling Results')
# Stop-word files are looked up relative to the kernel's working directory.
CURR_DIR = os.getcwd()
coherence_scores = []

In [16]:
# stop words set
# Combines NLTK's English stop words with the words listed (one per line) in
# the optional files below. Words containing an apostrophe are stored both
# with and without it.
STOP_WORDS_FILES = [
    os.path.join(CURR_DIR, 'mallet_stop_words.txt'),
    os.path.join(CURR_DIR, 'custom_stop_words.txt'),
]
stop_words_set = set()
stop_words = set(stopwords.words('english'))
for word in stop_words:
    stop_words_set.add(word)
    if "'" in word:
        stop_words_set.add(word.strip().replace("'", ''))
for swfile in STOP_WORDS_FILES:
    try:
        with open(swfile, 'r') as f:
            file_words = [line.strip() for line in f]
    except (OSError, UnicodeError) as error:
        # Best effort: a missing or unreadable file is reported (with its
        # name and the real cause) and skipped; other errors are not hidden.
        print(f'Could not read stop words file {swfile}: {error}')
        continue
    for word in file_words:
        if not word:
            # Blank lines carry no stop word.
            continue
        stop_words_set.add(word)
        if "'" in word:
            stop_words_set.add(word.replace("'", ''))

## Preprocessing

In [17]:
# Print the kernel's working directory (shell escape).
!pwd

/home/ajoy.das/projects/bias_study_updated/replica_package_generate/replication_package/challenges


In [18]:
# Load the issue/discussion metadata; the path is relative to the working directory.
file_issue_dis = 'issue_discussions_metadata_2540_discussions.csv'
df_issue_dis = pd.read_csv(file_issue_dis)
# `df` is an alias of `df_issue_dis` (same object), not a copy.
df = df_issue_dis
# Keep the untouched text next to its token list.
df['raw'] = df['text']
# Assign the whole column at once. Writing through the chained form
# df['preprocessed'].iloc[i] = ... raises SettingWithCopyWarning and has no
# effect at all under pandas Copy-on-Write.
df['preprocessed'] = df['raw'].apply(preprocess_text)
df.head()

Unnamed: 0,repo_url,type,id,text,created_at,url,raw,preprocessed
0,https://github.com/Trusted-AI/AIF360,issue,1106204177,Debiasing: # of layers and predicted probabili...,2022-01-17 19:41:27,https://github.com/Trusted-AI/AIF360/issues/287,Debiasing: # of layers and predicted probabili...,"[debias, layer, predict, probabl, question, ad..."
1,https://github.com/Trusted-AI/AIF360,issue,1098675380,Query regarding debiased model saving in Adver...,2022-01-11 05:40:35,https://github.com/Trusted-AI/AIF360/issues/286,Query regarding debiased model saving in Adver...,"[queri, debias, model, save, adversari, debias..."
2,https://github.com/Trusted-AI/AIF360,issue,1098612148,"Query regarding COMPAS dataset Hi, \r\nI am wo...",2022-01-11 03:32:35,https://github.com/Trusted-AI/AIF360/issues/285,"Query regarding COMPAS dataset Hi, \r\nI am wo...","[queri, compa, dataset, work, project, relat, ..."
3,https://github.com/Trusted-AI/AIF360,issue,1097184512,Pytorch Why no pytorch?,2022-01-09 12:35:09,https://github.com/Trusted-AI/AIF360/issues/284,Pytorch Why no pytorch?,"[pytorch, pytorch]"
4,https://github.com/Trusted-AI/AIF360,issue,1092070135,ValueError: at least one array or dtype is req...,2022-01-02 18:03:59,https://github.com/Trusted-AI/AIF360/issues/283,ValueError: at least one array or dtype is req...,"[valueerror, array, dtype, requir, face, value..."


## Finding the optimal number of topics

In [None]:
# Build the gensim dictionary and the bag-of-words corpus from the token lists.
data = df['preprocessed']
dictionary = corpora.Dictionary(data)
corpus = [dictionary.doc2bow(doc) for doc in data]

# Train one MALLET LDA model per candidate topic count (5..50) and record its
# c_v coherence as [num_topics, score].
# NOTE(review): MALLET documents --alpha as the total over all topics, so
# 50/num_topics may end up smaller per topic than intended - confirm.
coherence_scores = []
for num_topics in tqdm(range(5, 51)):
    ldamallet = LdaMallet(
        mallet_path,
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        alpha=50 / num_topics,
    )
    coherenceModel = CoherenceModel(
        model=ldamallet,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    )
    score = coherenceModel.get_coherence()
    coherence_scores.append([num_topics, score])

# save scores as csv
ch_df = pd.DataFrame(coherence_scores, columns=['Num Topic', 'Score'])
ch_df.to_csv(f'{RES_DIR}/TopicModeling_Coherence_Scores-code-5-50-filtered-issues.csv')

# plot coherence against the number of topics and keep a copy on disk
x = [pair[0] for pair in coherence_scores]
y = [pair[1] for pair in coherence_scores]
plt.plot(x, y, c='r')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.gca().set_aspect('auto')
plt.grid()
plt.savefig(f'{RES_DIR}/scores-code-5-50-filtered-issues.png', dpi=500)
plt.show()

In [29]:
# !ls '/tmp/issues/Topic Modeling Results/10 Topics'
# !cat '/tmp/issues/Topic Modeling Results/TopicModeling_Coherence_Scores-code-5-50-filtered-issues.csv'

## Saving found topics

In [None]:
# Rebuild the dictionary and bag-of-words corpus so this cell can run on its own.
data = df['preprocessed']
dictionary = gensim.corpora.Dictionary(data)
corpus = [dictionary.doc2bow(doc) for doc in data]

# Number of highest-probability terms exported per topic.
TOP_N_TERMS = 30
# Topic counts for which a model is trained (or loaded) and exported.
NUM_TOPICS = [10, 20, 25, 33]
NUM_TOPIS = NUM_TOPICS  # backward-compatible alias for the original, misspelled name

for num_topics in NUM_TOPICS:
    # create folder for topic number
    topic_dir = create_dir(RES_DIR, f'{num_topics} Topics')
    model_path = os.path.join(topic_dir, 'ldamallet.pkl')
    if os.path.isfile(model_path):
        # A cached model is reused as-is, even if the corpus changed since it
        # was trained; delete the pickle to force retraining.
        # NOTE: pickle.load can execute arbitrary code - only load files this
        # notebook wrote itself.
        with open(model_path, 'rb') as model_file:
            ldamallet = pickle.load(model_file)
    else:
        # NOTE(review): MALLET documents --alpha as the total over all topics;
        # confirm that 50 / num_topics is the intended value.
        ldamallet = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary, alpha=50 / num_topics)
        # save the model as pickle
        with open(model_path, 'wb') as model_file:
            pickle.dump(ldamallet, model_file)

    # (word, probability) pairs of the top terms of every topic
    topics = [list(ldamallet.show_topic(n, topn=TOP_N_TERMS))
              for n in range(ldamallet.num_topics)]

    # term-topic matrix: one column per topic, one row per term rank
    topics_df = pd.DataFrame(
        [[f'{word} {round(word_prob, 4)}' for word, word_prob in topic] for topic in topics],
        columns=[f'Term {i}' for i in range(1, TOP_N_TERMS + 1)],
        index=[f'Topic {n}' for n in range(ldamallet.num_topics)]).T
    topics_df.to_csv(os.path.join(topic_dir, 'term x topic.csv'))

    # topic words: one CSV per topic
    topic_words_dir = create_dir(topic_dir, 'TopicWords')
    for n in range(num_topics):
        topic_words_df = pd.DataFrame(
            [[word_prob, word] for word, word_prob in topics[n]], columns=['Prob', 'Word'])
        topic_words_df.to_csv(os.path.join(topic_words_dir, f'{n}.csv'))

    # post to dominant topic
    corpus_topic_df = df[['repo_url', 'id', 'type', 'url', 'text']].copy()
    topic_model_results = ldamallet[corpus]
    # Highest-weighted (topic, weight) pair per document; on ties max() keeps
    # the first pair, exactly like a stable descending sort would.
    corpus_topics = [max(doc, key=lambda pair: pair[1])
                     for doc in topic_model_results]
    corpus_topic_df['Dominant Topic'] = [item[0] for item in corpus_topics]
    corpus_topic_df['Correlation'] = [item[1] for item in corpus_topics]
    corpus_topic_df.to_csv(os.path.join(topic_dir, 'postToTopic.csv'))

    # posts grouped by dominant topic: a CSV per topic plus one workbook with
    # a sheet per topic
    topic_to_post_dir = create_dir(topic_dir, 'TopicToPost')
    topic_comments_xlsx_path = os.path.join(topic_to_post_dir, 'Comments.xlsx')
    # Open the workbook once and rewrite it from scratch. The context manager
    # saves and closes it (ExcelWriter.save() no longer exists in pandas 2.x),
    # and re-running the cell cannot collide with sheets from an earlier run.
    with pd.ExcelWriter(topic_comments_xlsx_path, engine='openpyxl') as writer:
        for i in range(num_topics):
            temp = create_dir(topic_to_post_dir, str(i))
            temp_q_df = corpus_topic_df.loc[corpus_topic_df['Dominant Topic'] == i]
            topic_comments_df = temp_q_df.drop(columns=['Dominant Topic'])
            topic_comments_df.to_csv(os.path.join(temp, 'Comments.csv'), index=False)
            topic_comments_df.to_excel(
                writer, sheet_name='topic_' + str(i), index=False)

    # post count under any topic
    topic_post_cnt_df = corpus_topic_df.groupby('Dominant Topic').agg(
        Document_Count=('Dominant Topic', np.size),
        Percentage=('Dominant Topic', np.size)).reset_index()
    topic_post_cnt_df['Percentage'] = topic_post_cnt_df['Percentage'].apply(
        lambda x: round((x * 100) / len(corpus), 2))
    topic_post_cnt_df.to_csv(os.path.join(topic_dir, 'postPerTopic.csv'))

    # pyLDAvis: interactive HTML view of the converted model
    vis = pyLDAvis.gensim.prepare(
        gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet), corpus, dictionary)
    pyLDAvis.save_html(vis, os.path.join(topic_dir, f'pyLDAvis-{num_topics}.html'))
