# Load cleaned data

In [1]:
import os, sys
path = os.path.join(os.path.abspath(""), 'internship-env', 'Lib', 'site-packages')
sys.path.append(path)

import glob
import json
import string
import math
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.model_selection import train_test_split
from collections import Counter
import random
random.seed(42)

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import pywsd
from pywsd.utils import lemmatize_sentence
from wordcloud import WordCloud, STOPWORDS

# LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# LDA evaluation
import tmtoolkit
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

#pyLDAvis
import pyLDAvis
import pyLDAvis.lda_model
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pyLDAvis.enable_notebook()

#NLTK packages
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.probability import MLEProbDist
from nltk.stem import WordNetLemmatizer

Warming up PyWSD (takes ~10 secs)... took 4.666340351104736 secs.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# English stop words from the NLTK stopwords corpus, collected into a set
# so that membership checks against it are constant-time.
stop_words = {stop_word for stop_word in stopwords.words("english")}

In [3]:
# ========================= Columns =========================
# Field names read from a JSON record of each source type. Every record is
# additionally expected to carry the ChatGPT-sharing and mention fields.
commit_columns = ['Type', 'URL', 'Author', 'RepoName', 'RepoLanguage', 'Sha', 'Message']
code_file_columns = ['Type', 'URL', 'RepoName']
repo_columns = ['Type', 'URL', 'RepoName', 'RepoLanguage']
issue_columns = ['Type', 'URL', 'Author', 'RepoName', 'RepoLanguage', 'Number', 'Title', 'Body', 'AuthorAt', 'ClosedAt', 'UpdatedAt', 'State']
pull_request_columns = ['Type', 'URL', 'Author', 'RepoName', 'RepoLanguage', 'Number', 'Title', 'Body', 'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt', 'State', 'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount', 'CommitSha']
hacker_news_columns = ['Type', 'ID', 'URL', 'AttachedURL', 'Title', 'CreatedAt']
discussion_columns = ['Type', 'URL', 'Author', 'RepoName', 'RepoLanguage', 'Number', 'Title', 'Body', 'AuthorAt', 'ClosedAt', 'UpdatedAt', 'Closed', 'UpvoteCount']
mention_columns = ['MentionedURL', 'MentionedProperty', 'MentionedAuthor', 'MentionedText', 'MentionedPath','MentionedAnswer', 'MentionedUpvoteCount']
gpt_sharing_columns = ['SharingURL', 'Status', 'DateOfConversation', 'DateOfAccess', 'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model', 'Conversations']

# ========================= Processing functions =========================
def _extract_fields(record, type_columns):
    """Return the values of `record` for the given type-specific columns,
    followed by the ChatGPT-sharing columns and then the mention columns.

    Raises:
        KeyError: if any expected field is absent; the message lists every
            missing field instead of only the first one encountered.
    """
    wanted = type_columns + gpt_sharing_columns + mention_columns
    missing = [col for col in wanted if col not in record]
    if missing:
        raise KeyError(f"'{record.get('Type', 'unknown')}' record is missing field(s): {missing}")
    return [record[col] for col in wanted]

def process_commit_json(commit):
    """Flatten a commit record into a list of values ordered by commit_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(commit, commit_columns)

def process_code_files_json(code_file):
    """Flatten a code-file record into a list of values ordered by code_file_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(code_file, code_file_columns)

def process_repo_json(repo):
    """Flatten a repository record into a list of values ordered by repo_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(repo, repo_columns)

def process_issue_json(issue):
    """Flatten an issue record into a list of values ordered by issue_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(issue, issue_columns)

def process_pull_request_json(pull_request):
    """Flatten a pull-request record into a list of values ordered by pull_request_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(pull_request, pull_request_columns)

def process_hacker_news_json(hacker_news):
    """Flatten a Hacker News record into a list of values ordered by hacker_news_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(hacker_news, hacker_news_columns)

def process_discussion_json(discussion):
    """Flatten a discussion record into a list of values ordered by discussion_columns + gpt_sharing_columns + mention_columns."""
    return _extract_fields(discussion, discussion_columns)

In [4]:
def read_json_data_from_files_to_dataframe(json_filepath):
    """Load a JSON file of source records into a DataFrame.

    The file is expected to contain a JSON array of objects, each with a
    'Type' field selecting which processing function and column list apply.

    Args:
        json_filepath: path of the UTF-8 encoded JSON file to read.

    Returns:
        A DataFrame with one row per record. Columns are the type-specific
        columns followed by the ChatGPT-sharing and mention columns. An
        empty input file yields an empty DataFrame. If a file mixes source
        types, the columns are the union over all records and fields a
        record does not have are left missing.

    Raises:
        ValueError: if a record has an unrecognised 'Type'.
    """
    # Maps the 'Type' value to (row extractor, type-specific column names).
    handlers = {
        'commit': (process_commit_json, commit_columns),
        'code file': (process_code_files_json, code_file_columns),
        'repository': (process_repo_json, repo_columns),
        'issue': (process_issue_json, issue_columns),
        'pull request': (process_pull_request_json, pull_request_columns),
        'hacker news': (process_hacker_news_json, hacker_news_columns),
        'discussion': (process_discussion_json, discussion_columns),
    }

    with open(json_filepath, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    records = []
    for source in json_data:
        source_type = source['Type']
        if source_type not in handlers:
            raise ValueError(f"Unexpected type of the source: '{source_type}'")
        process_source, type_columns = handlers[source_type]
        column_names = type_columns + gpt_sharing_columns + mention_columns
        # Label each row with its own columns so that rows are never
        # labelled with the column list of a different source type.
        records.append(dict(zip(column_names, process_source(source))))
    return pd.DataFrame(records)

In [5]:
# Directories holding the cleaned input data and the cached tokenised prompts.
cleaned_dir_name = "cleaned_datasets"
tokenized_dir_name = "tokenized_datasets"

# One entry per data source; the order is shared by both file-name lists below.
dataframe_names = [
    "commits",
    "issues",
    "discussions",
    "pull_requests",
    "code_files",
    "repository",
    "hacker_news",
]
cleaned_dataframe_file_names = [
    "{}/cleaned_{}.json".format(cleaned_dir_name, source_name)
    for source_name in dataframe_names
]
tokenized_dataframe_file_names = [
    "{}/tokenized_{}.json".format(tokenized_dir_name, source_name)
    for source_name in dataframe_names
]

# When True, previously tokenised prompts are loaded instead of recomputed.
read_tokenised_file = True

In [6]:
def lemmatize_tokenize_all_prompts(prompt):
    """Lemmatise a prompt and return only its plain, informative word tokens.

    A token from `lemmatize_sentence` is kept when it is ASCII, does not
    start with 'http', is not a stop word (compared in lower case), is not
    found in `string.punctuation`, and consists of letters only.
    """
    def _is_wanted(token):
        # Checks run in a fixed order and stop at the first failure.
        return (
            token.isascii()
            and not token.startswith('http')
            and token.lower() not in stop_words
            and token not in string.punctuation
            and token.isalpha()
        )

    return [token for token in lemmatize_sentence(prompt) if _is_wanted(token)]

# One DataFrame per entry of `dataframe_names`, each with a "Prompts" column
# holding the token list of one prompt per row.
lemmatised_dataframes = []

# Trust the cache only when every expected file is present; a partially
# written cache directory falls through to a full recomputation.
tokenized_cache_complete = all(os.path.isfile(path) for path in tokenized_dataframe_file_names)

if read_tokenised_file and tokenized_cache_complete:
    for df_path in tokenized_dataframe_file_names:
        with open(df_path, 'r', encoding='utf-8') as file:
            lemmatised_dataframes.append(pd.json_normalize(json.load(file)))
else:
    for filename in cleaned_dataframe_file_names:
        sharings_df = read_json_data_from_files_to_dataframe(filename)
        # Every conversation entry of every record contributes one prompt.
        conv_prompts = [
            conv["Prompt"]
            for conversations in sharings_df.get("Conversations", [])
            for conv in conversations
        ]
        prompts_df = pd.DataFrame(conv_prompts, columns=["Prompts"])
        prompts_df["Prompts"] = prompts_df["Prompts"].map(lemmatize_tokenize_all_prompts)
        lemmatised_dataframes.append(prompts_df)

    # Persist the token lists so later runs can skip the lemmatisation.
    os.makedirs(tokenized_dir_name, exist_ok=True)
    for tokenized_path, prompts_df in zip(tokenized_dataframe_file_names, lemmatised_dataframes):
        prompts_df.to_json(tokenized_path, orient='records')

assert len(lemmatised_dataframes) == len(dataframe_names), \
    "expected exactly one lemmatised DataFrame per data source"

In [7]:
# Directory where the generated figures are saved.
research_imgs_dir = "research_imgs"

# Idempotent creation: no separate existence check, safe to re-run the cell.
os.makedirs(research_imgs_dir, exist_ok=True)

# N-grams

In [10]:
def prompts_ngrams(prompts, n):
    """Count the n-grams of all prompts, skipping highly repetitive ones.

    Args:
        prompts: iterable of token sequences, one per prompt.
        n: size of the n-grams to build.

    Returns:
        An `nltk.FreqDist` over the n-gram tuples in which no single token
        occurs three or more times.
    """
    def _no_token_repeated_thrice(gram):
        return not any(occurrences >= 3 for occurrences in Counter(gram).values())

    kept_grams = (
        gram
        for tokens in prompts
        for gram in nltk.ngrams(tokens, n)
        if _no_token_repeated_thrice(gram)
    )
    return nltk.FreqDist(kept_grams)

def plot_dist_as_cloud(prompts_ngrams, n, axs, axs_i, axs_j, max_words, dataframe_name):
    """Draw a word cloud of n-gram probabilities on one cell of an axes grid.

    Args:
        prompts_ngrams: frequency distribution whose samples are n-gram tuples.
        n: n-gram size, used only in the subplot title.
        axs: 2-D grid of matplotlib axes.
        axs_i, axs_j: row and column of the axes to draw on.
        max_words: maximum number of entries shown in the cloud.
        dataframe_name: accepted for the caller's convenience; not used here.
    """
    mle_dist = MLEProbDist(prompts_ngrams)
    # Word-cloud labels are the n-gram tokens joined by single spaces.
    ngram_probabilities = {
        ' '.join(gram): mle_dist.prob(gram)
        for gram in prompts_ngrams
    }
    word_cloud = WordCloud(width=1000, height=500, max_words=max_words)
    word_cloud = word_cloud.generate_from_frequencies(ngram_probabilities)

    target_ax = axs[axs_i, axs_j]
    target_ax.imshow(word_cloud, interpolation='bilinear')
    target_ax.set_title(f"Wordcloud for {n}-grams with the set max_words argument to {max_words}.")

In [11]:
# n-gram sizes to report; the four sizes fill the 2x2 subplot grid below.
ns = range(1,5)

for idx, df in enumerate(lemmatised_dataframes):
    source_name = dataframe_names[idx]
    # The token lists do not depend on n, so extract them once per source.
    prompts = df["Prompts"].tolist()
    fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(30,15), layout="constrained")
    for n in ns:
        print(f"-------- {n}-grams {source_name} --------")
        # Map n = 1..4 onto the grid row by row.
        row, col = divmod(n-1, 2)
        all_ngrams_frequencies = prompts_ngrams(prompts, n)
        for word_freq_tuple in all_ngrams_frequencies.most_common(25):
            print(word_freq_tuple)
        plot_dist_as_cloud(all_ngrams_frequencies, n, axs, row, col, 50, source_name)
    fig.savefig(f'{research_imgs_dir}/{source_name}-ngrams.png')   # save the figure to file
    # Show before closing: a figure that is already closed cannot be displayed.
    plt.show()
    plt.close(fig)
    print("===========================================================================")

-------- 1-grams commits --------
(('file',), 5173)
(('task',), 3264)
(('need',), 2033)
(('set',), 1908)
(('use',), 1853)
(('solve',), 1654)
(('create',), 1470)
(('script',), 1372)
(('working',), 1207)
(('change',), 980)
(('work',), 946)
(('format',), 924)
(('write',), 899)
(('output',), 829)
(('result',), 797)
(('ai',), 796)
(('junior',), 789)
(('full',), 781)
(('project',), 776)
(('code',), 762)
(('start',), 752)
(('shell',), 687)
(('everything',), 682)
(('div',), 670)
(('new',), 670)
-------- 2-grams commits --------
(('solve', 'task'), 1649)
(('working', 'set'), 1170)
(('need', 'file'), 750)
(('output', 'format'), 702)
(('shell', 'script'), 671)
(('script', 'create'), 663)
(('change', 'file'), 660)
(('full', 'file'), 660)
(('create', 'change'), 658)
(('file', 'everything'), 657)
(('everything', 'solve'), 657)
(('encode', 'enclose'), 646)
(('enclose', 'result'), 646)
(('result', 'shell'), 646)
(('format', 'encode'), 645)
(('task', 'file'), 618)
(('file', 'small'), 606)
(('file', 'ne

# Pattern mining

In [12]:
from sequential.seq2pat import Seq2Pat

# Output directory for the mined patterns, one JSON file per data source.
dir_name = "seq2pat"

os.makedirs(dir_name, exist_ok=True)
dataframe_file_names = [f"{dir_name}/{name}.json" for name in dataframe_names]

for idx, df in enumerate(lemmatised_dataframes):
    if df.empty:
        continue
    # Prompts that lost all their tokens during filtering are dropped.
    sequences = [prompt for prompt in df["Prompts"] if prompt]
    if not sequences:
        continue
    # A pattern must occur in roughly 2% of the source's prompts, at least once.
    min_frequency = max(1, round(len(df) / 50))
    seq2pat = Seq2Pat(sequences=sequences,
                  max_span=10,
                  batch_size=10000,
                  discount_factor=0.2,
                  n_jobs=os.cpu_count())
    print(f"Minimum frequency for {dataframe_names[idx]} set to {min_frequency}.")
    patterns_entire_set = seq2pat.get_patterns(min_frequency=min_frequency)
    # Each mined pattern is handled as its items followed by a trailing
    # frequency, so len(pat) > 5 keeps patterns of at least five items.
    long_patterns = [pat for pat in patterns_entire_set if len(pat) > 5]
    # Sort the raw patterns, longest first, BEFORE formatting them: the
    # formatted [text, frequency] pairs all have length 2, so sorting those
    # by len would leave the order unchanged. The sort is stable, so
    # patterns of equal length keep the order returned by the miner.
    long_patterns.sort(key=len, reverse=True)
    sorted_tuples = [[" ".join(pat[:-1]), pat[-1]] for pat in long_patterns]
    with open(dataframe_file_names[idx], 'w', encoding='utf-8') as file:
        json.dump(sorted_tuples, file)

Minimum frequency for commits set to 54.
Minimum frequency for issues set to 90.
Minimum frequency for discussions set to 8.
Minimum frequency for pull_requests set to 45.
Minimum frequency for code_files set to 1069.
Minimum frequency for repository set to 3.
Minimum frequency for hacker_news set to 60.


# Topic modelling

In [8]:
def process_reviews_into_dictionary_and_corpus(reviews_ngram_data, n):
    """Build a dictionary and a bag-of-words corpus from n-gram documents.

    Args:
        reviews_ngram_data: sequence of documents. For n > 1 each document is
            joined with '_' into a single-token document first.
        n: n-gram size; must be a positive integer.

    Returns:
        (dictionary, doc_term_matrix), where doc_term_matrix holds the
        `doc2bow` representation of every document.

    Raises:
        Exception: if n is neither 1 nor greater than 1.
    """
    # NOTE(review): `corpora` is not imported anywhere in the visible part of
    # this notebook - presumably `from gensim import corpora`; confirm.
    if n > 1:
        reviews_ngram_data = [("_".join(gram),) for gram in reviews_ngram_data]
    elif n != 1:
        raise Exception('n should be positive integer')

    dictionary = corpora.Dictionary(reviews_ngram_data)
    doc_term_matrix = [dictionary.doc2bow(document) for document in reviews_ngram_data]
    return dictionary, doc_term_matrix

def topic_modelling_visualisation(top_n, topics_dicts, data_source_name, enumeration):
    """Plot one word cloud per topic, save the figure and display it.

    Args:
        top_n: maximum number of words drawn in each cloud.
        topics_dicts: list with one {word: weight} mapping per topic.
        data_source_name: used in the saved file name and in the caption.
        enumeration: figure number printed in the caption.

    The figure is written to
    `{research_imgs_dir}/{data_source_name}-topic-modelling.png`.
    """
    palette = list(mcolors.TABLEAU_COLORS.values())
    topic_count = len(topics_dicts)

    # Two rows of clouds; keep at least one column so an empty topic list
    # still yields a valid (blank) figure instead of an error.
    n_cols = max(1, math.ceil(topic_count / 2))
    fig, axs = plt.subplots(2, n_cols, figsize=(15,10))
    axs = axs.ravel()
    # Hide every frame up front so a spare axis (odd topic count) stays blank.
    for ax in axs:
        ax.axis('off')

    for topic_idx, (ax, topic_weights) in enumerate(zip(axs, topics_dicts)):
        # Cycle through the palette so more topics than colours still works.
        topic_colour = palette[topic_idx % len(palette)]
        cloud = WordCloud(background_color='white',
                          width=2500,
                          height=1800,
                          max_words=top_n,
                          colormap='tab10',
                          # Bind the colour now rather than through the loop variable.
                          color_func=lambda *args, _colour=topic_colour, **kwargs: _colour,
                          prefer_horizontal=1.0)
        cloud.generate_from_frequencies(topic_weights, max_font_size=300)
        ax.imshow(cloud)
        ax.set_title('Topic ' + str(topic_idx), fontdict=dict(size=16))

    plt.margins(x=0, y=0)
    plt.tight_layout()
    fig.savefig(f'{research_imgs_dir}/{data_source_name}-topic-modelling.png')   # save the figure to file
    plt.show()
    plt.close(fig)
    print(f"Figure {enumeration}: Topic modelling for {data_source_name} data.\n")

# Code source: https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/nlp/topic-modeling-naive.html#topic=0&lambda=1&term=

def get_topics_meanings(tw_m,
                        vocab,
                        display_weights=False,
                        topn=5,
                        weight_cutoff=0.6):
    """Print the highest-weighted vocabulary entries of every topic.

    Args:
        tw_m: iterable of topic rows, each holding one weight per vocabulary
            entry (aligned with `vocab`).
        vocab: vocabulary tokens, in the same order as the weights.
        display_weights: if True, print (token, weight) pairs whose rounded
            weight exceeds `weight_cutoff`; otherwise print the tokens only,
            separated by spaces.
        topn: number of top-ranked entries considered per topic.
        weight_cutoff: minimum (exclusive) weight shown when
            `display_weights` is True.

    Topics are numbered from 0 in both modes.
    """
    for i, topic_weights in enumerate(tw_m):  ## for each topic row
        # Pair each token with its weight rounded to 2 decimals; plain floats
        # keep the printed representation independent of the numpy version.
        ranked = sorted(
            ((token, float(np.round(weight, 2)))
             for token, weight in zip(vocab, topic_weights)),
            key=lambda pair: -pair[1])  ## rank words according to weights
        topic_topn = ranked[:topn]
        if display_weights:
            ## output words whose weights > weight_cutoff
            shown = [item for item in topic_topn if item[1] > weight_cutoff]
        else:
            shown = ' '.join(token for token, _ in topic_topn)
        print(f"Topic #{i} :\n{shown}")
        print("=" * 20)

In [9]:
# LDA configuration.
random_state = 42
n_topics = 6
max_iterations = 10
n_top_words_per_topic = 25

for i, df in enumerate(lemmatised_dataframes):
    # A source without prompts cannot be vectorised; skip it.
    if df.empty:
        print(f"Skipping topic modelling for {dataframe_names[i]}: no prompts.")
        continue

    # Join every token list back into one whitespace-separated document.
    norm_corpus = [" ".join(prompt) for prompt in df["Prompts"]]
    if not any(norm_corpus):
        print(f"Skipping topic modelling for {dataframe_names[i]}: all prompts are empty.")
        continue

    # Transform corpus data into bag-of-words count matrix representation
    # Rows of the matrix represent the document, while columns are the vocabulary tokens
    cv = CountVectorizer(min_df=0., max_df=1., ngram_range = (1,4), stop_words=list(stop_words))
    cv_matrix = cv.fit_transform(norm_corpus)
    vocabulary = cv.get_feature_names_out()

    # LDA modelling
    LDA_model = LatentDirichletAllocation(n_components=n_topics,
                                            max_iter=max_iterations,
                                            random_state=random_state)
    # Only the fitted model is needed; the document-topic matrix is not used.
    LDA_model.fit(cv_matrix)

    #Coherence score calculation
    norm_corpus_tokens = [doc.split() for doc in norm_corpus]
    cur_coherence_score = metric_coherence_gensim(
            measure='c_v',
            top_n=5,
            topic_word_distrib=LDA_model.components_,
            # Reuse the matrix and vocabulary fitted above instead of refitting.
            dtm=cv_matrix,
            vocab=np.asarray(vocabulary),
            texts=norm_corpus_tokens)

    # Visualisation
    print(f"-----------------------------------------------------------------------\nTopic modelling for {dataframe_names[i]}.")
    topics_dicts = []
    for topic in LDA_model.components_:
        # (token, rounded weight) pairs ranked from heaviest to lightest.
        topic_weights = sorted(
            ((token, np.round(weight, 2)) for token, weight in zip(vocabulary, topic)),
            key=lambda pair: -pair[1])
        topics_dicts.append(dict(topic_weights))
    get_topics_meanings(LDA_model.components_, vocabulary, display_weights=True, topn=10, weight_cutoff=20)
    topic_modelling_visualisation(n_top_words_per_topic, topics_dicts, dataframe_names[i], i)
    print(f'Model coherence score: {"%.4f" % np.mean(cur_coherence_score)}')
    print(f'Model perplexity: {"%.4f" % LDA_model.perplexity(cv_matrix)}')

-----------------------------------------------------------------------
Topic modelling for commits.
Topic #1 :
[('test', 156.03), ('use', 123.3), ('name', 92.91), ('code', 82.16), ('give', 74.33), ('need', 61.56), ('file', 57.15), ('step', 54.99), ('add', 53.41), ('user', 51.41)]
Topic #2 :
[('use', 118.67), ('height', 114.91), ('file', 109.28), ('ai', 104.6), ('name', 70.22), ('fill', 67.45), ('viewport', 62.68), ('log', 62.21), ('create', 61.36), ('skyscraper', 57.36)]
Topic #3 :
[('file', 3041.66), ('task', 1622.78), ('script', 952.64), ('use', 952.53), ('set', 911.58), ('create', 836.14), ('need', 831.0), ('solve', 818.89), ('solve task', 817.03), ('format', 585.25)]
Topic #4 :
[('test', 89.62), ('div', 89.51), ('window', 79.64), ('code', 72.96), ('file', 69.44), ('add', 65.48), ('use', 59.85), ('end', 55.8), ('name', 50.03), ('tower', 49.86)]
Topic #5 :
[('file', 1757.28), ('task', 1624.16), ('need', 1026.36), ('set', 900.39), ('solve', 832.74), ('solve task', 832.3), ('working',