In [1]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy
from pprint import pprint
import pandas as pd
import numpy as np

from tqdm import tqdm

import os
import mimetypes
import re

In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['diff', 'git'])

In [3]:
# Set of ignoreWords in C (http://www.c4learn.com/c-programming/c-keywords/)
ignoreWordsC = {'auto', 'break', 'case', 'char', 'const', 'continue',
'default', 'do', 'double', 'else', 'enum', 'extern', 'float', 'for', 'goto',
'if', 'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof',
'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void',
'volatile', 'while'}

# Set of ignoreWords in C++ (https://www.w3schools.in/cplusplus-tutorial/keywords/)
ignoreWordsCpp = {'h', 'hpp', 'inl', 'alignas', 'alignof', 'and', 'asm',
'auto', 'bool', 'break', 'case', 'catch', 'char', 'class', 'const', 'continue',
 'decltype', 'default', 'delete', 'do', 'double', 'else', 'enum', 'explicit',
 'false', 'float', 'for', 'friend', 'goto', 'if', 'inline', 'int', 'long',
 'mutable', 'namespace', 'new', 'noexcept', 'not', 'nullptr', 'operator',
 'or', 'private', 'protected', 'public', 'register', 'return', 'short',
 'signed', 'sizeof', 'static', 'switch', 'template', 'this', 'throw', 'true',
 'try', 'typedef', 'typeid', 'typename', 'union', 'unsigned', 'using',
 'virtual', 'void', 'volatile', 'while', 'xor', 'override', 'final', 'elif',
 'endif', 'ifdef', 'ifndef', 'define', 'undef', 'include', 'line', 'error',
 'pragma', 'defined'}

# Set of ignoreWords in Java (https://www.w3schools.com/java/java_ref_keywords.asp)
ignoreWordsJava = {'abstract', 'assert', 'boolean', 'break', 'byte', 'case',
'catch', 'char', 'class', 'continue', 'const', 'default', 'do', 'double',
'else', 'enum', 'exports', 'extends', 'final', 'finally', 'float', 'for',
'goto', 'if', 'implements', 'import', 'instanceof', 'int', 'interface',
'long', 'module', 'native', 'new', 'package', 'private', 'protected',
'public', 'requires', 'return', 'short', 'static', 'strictfp', 'super',
'switch', 'synchronized', 'this', 'throw', 'throws', 'transient', 'try',
'var', 'void', 'volatile', 'while'}

# set of ignoreWords in TypeScript
ignoreWordsTypeScript = {
    "abstract", "any", "as", "assert", "asserts", "async", "await", "bigint",
    "boolean", "break", "case", "catch", "class", "const", "constructor",
    "continue", "debugger", "declare", "default", "delete", "do", "else",
    "enum", "export", "extends", "false", "finally", "for", "from", "function",
    "get", "global", "if", "implements", "import", "in", "infer", "instanceof",
    "interface", "intrinsic", "is", "keyof", "let", "mixin", "module",
    "namespace", "never", "new", "null", "number", "object", "of", "out",
    "override", "private", "protected", "public", "readonly", "require",
    "return", "satisfies", "set", "static", "string", "super", "switch",
    "symbol", "target", "this", "throw", "true", "try", "type", "typeof",
    "undefined", "unique", "unknown", "var", "void", "while", "with", "yield"
}

# Set of ignoreWords in JavaScript (https://www.w3schools.com/JS/js_reserved.asp)
ignoreWordsJavaScript = {'abstract', 'arguments', 'await', 'boolean', 'break',
'byte', 'case', 'catch', 'char', 'class', 'const', 'continue', 'debugger',
'default', 'delete', 'do', 'double', 'else', 'enum', 'eval', 'export',
'extends', 'false', 'final', 'finally', 'float', 'for', 'function', 'goto',
'if', 'implements', 'import', 'in', 'instanceof', 'int', 'interface', 'let',
'long', 'native', 'new', 'null', 'package', 'private', 'protected', 'public',
'return', 'short', 'static', 'super', 'switch', 'synchronized', 'this',
'throw', 'throws', 'transient', 'true', 'try', 'typeof', 'var', 'void',
'volatile', 'while', 'with', 'yield'}

# Set of ignoreWords in PHP (https://www.php.net/manual/en/reserved.keywords.php)
ignoreWordsPHP = {'halt', 'compiler', 'abstract', 'and', 'array', 'as',
'break', 'callable', 'case', 'catch', 'class', 'clone', 'const', 'continue',
'declare', 'default', 'die', 'do', 'echo', 'else', 'elseif', 'empty',
'enddeclare', 'endfor', 'endforeach', 'endif', 'endswitch', 'endwhile',
'eval', 'exit', 'extends', 'final', 'finally', 'for', 'foreach', 'function',
'global', 'goto', 'if', 'implements', 'include', 'once', 'instanceof',
'insteadof', 'interface', 'isset', 'list', 'namespace', 'new', 'or', 'print',
'private', 'protected', 'public', 'require', 'return', 'static', 'switch',
'throw', 'trait', 'try', 'unset', 'use', 'var', 'while', 'xor', 'yield', 'from'}

# Set of ignoreWords in Python (https://www.w3schools.com/python/python_ref_keywords.asp)
ignoreWordsPython = {'and', 'as', 'assert', 'break', 'class', 'continue',
'def', 'del', 'elif', 'else', 'except', 'False', 'finally', 'for', 'from',
'global', 'if', 'import', 'in', 'is', 'lambda', 'None', 'nonlocal', 'not',
'or', 'pass', 'raise', 'return', 'True', 'try', 'while', 'with', 'yield'}

# Set of ignoreWords in Ruby (https://docs.ruby-lang.org/en/2.2.0/keywords_rdoc.html)
ignoreWordsRuby = {'encoding', 'line', 'file', 'begin', 'end', 'alias', 'and',
'begin', 'break', 'case', 'class', 'def', 'defined', 'do', 'else', 'elsif',
'end', 'ensure', 'false', 'for', 'if', 'in', 'module', 'next', 'nil', 'not',
'or', 'redo', 'rescue', 'retry', 'return', 'self', 'super', 'then', 'true',
'undef', 'unless', 'until', 'when', 'while', 'yield'}

stop_words.extend(ignoreWordsC
                | ignoreWordsCpp
                | ignoreWordsJava
                | ignoreWordsTypeScript
                | ignoreWordsJavaScript
                | ignoreWordsPHP
                | ignoreWordsPython
                | ignoreWordsRuby)

In [7]:
# https://stackoverflow.com/questions/18394147/how-to-do-a-recursive-sub-folder-search-and-return-files-in-a-list
def run_fast_scandir(dir):    # dir: str, ext: list
    subfolders, files = [], []

    for f in os.scandir(dir):
        if f.is_dir():
            subfolders.append(f.path)
        if f.is_file():
            if mimetypes.guess_type(f.path)[0] and re.search(r"(text\/|application\/)", mimetypes.guess_type(f.path)[0]):
                files.append(f.path)

    for dir in list(subfolders):
        sf, f = run_fast_scandir(dir)
        subfolders.extend(sf)
        files.extend(f)
    return subfolders, files

def getfile(fn):
    try:
        with open(fn) as f:
            return f.read()
    except UnicodeDecodeError:
        print(f"couldn't get {fn}.")
        return None

In [8]:
sf, files = run_fast_scandir('../vscode')
len(files)

8619

In [49]:
d = {"name": [fn[3:] for fn in files if getfile(fn)], "text": [getfile(fn).replace("\n", " ") for fn in files if getfile(fn)]}
df = pd.DataFrame(data=d)
print(len(df))
df.to_csv("../corpus/vscode_files.csv", index=False)

couldn't get ../vscode/src/vs/workbench/services/search/test/node/fixtures/some_utf16be.css.
couldn't get ../vscode/src/vs/workbench/services/search/test/node/fixtures/some_utf16le.css.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/utf16_be_nobom.txt.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/some.png.txt.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/some.shiftjis.1.txt.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/some_gbk.txt.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/some.pdf.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/some_utf16be.css.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/some.qwoff.txt.
couldn't get ../vscode/src/vs/workbench/services/textfile/test/node/encoding/fixtures/utf16_le_nobo

In [50]:
df = pd.read_csv("../corpus/vscode_files.csv")
df.head()

Unnamed: 0,name,text
0,vscode/CodeQL.yml,path_classifiers: test: # Classify all f...
1,vscode/.lsifrc.json,"{ \t""project"": ""src/tsconfig.json"", \t""source""..."
2,vscode/tsfmt.json,"{ \t""tabSize"": 4, \t""indentSize"": 4, \t""conver..."
3,vscode/package-lock.json,"{ ""name"": ""code-oss-dev"", ""version"": ""1.11..."
4,vscode/cgmanifest.json,"{ \t""registrations"": [ \t\t{ \t\t\t""component""..."


In [9]:
def sent_to_words(sentences):
    for sentence in tqdm(sentences):
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [10]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in tqdm(texts)]

In [11]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# conda install -c conda-forge spacy-model-en_core_web_sm
print("Installing spacy")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#nlp.max_length = max([len(" ".join(sent)) for sent in data_words_nostops])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in tqdm(texts):
        text = " ".join(sent)
        doc = []
        if len(text) <= nlp.max_length:            
            doc = nlp(text)
        else:
            text = ""
            for w in sent:
                if len(text) + 1 + len(w) > nlp.max_length:
                    doc += nlp(text)
                    text = ""
                text += w + " "
            doc += nlp(text)
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

Installing spacy


In [54]:
doc_added = []
for i in range(len(df["name"].tolist())):
    doc = df["text"].tolist()[i]
    doc_added.append(doc)

In [55]:
doc_added[4]



In [56]:
data = doc_added
data_words = list(sent_to_words(data))

  0%|          | 0/8566 [00:00<?, ?it/s]

100%|██████████| 8566/8566 [00:18<00:00, 470.93it/s] 


In [57]:
# Remove Stop Words
print("Start removing stop words")
data_words_nostops = remove_stopwords(data_words)

Start removing stop words


100%|██████████| 8566/8566 [00:39<00:00, 214.60it/s]


In [58]:
# Do lemmatization keeping only noun, adj, vb, adv
print("Start lemmatizing words")
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

Start lemmatizing words


100%|██████████| 8566/8566 [04:03<00:00, 35.14it/s]  


In [59]:
print(data[10])
print(data_lemmatized[10])

// ----------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------- // This file overrides licenses only for OSS components which do not appear in `cgmanifest.json`. // i.e. for OSS components that are detected from `package-lock.json` or `Cargo.lock` files. // // DO NOT EDIT THIS FILE UNLESS THE OSS TOOL INDICATES THAT YOU SHOULD. // [ 	{ 		// Reason: The license at https://github.com/aadsm/jschardet/blob/master/LICENSE 		// does not include a clear Copyright statement and does not credit authors. 		"name": "jschardet", 		"prependLicenseText": [ 			"Chardet was originally ported from C++ by Mark Pilgrim. It is now maintained", 			" by Dan Blanchard and Ian Cordasco, and was formerly maintained by Erik Rose.", 			" JSChardet was ported from python to JavaScript by António Afonso ", 			" (https://github.com/aadsm/jschardet) and transformed into an npm package by ", 			"M

In [60]:
data_lemmatized_min_length = []

for sublist in tqdm(data_lemmatized):
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 2]
    data_lemmatized_min_length.append(sublist)

  0%|          | 0/8566 [00:00<?, ?it/s]

100%|██████████| 8566/8566 [00:00<00:00, 14819.81it/s]


In [61]:
print(data[0])
print(data_lemmatized[0])
print(data_lemmatized_min_length[0])

path_classifiers:   test:     # Classify all files in the top-level directories test/ and testsuites/ as test code.     - test     # Classify all files with suffix `.test` as test code.     # Note: use only forward slash / as a path separator.     # *  Matches any sequence of characters except a forward slash.     # ** Matches any sequence of characters, including a forward slash.     # This wildcard must either be surrounded by forward slash symbols, or used as the first segment of a path.     # It matches zero or more whole directory segments. There is no need to use a wildcard at the end of a directory path because all sub-directories are automatically matched.     # That is, /anything/ matches the anything directory and all its subdirectories.     # Always enclose the expression in double quotes if it includes *.     - "**/*.test.ts"    # The default behavior is to tag all files created during the   # build as `generated`. Results are hidden for generated code. You can tag   # furt

In [62]:
import json

dump = json.dumps(data_lemmatized_min_length)
with open('../corpus/vscode_preprocessed.json', 'w', encoding='utf-8') as f:
    f.write(dump)

In [4]:
import json

data_lemmatized_min_length = None

with open('../corpus/vscode_preprocessed.json', 'r', encoding='utf-8') as f:
    data_lemmatized_min_length = json.load(f)

In [5]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized_min_length)

# Create Corpus
texts = data_lemmatized_min_length

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View 
print(corpus[:1])

[[(0, 2), (1, 1), (2, 2), (3, 1), (4, 2), (5, 3), (6, 2), (7, 2), (8, 1), (9, 7), (10, 1), (11, 5), (12, 1), (13, 1), (14, 1), (15, 1), (16, 5), (17, 1), (18, 4), (19, 6), (20, 2), (21, 2), (22, 1), (23, 1), (24, 6), (25, 5), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 3), (32, 1), (33, 2), (34, 2), (35, 2), (36, 1), (37, 2), (38, 4), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 4), (45, 7), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1)]]


In [64]:
# Number of documents
print(len(corpus))

# Size of the vocabulary
print(len(id2word))

8566
143047


In [65]:
import statistics

def sum_of_second_components(tuple_list):
    total_sum = 0
    for tup in tuple_list:
        total_sum += tup[1]  # Accessing the second component of each tuple
    return total_sum

lengths = []
for doc in corpus:
    lengths.append(sum_of_second_components(doc))
print(statistics.median(lengths))

202.0


In [66]:
from gensim.models import LsiModel

K = 15
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=K)
lsi_model.print_topics(num_topics=K, num_words=10)

[(0,
  '0.575*"example" + 0.542*"node_modules" + 0.490*"user" + 0.172*"extension" + 0.120*"dist" + 0.086*"users" + 0.085*"feature" + 0.080*"lib" + 0.079*"map" + 0.074*"language"'),
 (1,
  '0.423*"node" + 0.380*"name" + 0.316*"source" + 0.304*"kind" + 0.207*"sorttext" + 0.200*"kindmodifiers" + 0.182*"hasaction" + 0.161*"syntaxkind" + 0.158*"code" + 0.157*"flag"'),
 (2,
  '-0.495*"node" + 0.266*"sorttext" + 0.257*"kindmodifiers" + 0.233*"hasaction" + 0.197*"vscode" + 0.196*"jrieken" + 0.193*"code" + 0.191*"src" + -0.189*"syntaxkind" + -0.184*"flag"'),
 (3,
  '-0.513*"meta" + -0.249*"scss" + -0.229*"css" + -0.189*"cpp" + -0.182*"light_plus" + -0.182*"hc_light" + -0.182*"light_modern" + -0.182*"dark_modern" + -0.182*"light_vs" + -0.182*"dark_plus"'),
 (4,
  '-0.609*"cpp" + -0.320*"name" + -0.282*"comment" + -0.278*"cuda" + 0.228*"scss" + 0.196*"css" + -0.191*"block" + -0.175*"punctuation" + -0.168*"definition" + 0.148*"property"'),
 (5,
  '-0.431*"meta" + 0.391*"name" + 0.350*"css" + 0.343

In [67]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 15
nmf_model = Nmf(corpus, id2word=id2word, num_topics=K)
nmf_model.show_topics(num_topics=K, num_words=10)

[(0,
  '0.046*"node" + 0.017*"syntaxkind" + 0.017*"flag" + 0.016*"declaration" + 0.013*"parent" + 0.012*"typeflag" + 0.011*"diagnostic" + 0.010*"signature" + 0.010*"type" + 0.009*"name"'),
 (1,
  '0.042*"node" + 0.016*"syntaxkind" + 0.016*"flag" + 0.015*"declaration" + 0.013*"parent" + 0.012*"name" + 0.011*"typeflag" + 0.011*"signature" + 0.010*"diagnostic" + 0.010*"result"'),
 (2,
  '0.035*"node" + 0.017*"https" + 0.015*"resolve" + 0.014*"sha" + 0.013*"org" + 0.012*"version" + 0.012*"registry" + 0.012*"tgz" + 0.012*"integrity" + 0.011*"dev"'),
 (3,
  '0.022*"link" + 0.019*"param" + 0.015*"uri" + 0.014*"range" + 0.012*"editor" + 0.011*"value" + 0.011*"provider" + 0.010*"event" + 0.009*"document" + 0.009*"item"'),
 (4,
  '0.012*"node" + 0.012*"link" + 0.010*"param" + 0.007*"workspace" + 0.006*"value" + 0.005*"text" + 0.005*"syntaxkind" + 0.005*"flag" + 0.005*"azure" + 0.005*"declaration"'),
 (5,
  '0.074*"name" + 0.064*"kind" + 0.061*"sorttext" + 0.059*"kindmodifiers" + 0.055*"source" +

In [None]:
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

Ks = [5, 10, 15, 20]
lda_models = {}

for K in tqdm(Ks):
    lda_models[K] = LdaModel(corpus=corpus,
                            id2word=id2word,
                            num_topics=K,
                            random_state=100,
                            update_every=1,
                            chunksize=400,
                            passes=15,
                            alpha='auto')

100%|██████████| 6/6 [17:01<00:00, 170.27s/it]


In [None]:
coherences = {}
for K in tqdm(Ks):    
    coherence_model_lda = CoherenceModel(model=lda_models[K], texts=texts, dictionary=id2word, coherence='c_v')
    coherences[K] = coherence_model_lda.get_coherence()

coherences

  0%|          | 0/6 [00:00<?, ?it/s]

In [70]:
best_K = max(coherences, key=coherences.get)
lda_model = lda_models[best_K]
pprint(lda_model.print_topics())

[(0,
  '0.174*"description" + 0.048*"action" + 0.045*"command" + 0.041*"localize" + '
  '0.033*"group" + 0.030*"input" + 0.028*"label" + 0.026*"title" + '
  '0.022*"args" + 0.018*"enable"'),
 (1,
  '0.148*"name" + 0.138*"meta" + 0.065*"keyword" + 0.058*"comment" + '
  '0.050*"definition" + 0.049*"cpp" + 0.044*"variable" + 0.037*"entity" + '
  '0.036*"block" + 0.032*"constant"'),
 (2,
  '0.162*"license" + 0.057*"microsoft" + 0.050*"project" + 0.049*"copyright" + '
  '0.048*"root" + 0.046*"right" + 0.041*"txt" + 0.041*"see" + 0.041*"mit" + '
  '0.039*"information"'),
 (3,
  '0.236*"vscode" + 0.089*"users" + 0.082*"src" + 0.073*"workbench" + '
  '0.051*"user" + 0.028*"contrib" + 0.025*"lite" + 0.024*"code" + '
  '0.023*"caniuse" + 0.018*"browser"'),
 (4,
  '0.465*"source" + 0.179*"property" + 0.090*"declaration" + 0.079*"kind" + '
  '0.076*"name" + 0.030*"method" + 0.006*"node" + 0.004*"pane" + '
  '0.001*"editoraction" + 0.001*"iuritransformer"'),
 (5,
  '0.164*"example" + 0.149*"node_mo

In [71]:
lda_model.save('../corpus/model')

In [6]:
from gensim.models.ldamodel import LdaModel

lda_model = LdaModel.load('../corpus/model')

In [14]:
_repo_path_template = os.path.join('{src_dir}', '{owner}', '{repo}')
_diff_path_template = os.path.join('{src_dir}', '{owner}', '{repo}', 'pull-{pull_number}.diff')

_diff_location_pattern = re.compile(r'@@.*?@@ (.*?)$')

def _sorted_pull_numbers(src_dir, owner, repo):
    filenames = os.listdir(_repo_path_template.format(src_dir=src_dir, owner=owner, repo=repo))
    pull_numbers = [int(f[5:-5]) for f in filenames if f.startswith('pull-') and f.endswith('.diff')]
    pull_numbers.sort()
    return pull_numbers
    
def _read_diff(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()
    
def _preprocess_diff(diff):
    lines = []
    for line in diff:
        if line.startswith('diff --git') or line.startswith('index ') or line.startswith('+++') or line.startswith('---'):
            continue
        location_match = _diff_location_pattern.match(line)
        if location_match:
            lines.append(location_match.group(1))
            continue
        if line.startswith('+') or line.startswith('-'):
            lines.append(line[1:])
            continue
        lines.append(line)
    text = " ".join(lines).replace("\n", " ")
    return text
    
    
src_dir = '../repos'
owner = 'microsoft'
repo = 'vscode'

preprocessed = []

for pull_number in tqdm(_sorted_pull_numbers(src_dir=src_dir, owner=owner, repo=repo)):
    try: 
        diff = _read_diff(_diff_path_template.format(src_dir=src_dir, owner=owner, repo=repo, pull_number=pull_number))
        preprocessed.append(_preprocess_diff(diff))
    except:
        preprocessed.append("")

diff_words = list(sent_to_words(preprocessed))
diff_words_nostops = remove_stopwords(diff_words)
diff_lemmatized = lemmatization(diff_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
diff_lemmatized_min_length = []
for sublist in tqdm(diff_lemmatized):
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 2]
    diff_lemmatized_min_length.append(sublist)

test_corpus = [id2word.doc2bow(text) for text in diff_lemmatized_min_length]

100%|██████████| 6323/6323 [00:01<00:00, 3987.06it/s]
100%|██████████| 6323/6323 [00:15<00:00, 405.84it/s]
100%|██████████| 6323/6323 [00:31<00:00, 201.33it/s]
100%|██████████| 6323/6323 [03:14<00:00, 32.51it/s] 
100%|██████████| 6323/6323 [00:00<00:00, 15988.96it/s]


In [15]:
print(diff_words[100])
print(diff_words_nostops[100])
print(diff_lemmatized_min_length[100])
lda_model[test_corpus[100]]

['import', 'renderlineinput', 'renderviewline', 'from', 'common', 'viewlayou', 'import', 'ghosttext', 'ighosttextline', 'from', 'model', 'ghosttext', 'js', 'import', 'rangesingleline', 'from', 'common', 'core', 'ranges', 'rangesingleline', 'js', 'import', 'columnrange', 'from', 'common', 'core', 'ranges', 'columnrange', 'js', 'import', 'offsetrange', 'from', 'common', 'core', 'ranges', 'offsetrange', 'js', 'import', 'getwindow', 'ishtmlelement', 'from', 'base', 'browser', 'dom', 'js', 'import', 'ghosttextview', 'css', 'import', 'imouseevent', 'from', 'base', 'browser', 'mouseevent', 'js', 'export', 'class', 'ghosttextview', 'extends', 'disposable', 'const', 'inlinetexts', 'additionallines', 'hiddenrange', 'ghosttext', 'textmodel', 'extraclassnames', 'const', 'currentline', 'textmodel', 'getlinecontent', 'ghosttext', 'linenumber', 'const', 'edit', 'new', 'stringedit', 'inlinetexts', 'map', 'insert', 'column', 'text', 'const', 'tokens', 'textmodel', 'tokenization', 'tokenizelinesat', 'gh

[(8, np.float32(0.053735718)),
 (10, np.float32(0.010652457)),
 (11, np.float32(0.06339004)),
 (13, np.float32(0.042028733)),
 (14, np.float32(0.6195278)),
 (16, np.float32(0.19756119))]

In [16]:
import csv

with open('../corpus/probs.csv', 'w', newline='') as dataset_file:
    dataset = csv.writer(dataset_file)
    dataset.writerow(['pull_number'] + ["_".join([f"{t[0]}*{t[1]:.2f}" for t in lda_model.show_topic(i, topn=3)]) for i in range(lda_model.num_topics)])
    for idx, pull_number in enumerate(tqdm(_sorted_pull_numbers(src_dir=src_dir, owner=owner, repo=repo))):
        probs = lda_model[test_corpus[idx]]
        newrow = [0 for _ in range(lda_model.num_topics)]
        for (i, p) in probs:
            newrow[i] = p
        dataset.writerow([pull_number] + newrow)

100%|██████████| 6323/6323 [00:01<00:00, 4214.00it/s]
