In [1]:
!python3 -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/fernandagomes/Library/Python/3.6/lib/python/site-packages/en_core_web_sm
-->
/usr/local/Cellar/python3/3.6.4_2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
!pip3 install --upgrade numpy

Requirement already up-to-date: numpy in /usr/local/lib/python3.6/site-packages (1.16.2)


In [3]:
import codecs
from collections import defaultdict
import csv
import lzma
import operator
import re
from statistics import mean
import sys

import gensim
import numpy as np
import spacy
from spacy import displacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import matplotlib.pyplot as plt
import sklearn
import keras

Using TensorFlow backend.


In [4]:
import os
thisdir = os.getcwd()
PATH_TO_CSV = str(thisdir) + "/review_comments.csv.xz"

## Understanding our dataset

In [5]:
def getusers(filename):
    with lzma.open(filename) as archf:
        reader = csv.DictReader(codecs.getreader("utf-8")(archf))
        for record in reader:
            yield record["AUTHOR"]

In [6]:
n_comments = 0
users = defaultdict(int)
for user in getusers(PATH_TO_CSV):
    n_comments += 1
    if n_comments % 100000 == 0:
        sys.stderr.write("%d\r" % n_comments)
    users[user] += 1

25300000

In [7]:
print("Number of PR review comments: %d" % (n_comments))
print("Number of different reviewers: %d" % (len(users)))
print("Average number of GitHub review comments per user from 2015 to 2019: %d" % (mean(list(users.values()))))

Number of PR review comments: 25323640
Number of different reviewers: 540054
Average number of GitHub review comments per user from 2015 to 2019: 46


## Most active reviewers

In [8]:
sorted_users = sorted(users.items(), key=operator.itemgetter(1), reverse=True)
sorted_users[:10]

[('houndci-bot', 797827),
 ('houndci', 264998),
 ('codacy-bot', 237814),
 ('stickler-ci', 36707),
 ('sonarcloud[bot]', 20378),
 ('jreback', 19694),
 ('seanlip', 19540),
 ('codeschool-kiddo', 18946),
 ('stephentoub', 18744),
 ('vkurennov', 18141)]

## Collecting comments in English

In [10]:
import spacy
from spacy_langdetect import LanguageDetector

nlp = spacy.load("en")
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

def lang_det(text):
    doc = nlp(text)
    lang_dict = doc._.language
    return lang_dict

def get_lang_score(text):
    lang_dict = lang_det(text)
    language = lang_dict['language']
    score = lang_dict['score']
    return language, score

In [11]:
import random
size_of_sample = 1300000  # say the math behind it
file_length = n_comments  # dataset CSV size

samples_index = sorted(random.sample(range(file_length), size_of_sample))  # random index that fits the CSV

multilang_sample = []  # sample before filtering for EN

# get the actual multi-lang sample

with lzma.open(PATH_TO_CSV) as archf:
    csv_reader = csv.DictReader(codecs.getreader("utf-8")(archf))
    i = 0
    for r, record in enumerate(csv_reader):
        try:
            if r == samples_index[i]:
            sys.stderr.write("%d / %d\r" % (len(multilang_sample), r))
            multilang_sample.append(record['BODY'])
            i += 1
        except:
            next

1299999 / 25323630

IndexError: list index out of range

In [12]:
eng_sample = []

for i, item in enumerate(multilang_sample):
    try:
        lang, score = get_lang_score(item)
    except:
        lang = ""
        score = 0.0
    sys.stderr.write("%d / %d\r" % (len(eng_sample), i))
    if lang == "en" and score >= 0.6:
        eng_sample.append(item)

1048358 / 1299999

In [16]:
len(eng_sample)

1055449

Our comments are formatted using Markdown, meaning that we have to clean it.

We'll start small:

1. First, transforming markdown to html
2. Turn code snippets into one symbol (`$`)
3. Discard formatting tags and collect only the remaining text

In [17]:
from bs4 import BeautifulSoup
from markdown import markdown

def markdown_to_text(markdown_string):
    """ Converts a markdown string to plaintext """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', '$', html)
    html = re.sub(r'<code>(.*?)</code>', '$', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.findAll(text=True))

    return text

plain_text = [markdown_to_text(item) for item in eng_sample]

In [18]:
plain_text[:50]

['why the change of server? if one fails maybe allow configuring the server?',
 'you should do this instead, avoid embarrassment.',
 'Interesting, I see that too. $ shows the right thing.',
 'some comments on when this is supposed to return true/false would be helpful.',
 'Chris also "kinda like[s]" $',
 'Should a MongoIterable be allowed to return null from batchCursor?  I would think the contract should be that either result or t is non-null.',
 "Make sure to test the string option, because I think it'll never get hit.",
 'A review comment message',
 'Great Job. this Technically works but on lines 21 and 22 you are "hard coding" the values of n and d. These values are evaluated correctly, but since you are hard coding the values of n and d, you are overwriting any arguments being passed into the function. Therefore console.log(fractionString(3,4)); will output 1 3/4 not just 3/4.',
 '$ just in case :)',
 "I'm confused. Can n be negative?",
 'Whitespace creep in a few places around he

Now we will use `spaCy` to input the comments, and output a list of tokens in English language

In [19]:
doc = [nlp(i.lower()) for i in plain_text]
doc[:50]

[why the change of server? if one fails maybe allow configuring the server?,
 you should do this instead, avoid embarrassment.,
 interesting, i see that too. $ shows the right thing.,
 some comments on when this is supposed to return true/false would be helpful.,
 chris also "kinda like[s]" $,
 should a mongoiterable be allowed to return null from batchcursor?  i would think the contract should be that either result or t is non-null.,
 make sure to test the string option, because i think it'll never get hit.,
 a review comment message,
 great job. this technically works but on lines 21 and 22 you are "hard coding" the values of n and d. these values are evaluated correctly, but since you are hard coding the values of n and d, you are overwriting any arguments being passed into the function. therefore console.log(fractionstring(3,4)); will output 1 3/4 not just 3/4.,
 $ just in case :),
 i'm confused. can n be negative?,
 whitespace creep in a few places around here.,
 there is a slight

Further cleaning: we'll remove stopwords and create a list of the remaining words

In [20]:
my_stop_words = []
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [21]:
master_text = []
for d in doc:
    texts, article = [], []
    for w in d:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
            # we add the lematized version of the word
            article.append(w.lemma_)
        # if it's a new line, it means we're onto our next document
        if w.text == '\n':
            texts.append(article)
            article = []
    master_text.append(article)

Let's identify pairs of words that are mentioned together and create bigrams

In [22]:
bigram = gensim.models.Phrases(master_text)

In [23]:
master_text = [bigram[line] for line in master_text]

In [24]:
dictionary = Dictionary(master_text)
corpus = [dictionary.doc2bow(tex) for tex in master_text]

In [25]:
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [26]:
ldamodel.show_topics()

[(0,
  '0.041*"error" + 0.024*"default" + 0.021*"option" + 0.020*"issue" + 0.019*"agree" + 0.019*"commit" + 0.017*"log" + 0.015*"try" + 0.013*"think" + 0.012*"yeah"'),
 (1,
  '0.250*"need" + 0.093*"fix" + 0.074*"update" + 0.056*"thank" + 0.018*"right" + 0.015*"sorry" + 0.012*"index" + 0.011*"forget" + 0.011*">" + 0.010*"👍"'),
 (2,
  '0.124*"use" + 0.055*"method" + 0.032*"class" + 0.031*"instead" + 0.029*"function" + 0.015*"variable" + 0.015*"name" + 0.013*" " + 0.012*"maybe" + 0.010*"parameter"'),
 (3,
  '0.055*"find" + 0.033*"=" + 0.032*"`" + 0.023*" _issue" + 0.018*"`_`" + 0.018*">" + 0.018*"import" + 0.015*"number" + 0.014*"\n    " + 0.014*"would"'),
 (4,
  '0.431*"$" + 0.022*"return" + 0.019*"check" + 0.016*"think" + 0.016*"value" + 0.016*"miss" + 0.013*"instead" + 0.012*"call" + 0.012*"like" + 0.011*"function"'),
 (5,
  '0.104*"line" + 0.104*"add" + 0.031*"space" + 0.023*"new" + 0.021*"end" + 0.021*"comment" + 0.020*"delete" + 0.020*"message" + 0.018*"link" + 0.013*"block"'),
 (6,

In [27]:
# from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel

# hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

In [28]:
# hdpmodel.show_topics()

In [30]:
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
