In [76]:
!git clone https://github.com/grazder/prod_stories_topic.git
%cd prod_stories_topic/

Cloning into 'prod_stories_topic'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 14 (delta 1), reused 11 (delta 0), pack-reused 0[K
Unpacking objects: 100% (14/14), done.
/content/prod_stories_topic/prod_stories_topic


In [77]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [78]:
# Load the issue dump (read with lines=True, i.e. one JSON record per line).
data = pd.read_json('data/pycharm_issues.json', lines=True)
keep_versions = ["2020.2", "2020.3", "2021.1", "2021.2", "2021.3"]

# Flatten the per-issue version lists into one list to show which tags occur.
versions = [tag for tags in data['Affected versions'].values for tag in tags]
print('Доступные версии: ', np.unique(versions))

# Keep only issues tagged with at least one of the releases of interest.
is_kept = data['Affected versions'].apply(
    lambda tags: any(tag in keep_versions for tag in tags)
)
data = data[is_kept]

print(data.shape)

Доступные версии:  ['2016.1' '2016.2' '2016.3' '2017.1' '2017.2' '2017.3' '2018.1' '2018.2'
 '2018.3' '2019.1' '2019.2' '2019.3' '2020.1' '2020.2' '2020.3' '2021.1'
 '2021.2' '2021.3']
(2296, 5)


# TF-IDF

In [79]:
# Preview the first rows of the version-filtered issues frame.
data.head()

Unnamed: 0,idReadable,created,summary,description,Affected versions
45,PY-22211,1484097590051,Running subprocesses with Popen inside unittes...,I have the following unittest:\n\n```\nclass M...,[2020.3]
67,PY-22248,1484322653638,Full project optimize imports alters library f...,Reported by: @matheusbrat on Twitter: https://...,[2020.2]
178,PY-22369,1485334537463,DataFrame auto-update doesn't work with action...,Consider this example:\n\n```python\nimport pa...,[2021.1]
266,PY-22469,1485863013851,PowerShell prompt doesn't show virtualenv,"Although the virtualenv seems activated, the p...",[2021.1]
730,PY-23007,1488923938785,Project requirements are not detected from set...,Since setuptools 30.3.0 it's possible to speci...,[2020.3]


Для начала построим простой TF-IDF и посмотрим на самые важные слова

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def calculate_top_10_tfidf(df, vectorizer):
    """Print the ten terms that reach the highest weight in any single summary.

    The vectorizer is fitted on ``df['summary']``. For every vocabulary term
    the maximum weight over all documents is taken, and the ten terms with the
    largest maxima are printed in ascending order (strongest term last).

    Args:
        df: DataFrame with a ``summary`` column of text.
        vectorizer: unfitted scikit-learn style text vectorizer exposing
            ``fit_transform`` and ``get_feature_names_out`` (or the older
            ``get_feature_names``).

    Returns:
        None. The result is only printed.
    """
    X = vectorizer.fit_transform(df['summary'])

    # Take the column-wise maximum directly on the (usually sparse) matrix
    # instead of densifying the whole document-term matrix first.
    column_max = X.max(axis=0)
    if hasattr(column_max, 'toarray'):
        column_max = column_max.toarray()
    column_max = np.asarray(column_max).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    global_top10_idx = column_max.argsort()[-10:]
    # Convert to plain str so the printed list looks the same on every NumPy version.
    top10 = [str(name) for name in np.asarray(feature_names)[global_top10_idx]]

    print('Самые популярные слова по TFIDF:\n', top10)

# Baseline: rank terms over all retained issues with a default TfidfVectorizer.
calculate_top_10_tfidf(data, TfidfVectorizer())

Самые популярные слова по TFIDF:
 ['dir', 'inferrence', 'help', 'clone', 'property', 'lag', 'pdm', 'builtins', 'str', 'console']


Теперь посмотрим по версиям

In [81]:
# Repeat the TF-IDF ranking separately for every release of interest.
affected = data['Affected versions']
for version in keep_versions:
    # Rows whose version list mentions the current release.
    in_version = affected.apply(lambda tags: version in tags)
    version_df = data[in_version]
    print('Version', version)
    calculate_top_10_tfidf(version_df, TfidfVectorizer())

Version 2020.2
Самые популярные слова по TFIDF:
Version 2020.3
Самые популярные слова по TFIDF:
 ['implementation', 'get_user_model', 'autocompletion', 'super', 'stale', 'matplotlib', 'loop', 'tox', 'global', 'collections']
Version 2021.1
Самые популярные слова по TFIDF:
 ['mistake', 'suggestions', 'list', 'coroutine', 'character', 'libraries', 'detecting', 'collections', 'pdm', 'help']
Version 2021.2
Самые популярные слова по TFIDF:
 ['responsive', 'coroutine', 'pid', 'keras', 'bug', 'inferrence', 'dir', 'str', 'lag', 'builtins']
Version 2021.3
Самые популярные слова по TFIDF:


Посмотрим для сравнения при помощи CountVectorizer

In [82]:
# Same per-release ranking, but on raw term counts instead of TF-IDF weights.
# Note: the helper's printed header still says "TFIDF" for this run.
affected = data['Affected versions']
for version in keep_versions:
    in_version = affected.apply(lambda tags: version in tags)
    version_df = data[in_version]
    print('Version', version)
    calculate_top_10_tfidf(version_df, CountVectorizer())

Version 2020.2
Самые популярные слова по TFIDF:
 ['flask', 'method', 'to', 'size', 'type', 'files', 'property', 'line', 'the', 'venv']
Version 2020.3
Самые популярные слова по TFIDF:
 ['global', 'in', 'editor', 'template', 'coverage', 'plot', 'else', 'of', 'import', 'it']
Version 2021.1
Самые популярные слова по TFIDF:
 ['fastapi', 'usages', 'module', 'is', 'self', 'the', 'of', 'coverage', 'in', 'list']
Version 2021.2
Самые популярные слова по TFIDF:
 ['packages', 'from', 'services', 'interpreter', 'type', 'generic', 'to', 'the', 'plugin', 'str']
Version 2021.3
Самые популярные слова по TFIDF:
 ['type', 'compose', 'no', 'ssh', 'icons', 'as', 'set', 'self', 'run', 'str']


Видим, что всё поменялось, но в списках остались частотные служебные слова вроде «no», «as», «run» и т. д. Это плохо, так как они не дают информации о том, что именно было сломано.

Видим, что TF-IDF работает получше. Посмотрим теперь ещё на биграммы (пары слов).

In [83]:
# Per-release ranking on word pairs: ngram_range=(2, 2) keeps bigrams only.
affected = data['Affected versions']
for version in keep_versions:
    in_version = affected.apply(lambda tags: version in tags)
    version_df = data[in_version]
    print('Version', version)
    bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    calculate_top_10_tfidf(version_df, bigram_vectorizer)

Version 2020.2
Самые популярные слова по TFIDF:
 ['improve documentation', 'import pandas', 'documentation readability', 'unused import', 'alias decorator', 'stackoverflowerror in', 'work ctrl', 'broken find', 'debug not', 'hi hello']
Version 2020.3
Самые популярные слова по TFIDF:
 ['stub error', 'pytest_cache directory', 'ignore pytest_cache', 'and property', 'never completes', 'classmethod and', 'for loop', 'psycopg2 stub', 'false unresolved', 'super autocompletion']
Version 2021.1
Самые популярные слова по TFIDF:
 ['ide performance', 'classmethod and', 'generic aliases', 'hangs randomly', 'wrong import', 'poor ide', 'update bundled', 'incorrect type', 'pdm support', 'help function']
Version 2021.2
Самые популярные слова по TFIDF:
 ['support polars', '__class_getitem__ incorrectly', 'interpreter pid', 'test autodetect', 'incorrectly typechecks', 'slow code', 'type inferrence', 'builtins module', 'inspection bug', 'typing lag']
Version 2021.3
Самые популярные слова по TFIDF:


Видим, что в 2020.2 проблемы с документацией. А в 2020.3 что-то ломается с pytest_cache

# Решение через BigARTM

In [84]:
# Unpack docword.kos.txt.gz in the current working directory.
# NOTE(review): no later cell visible here reads the extracted file - confirm this step is still needed.
!gunzip docword.kos.txt.gz

In [85]:
# Install the bigartm10 package (presumably the provider of the `artm` module imported in the next cell).
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!pip install bigartm10



In [86]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from numpy import array
import artm

# Vectorise every summary with English stop words removed.
cv = TfidfVectorizer(stop_words='english')
doc_term = cv.fit_transform(data['summary'])

# Dense copy, transposed so rows are terms and columns are documents.
# NOTE(review): n_wd holds TF-IDF weights rather than raw counts - confirm
# this is intended for the 'bow_n_wd' input format.
n_wd = doc_term.toarray().T
vocabulary = cv.get_feature_names()

bv = artm.BatchVectorizer(
    vocabulary=vocabulary,
    n_wd=n_wd,
    data_format='bow_n_wd',
)

# 15-topic LDA, 20 passes over the collection.
model = artm.LDA(dictionary=bv.dictionary, num_topics=15)
model.fit_offline(bv, num_collection_passes=20)

# Top tokens of every topic (library default count per topic).
model.get_top_tokens()

Exception ignored in: <function BatchVectorizer.__del__ at 0x7f52103d3d40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/artm/batches_utils.py", line 137, in __del__
  File "/usr/local/lib/python3.7/dist-packages/artm/batches_utils.py", line 130, in __dispose
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
  File "/usr/lib/python3.7/shutil.py", line 483, in rmtree
FileNotFoundError: [Errno 2] No such file or directory: 'urnuuid5566fcc8-5784-11ec-b0d7-0242ac1c0002'


[['docker',
  'file',
  'jupyter',
  'window',
  'compose',
  'unable',
  'packages',
  'interpreter',
  'notebook',
  'use'],
 ['debugger',
  'new',
  'windows',
  'process',
  'failed',
  'pycharm',
  'hangs',
  'object',
  'multiple',
  'attach'],
 ['code',
  'completion',
  'debug',
  'running',
  'tests',
  'mode',
  'modules',
  'imported',
  'script',
  'names'],
 ['working',
  'parameter',
  'path',
  'functions',
  'installed',
  'work',
  'execution',
  'package',
  'action',
  'ctrl'],
 ['type',
  'incorrect',
  'instead',
  'inference',
  'generic',
  'expected',
  'dict',
  'annotations',
  'checker'],
 ['wrong',
  'auto',
  'method',
  'test',
  'missing',
  'imports',
  'parameters',
  'different',
  'local',
  'autocomplete'],
 ['fails',
  'remote',
  'version',
  'properly',
  'ssh',
  'interpreter',
  'allow',
  'stuck',
  'available',
  'typevar'],
 ['view',
  'cell',
  'debugging',
  'dataframe',
  'exception',
  'sciview',
  'jupyter',
  'shown',
  'plots',
  'disp

Теперь построим тематические модели для отдельных версий. Начнём с версии 2020.2

In [93]:
# Topic model restricted to issues whose version list contains '2020.2'.
version_df = data[data['Affected versions'].apply(lambda tags: '2020.2' in tags)]

cv = TfidfVectorizer(stop_words='english')
doc_term = cv.fit_transform(version_df['summary'])

# Dense copy, transposed so rows are terms and columns are documents.
n_wd = doc_term.toarray().T
vocabulary = cv.get_feature_names()

bv = artm.BatchVectorizer(
    vocabulary=vocabulary,
    n_wd=n_wd,
    data_format='bow_n_wd',
)

# 30-topic LDA, 10 passes over the collection.
model = artm.LDA(dictionary=bv.dictionary, num_topics=30)
model.fit_offline(bv, num_collection_passes=10)

# Five strongest tokens per topic.
model.get_top_tokens(5)

[['does', 'docker', 'compose', 'work', 'property'],
 ['updates', 'multiple', 'shortcut', 'pycharm', 'numpy'],
 ['code', 'running', 'completion', 'tests', 'pytest'],
 ['results', 'search', 'path', 'notebook', 'display'],
 ['type', 'generic', 'expected', 'instead', 'got'],
 ['type', 'wrong', 'hinting', 'correct', 'isn'],
 ['interpreter', 'remote', 'issues', 'preview', 'environment'],
 ['cell', 'argument', 'unexpected', 'value', 'inspection'],
 ['django', 'typing', 'settings', 'types', 'error'],
 ['interpreter', 'based', 'test', 'generator', 'docker'],
 ['pycharm', 'install', 'packages', 'usages', 'correctly'],
 ['2020', 'pycharm', 'object', 'flask', 'callable'],
 ['doesn', 'work', '2020', 'does', 'django'],
 ['pip', 'hangs', 'attributes', 'feature', 'txt'],
 ['console', 'python', 'terminal', 'documentation', 'issue'],
 ['update', 'notebook', 'jupyter', 'dict', 'action'],
 ['debugger', 'failed', 'process', 'python', 'step'],
 ['project', 'diagram', 'dependency', 'variable', 'model'],
 ['f

Теперь для версии 2020.3

In [91]:
# Topic model restricted to issues whose version list contains '2020.3'.
version_df = data[data['Affected versions'].apply(lambda tags: '2020.3' in tags)]

cv = TfidfVectorizer(stop_words='english')
doc_term = cv.fit_transform(version_df['summary'])

# Dense copy, transposed so rows are terms and columns are documents.
n_wd = doc_term.toarray().T
vocabulary = cv.get_feature_names()

bv = artm.BatchVectorizer(
    vocabulary=vocabulary,
    n_wd=n_wd,
    data_format='bow_n_wd',
)

# 30-topic LDA, 10 passes over the collection.
model = artm.LDA(dictionary=bv.dictionary, num_topics=30)
model.fit_offline(bv, num_collection_passes=10)

# Five strongest tokens per topic.
model.get_top_tokens(5)

[['rendering', 'better', 'typing', 'cfg', 'setup'],
 ['new', 'column', 'drive', 'output', 'data'],
 ['completion', 'project', 'view', 'debug', 'code'],
 ['interpreter', 'remote', 'docker', 'path', 'package'],
 ['wsl', 'debugger', 'interpreter', 'based', 'wsl2'],
 ['type', 'wrong', 'inferred', 'does', 'method'],
 ['interpreter', 'configuration', 'existing', 'background', 'diff'],
 ['dataframe', 'column', 'sciview', 'project', 'coverage'],
 ['classes', 'unexpected', 'completion', 'django', 'style'],
 ['numpy', 'block', '20', 'freezes', 'opening'],
 ['super', 'project', 'new', 'breaks', 'pycharm'],
 ['file', 'directory', 'suggest', 'add', 'import'],
 ['reference', 'unresolved', 'jupyter', 'notebooks', 'pycharm'],
 ['reset', 'connection', 'request', 'new', 'pycharm'],
 ['files', 'satisfied', 'python', 'output', 'plugin'],
 ['build', 'working', 'django', 'message', 'highlighting'],
 ['python', 'running', 'windows', 'pycharm', 'debugger'],
 ['string', 'crashes', 'mode', 'wrongly', 'pycharm']