In [2]:
# replaces Paper-Testset-Data-ANALZE
# prepare data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, json
import glob
import operator

from IPython.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_columns', None)

In [3]:
%load_ext autoreload
%autoreload 1

from shared_globals_functions import *
import shared_globals_functions as sgf
%aimport shared_globals_functions

In [4]:
# do this only on the first run

def first_run():
    github_data_df = pd.DataFrame()

    json_pattern = os.path.join(github_data_folder,'*.json')
    file_list = glob.glob(json_pattern)

    dfs = [] # an empty list to store the data frames
    for file in file_list:
        data = pd.read_json(file, lines=True) # read data frame from json file
        dfs.append(data) # append the data frame to the list

    github_data_df = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.

    to_disk(github_data_df[['name', 'isfork', 'stars', 'archived', 'commits', 'watcher', 'license',
                   'releases', 'forks', 'pushed', 'created', 'wiki', 'downloads',
                   'open_issues', 'open_issues_s30d', 'open_issues_b30d',
                   'language', 'size', 'url', 'contributors',
                   'content_url', 'crawl_created', 'crawl_last_updated',
                   'crawl_last_spare_req', 'search_queries']], 'init_short') #, 'about_is_english',               'readme_is_english'

    to_disk(github_data_df, 'init')

In [5]:
github_data_df = from_disk('init')

reading /home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/init.pkl


In [6]:
github_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221971 entries, 0 to 221970
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   name                  221971 non-null  object
 1   isfork                221971 non-null  bool  
 2   stars                 221971 non-null  int64 
 3   archived              221971 non-null  bool  
 4   commits               221971 non-null  int64 
 5   watcher               221971 non-null  int64 
 6   license               125322 non-null  object
 7   releases              221971 non-null  int64 
 8   forks                 221971 non-null  int64 
 9   pushed                221971 non-null  object
 10  created               221971 non-null  object
 11  wiki                  221971 non-null  bool  
 12  downloads             221971 non-null  bool  
 13  open_issues           221971 non-null  int64 
 14  open_issues_s30d      221971 non-null  int64 
 15  open_issues_b30d 

In [7]:
from collections import Counter
topic_counter = Counter(github_data_df['topics'].explode().to_list())

In [8]:
total_repo_cnt = len(github_data_df)
print(f"Total number of repos {total_repo_cnt}")

Total number of repos 221971


In [9]:
# percentage of repos with topics
(len(github_data_df[github_data_df['topics'].str.len() > 0])) / total_repo_cnt 

0.31453207851476095

In [10]:
# number of repos without readme
1-(len(github_data_df['readme'][github_data_df['readme'].str.len() > 0])) / total_repo_cnt 

0.011050993147753485

In [11]:
# number of repos with about
(len(github_data_df['about'][github_data_df['about'].str.len() > 0])) / total_repo_cnt 

0.8779390100508625

In [12]:
# number of unique topics
len(topic_counter.keys())

93055

In [13]:
total_repos_with_topics=len(github_data_df[github_data_df['topics'].str.len() > 0])

In [14]:
## we build topics and topics_composite, that are lists of topics which are represented in at least X repositories; this is to reduce topics that are rarely used and to reduce the memory footprint
topic_composite = [] # save topic compositions
topics = []

min_thres=total_repos_with_topics*0.0005 # at least in 5% of the repos
print(f'there are {total_repos_with_topics} repos that have topics, we take topics that are in minimium of {min_thres} in our topic list')

for i in topic_counter.items():
    t = i[0]
    o = i[1]
    if isinstance(t, str):
        if not t.isascii():
            print(f'string is not ascii: {t}')
        if any(not c.isalnum() for c in t):
            #print(f'string contains non alphanumeric values: {t}')
            if o >= min_thres:             
                topic_composite.append(t)
        if o >= min_thres:
                topics.append(t)


there are 69817 repos that have topics, we take topics that are in minimium of 34.908500000000004 in our topic list


In [15]:
# topics that are at least in X repos
topics

['wordpress',
 'wordpress-plugin',
 'language',
 'amazon',
 'chrome-extension',
 'swift',
 'ios',
 'custom',
 'cnn',
 'cnn-keras',
 'tensorflow',
 'ai',
 'machine-learning',
 'android',
 'security',
 'malware',
 'malware-analysis',
 'trojan',
 'ransomware',
 'engine',
 'static-analysis',
 'static-code-analysis',
 'dynamic-analysis',
 'html5',
 'css3',
 'python3',
 'mongodb',
 'flask-application',
 'heroku',
 'bootstrap4',
 'landmark-detection',
 'computer-vision',
 'pentest',
 'pentest-tool',
 'penetration-testing',
 'bugbounty',
 'enumeration',
 'osint',
 'brute-force',
 'password',
 'fast',
 'raspberry-pi',
 'media',
 'docker-compose',
 'docker',
 'grafana',
 'prometheus',
 'real-time',
 'instance-segmentation',
 'mask-rcnn',
 'webcam',
 'rcnn',
 'image-segmentation',
 'keras',
 'opencv',
 'object-detection',
 'network-analysis',
 'community-detection',
 'graph-algorithms',
 'gcp',
 'google-cloud',
 'google-cloud-platform',
 'terraform',
 'python',
 'optimization',
 'deep-learning',


In [16]:
# topics compositions that are at least in X repos
topic_composite

['wordpress-plugin',
 'chrome-extension',
 'cnn-keras',
 'machine-learning',
 'malware-analysis',
 'static-analysis',
 'static-code-analysis',
 'dynamic-analysis',
 'flask-application',
 'landmark-detection',
 'computer-vision',
 'pentest-tool',
 'penetration-testing',
 'brute-force',
 'raspberry-pi',
 'docker-compose',
 'real-time',
 'instance-segmentation',
 'mask-rcnn',
 'image-segmentation',
 'object-detection',
 'network-analysis',
 'community-detection',
 'graph-algorithms',
 'google-cloud',
 'google-cloud-platform',
 'deep-learning',
 'reverse-engineering',
 'awesome-list',
 'question-answering',
 'antivirus-evasion',
 'react-redux',
 'opencv-python',
 'mobilenet-ssd',
 'jupyter-notebook',
 'machine-learning-algorithms',
 'neural-network',
 'reinforcement-learning',
 'supervised-learning',
 'unsupervised-learning',
 'linear-regression',
 'logistic-regression',
 'medical-imaging',
 'deep-reinforcement-learning',
 'meta-learning',
 'faster-rcnn',
 'wifi-security',
 'discord-bot',


In [17]:
len(topics)

1659

In [18]:
len(topic_composite)

538

In [19]:
# we combine split words to not loose any information
topics_combined = []
for t in topics:
    newt = ''.join(t.split('-'))
    if newt not in topics_combined:
        topics_combined.append(newt)

In [20]:
# this operation is very intense, run it only once
def clean_readme():
    readme_clean=[]
    for i in github_data_df[['name', 'readme']].itertuples(index=False, name=None):
        try:
            readme_clean.append(clean_text(markdown_to_text(i[1])))
        except Exception as e:
            print(f'{i[0]} has readme f{i[1]}')
            print(e)
            readme_clean.append(np.nan)

    github_data_df['readme_clean'] = readme_clean
    
    to_disk(github_data_df, 'init')

In [21]:
len(github_data_df[github_data_df['readme_clean'].str.len()>0])

218425

In [22]:
len(github_data_df[github_data_df['readme'].str.len()>0])

219518

## Testset 1 (D_T1)
   * in: name, topics (as list)
   * out: name, topics (as string)
   * All Github Data with topics

In [23]:
topics_removed = []
github_data_wtopic = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()

In [24]:
github_data_wtopic['topics'] = github_data_wtopic['topics'].apply(lambda x: ' '.join(x))

In [44]:
len(set(github_data_wtopic['topics'].apply(lambda x: x.split(' ')).explode()))

93053

In [51]:
to_disk(github_data_wtopic, 't1')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t1.pkl


In [52]:
from collections import Counter
word_cnt_t1 = Counter()
i = 0
for t in github_data_wtopic['topics']:
    i += 1
    word_cnt_t1.update(t.split(' '))


word_cnt_t1.most_common()

[('python', 6323),
 ('deep-learning', 4953),
 ('machine-learning', 4521),
 ('computer-vision', 2953),
 ('javascript', 2744),
 ('security', 2656),
 ('object-detection', 2569),
 ('pytorch', 2166),
 ('tensorflow', 2123),
 ('opencv', 2042),
 ('hacktoberfest', 2010),
 ('android', 1676),
 ('python3', 1660),
 ('docker', 1656),
 ('java', 1523),
 ('react', 1463),
 ('nodejs', 1273),
 ('aws', 1264),
 ('golang', 1251),
 ('kubernetes', 1223),
 ('awesome-list', 1220),
 ('detection', 1214),
 ('awesome', 1139),
 ('monitoring', 1126),
 ('linux', 1097),
 ('ios', 1059),
 ('php', 1015),
 ('image-processing', 986),
 ('typescript', 971),
 ('face-detection', 939),
 ('keras', 871),
 ('windows', 848),
 ('nlp', 823),
 ('api', 800),
 ('go', 782),
 ('data-science', 756),
 ('artificial-intelligence', 733),
 ('swift', 721),
 ('convolutional-neural-networks', 704),
 ('cybersecurity', 694),
 ('anomaly-detection', 685),
 ('cnn', 683),
 ('ai', 672),
 ('cpp', 660),
 ('security-tools', 653),
 ('neural-network', 648),
 ('

In [45]:
github_data_wtopic = np.nan

## Testset 1.1
   * in: name, topics 
   * out: name, topics(preprocessed, lemmatized)

In [46]:
sgf.topics_removed = []
github_data_wtopic = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()

In [47]:
github_data_wtopic['topics'] = github_data_wtopic['topics'].apply(lambda x: ' '.join(preprocess_text(' '.join(x), lem=True)))

In [48]:
Counter(sgf.topics_removed).most_common()

[('of', 661),
 ('to', 578),
 ('and', 335),
 ('in', 232),
 ('for', 228),
 ('as', 187),
 ('the', 167),
 ('on', 155),
 ('by', 122),
 ('no', 109),
 ('few', 103),
 ('out', 83),
 ('only', 79),
 ('a', 64),
 ('over', 59),
 ('d', 58),
 ('re', 57),
 ('with', 55),
 ('from', 42),
 ('t', 36),
 ('up', 34),
 ('can', 32),
 ('it', 32),
 ('m', 30),
 ('at', 28),
 ('all', 24),
 ('doing', 24),
 ('i', 23),
 ('your', 23),
 ('s', 22),
 ('now', 19),
 ('how', 18),
 ('me', 17),
 ('not', 17),
 ('is', 17),
 ('my', 17),
 ('after', 14),
 ('do', 14),
 ('you', 13),
 ('off', 11),
 ('o', 11),
 ('down', 11),
 ('ve', 9),
 ('did', 9),
 ('other', 9),
 ('this', 8),
 ('about', 7),
 ('who', 7),
 ('once', 6),
 ('be', 5),
 ('under', 5),
 ('here', 5),
 ('any', 5),
 ('an', 5),
 ('have', 4),
 ('been', 4),
 ('own', 4),
 ('will', 4),
 ('just', 4),
 ('through', 3),
 ('same', 3),
 ('if', 3),
 ('what', 3),
 ('into', 2),
 ('while', 2),
 ('then', 2),
 ('he', 2),
 ('myself', 2),
 ('more', 2),
 ('against', 2),
 ('am', 2),
 ('its', 2),
 ('ve

In [49]:
github_data_wtopic

Unnamed: 0,name,topics
6,thingsym/multi-device-switcher,wordpress wordpress plugin
8,redlink-gmbh/solr-compound-word-filter,solr lucene lucene analyzer language
9,xswlme/Delivery-Availability-Checker,amazon chrome extension food delivery grocery ...
10,eastsss/ErrorDispatching,swift ios error handle custom chain responsibi...
11,ahmadhuzaifa/skinskan,cnn cnn keras skin cancer tensorflow ai machin...
...,...,...
221952,waqasbhatti/astrobase,light curve astronomy python variable star
221955,hpcc-systems/vscode-ecl,vscode typescript ecl hpcc platform
221965,MarcoPon/BlockHashLoc,bhl recovery hash block metadata undelete unfo...
221966,bhagatabhijeet/bhagatabhijeet,bhagatabhijeet abhijeetbhagat abhijeet bhagat ...


In [50]:
len(set(github_data_wtopic['topics'].apply(lambda x: x.split(' ')).explode()))

43131

In [72]:
to_disk(github_data_wtopic, 't1.1')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t1.1.pkl


In [73]:
from collections import Counter
word_cnt_t1_1 = Counter()
i = 0
for t in github_data_wtopic['topics']:
    i += 1
    word_cnt_t1_1.update(t.split(' '))


word_cnt_t1_1.most_common()

[('detection', 13503),
 ('learn', 13192),
 ('python', 7897),
 ('deep', 6053),
 ('machine', 5861),
 ('security', 5645),
 ('network', 5319),
 ('image', 4566),
 ('data', 4467),
 ('object', 3839),
 ('api', 3718),
 ('vision', 3611),
 ('computer', 3399),
 ('analysis', 3267),
 ('javascript', 3218),
 ('android', 3063),
 ('neural', 3024),
 ('tool', 3002),
 ('face', 2934),
 ('tensorflow', 2893),
 ('recognition', 2801),
 ('process', 2793),
 ('react', 2770),
 ('awesome', 2753),
 ('opencv', 2686),
 ('aws', 2554),
 ('pytorch', 2516),
 ('docker', 2462),
 ('monitor', 2392),
 ('cloud', 2160),
 ('hacktoberfest', 2133),
 ('model', 2017),
 ('classification', 2012),
 ('library', 1942),
 ('plugin', 1902),
 ('list', 1902),
 ('java', 1869),
 ('game', 1732),
 ('language', 1707),
 ('test', 1687),
 ('python3', 1679),
 ('kubernetes', 1672),
 ('linux', 1633),
 ('azure', 1627),
 ('segmentation', 1614),
 ('web', 1554),
 ('bot', 1538),
 ('server', 1483),
 ('framework', 1475),
 ('malware', 1439),
 ('windows', 1400),
 

In [51]:
github_data_wtopic = np.nan

## Testset 1.2 (D_T3)
```
in: name, topics
out: name, topics (tokeinzed not lematized)
```

In [52]:
sgf.topics_removed = []
github_data_wtopic = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()

In [53]:
github_data_wtopic['topics'] = github_data_wtopic['topics'].apply(lambda x: ' '.join(preprocess_text(' '.join(x), lem=False)))

In [54]:
Counter(sgf.topics_removed).most_common()

[('of', 661),
 ('to', 578),
 ('and', 335),
 ('in', 232),
 ('for', 228),
 ('as', 187),
 ('the', 167),
 ('on', 155),
 ('by', 122),
 ('no', 109),
 ('few', 103),
 ('out', 83),
 ('only', 79),
 ('a', 64),
 ('over', 59),
 ('d', 58),
 ('re', 57),
 ('with', 55),
 ('from', 42),
 ('t', 36),
 ('up', 34),
 ('can', 32),
 ('it', 32),
 ('m', 30),
 ('at', 28),
 ('all', 24),
 ('doing', 24),
 ('i', 23),
 ('your', 23),
 ('s', 22),
 ('now', 19),
 ('how', 18),
 ('me', 17),
 ('not', 17),
 ('is', 17),
 ('my', 17),
 ('after', 14),
 ('do', 14),
 ('you', 13),
 ('off', 11),
 ('o', 11),
 ('down', 11),
 ('ve', 9),
 ('did', 9),
 ('other', 9),
 ('this', 8),
 ('about', 7),
 ('who', 7),
 ('once', 6),
 ('be', 5),
 ('under', 5),
 ('here', 5),
 ('any', 5),
 ('an', 5),
 ('have', 4),
 ('been', 4),
 ('own', 4),
 ('will', 4),
 ('just', 4),
 ('through', 3),
 ('same', 3),
 ('if', 3),
 ('what', 3),
 ('into', 2),
 ('while', 2),
 ('then', 2),
 ('he', 2),
 ('myself', 2),
 ('more', 2),
 ('against', 2),
 ('am', 2),
 ('its', 2),
 ('ve

In [55]:
github_data_wtopic

Unnamed: 0,name,topics
6,thingsym/multi-device-switcher,wordpress wordpress plugin
8,redlink-gmbh/solr-compound-word-filter,solr lucene lucene analyzer language
9,xswlme/Delivery-Availability-Checker,amazon chrome extension food delivery grocery ...
10,eastsss/ErrorDispatching,swift ios error handling custom chain responsi...
11,ahmadhuzaifa/skinskan,cnn cnn keras skin cancer tensorflow ai machin...
...,...,...
221952,waqasbhatti/astrobase,light curves astronomy python variable stars
221955,hpcc-systems/vscode-ecl,vscode typescript ecl hpcc platform
221965,MarcoPon/BlockHashLoc,bhl recovery hash block metadata undelete unfo...
221966,bhagatabhijeet/bhagatabhijeet,bhagatabhijeet abhijeetbhagat abhijeet bhagat ...


In [56]:
len(set(github_data_wtopic['topics'].apply(lambda x: x.split(' ')).explode()))

44924

In [57]:
to_disk(github_data_wtopic, 't1.2')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t1.2.pkl


In [58]:
from collections import Counter
word_cnt_t1_2 = Counter()
i = 0
for t in github_data_wtopic['topics']:
    i += 1
    word_cnt_t1_2.update(t.split(' '))


word_cnt_t1_2.most_common()

[('detection', 13503),
 ('learning', 12881),
 ('python', 7897),
 ('deep', 6053),
 ('machine', 5800),
 ('security', 5645),
 ('data', 4467),
 ('image', 3795),
 ('object', 3785),
 ('api', 3718),
 ('vision', 3611),
 ('computer', 3399),
 ('analysis', 3267),
 ('javascript', 3218),
 ('android', 3063),
 ('neural', 3024),
 ('tensorflow', 2893),
 ('face', 2878),
 ('recognition', 2801),
 ('react', 2770),
 ('awesome', 2753),
 ('opencv', 2686),
 ('network', 2587),
 ('processing', 2563),
 ('aws', 2554),
 ('pytorch', 2516),
 ('docker', 2462),
 ('networks', 2454),
 ('hacktoberfest', 2133),
 ('cloud', 2131),
 ('classification', 2012),
 ('library', 1942),
 ('plugin', 1902),
 ('java', 1869),
 ('monitoring', 1847),
 ('tools', 1839),
 ('language', 1707),
 ('list', 1688),
 ('python3', 1679),
 ('kubernetes', 1672),
 ('linux', 1633),
 ('azure', 1627),
 ('segmentation', 1614),
 ('web', 1554),
 ('bot', 1538),
 ('game', 1490),
 ('server', 1483),
 ('framework', 1475),
 ('malware', 1439),
 ('windows', 1400),
 ('io

In [59]:
github_data_wtopic = np.nan

## Testset 1.3.1 (D_T4)
```
in: name, topics
out: name, topics (tokeinzed not lematized, but w composition)

there was an error 1.3 shouldnt be lematized, so 1.3 is now saved as lemmatized and 1.3.1 is not lemma
```

In [60]:
sgf.topics_removed = []
github_data_wtopic = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()

In [61]:
github_data_wtopic['topics'] = github_data_wtopic['topics'].apply(lambda x: ' '.join(preprocess_text(' '.join(x), lem=False, do_composition=True)))

In [62]:
Counter(sgf.topics_removed).most_common()

[('d', 10),
 ('can', 9),
 ('did', 9),
 ('re', 7),
 ('it', 7),
 ('i', 5),
 ('a', 5),
 ('all', 4),
 ('up', 4),
 ('and', 4),
 ('will', 4),
 ('for', 4),
 ('this', 3),
 ('of', 3),
 ('who', 3),
 ('s', 3),
 ('from', 2),
 ('about', 2),
 ('myself', 2),
 ('the', 2),
 ('its', 2),
 ('same', 2),
 ('any', 2),
 ('is', 2),
 ('m', 2),
 ('to', 2),
 ('other', 2),
 ('now', 2),
 ('ll', 1),
 ('nor', 1),
 ('y', 1),
 ('them', 1),
 ('over', 1),
 ('down', 1),
 ('more', 1),
 ('isn', 1),
 ('at', 1),
 ('your', 1),
 ('with', 1),
 ('on', 1),
 ('off', 1),
 ('t', 1),
 ('what', 1),
 ('does', 1),
 ('an', 1),
 ('my', 1),
 ('ve', 1),
 ('do', 1),
 ('no', 1),
 ('here', 1),
 ('why', 1)]

In [63]:
github_data_wtopic

Unnamed: 0,name,topics
6,thingsym/multi-device-switcher,wordpress wordpressplugin
8,redlink-gmbh/solr-compound-word-filter,solr lucene luceneanalyzer language
9,xswlme/Delivery-Availability-Checker,amazon chromeextension fooddelivery grocerydel...
10,eastsss/ErrorDispatching,swift ios errorhandling custom chainofresponsi...
11,ahmadhuzaifa/skinskan,cnn cnnkeras skincancer tensorflow ai machinel...
...,...,...
221952,waqasbhatti/astrobase,lightcurves astronomy python variablestars
221955,hpcc-systems/vscode-ecl,vscode typescript ecl hpccplatform
221965,MarcoPon/BlockHashLoc,bhl recovery hash block metadata undelete unfo...
221966,bhagatabhijeet/bhagatabhijeet,bhagatabhijeet abhijeetbhagat abhijeetbhagat s...


In [64]:
len(set(github_data_wtopic['topics'].apply(lambda x: x.split(' ')).explode()))

89406

In [86]:
github_data_wtopic[github_data_wtopic['topics'].str.len() > 0]

Unnamed: 0,name,topics
6,thingsym/multi-device-switcher,wordpress wordpressplugin
8,redlink-gmbh/solr-compound-word-filter,solr lucene luceneanalyzer language
9,xswlme/Delivery-Availability-Checker,amazon chromeextension fooddelivery grocerydel...
10,eastsss/ErrorDispatching,swift ios errorhandling custom chainofresponsi...
11,ahmadhuzaifa/skinskan,cnn cnnkeras skincancer tensorflow ai machinel...
...,...,...
221952,waqasbhatti/astrobase,lightcurves astronomy python variablestars
221955,hpcc-systems/vscode-ecl,vscode typescript ecl hpccplatform
221965,MarcoPon/BlockHashLoc,bhl recovery hash block metadata undelete unfo...
221966,bhagatabhijeet/bhagatabhijeet,bhagatabhijeet abhijeetbhagat abhijeetbhagat s...


In [87]:
to_disk(github_data_wtopic, 't1.3.1')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t1.3.1.pkl


In [88]:
from collections import Counter
word_cnt_t1_2 = Counter()
i = 0
for t in github_data_wtopic['topics']:
    i += 1
    word_cnt_t1_2.update(t.split(' '))


word_cnt_t1_2.most_common()

[('python', 6323),
 ('deeplearning', 5300),
 ('machinelearning', 4665),
 ('computervision', 3010),
 ('javascript', 2744),
 ('security', 2657),
 ('objectdetection', 2617),
 ('pytorch', 2166),
 ('tensorflow', 2125),
 ('opencv', 2055),
 ('hacktoberfest', 2010),
 ('python3', 1781),
 ('android', 1676),
 ('docker', 1658),
 ('java', 1523),
 ('react', 1463),
 ('nodejs', 1373),
 ('aws', 1264),
 ('golang', 1252),
 ('kubernetes', 1223),
 ('awesomelist', 1220),
 ('detection', 1214),
 ('awesome', 1139),
 ('monitoring', 1126),
 ('linux', 1097),
 ('ios', 1059),
 ('php', 1015),
 ('imageprocessing', 1000),
 ('facedetection', 987),
 ('typescript', 971),
 ('keras', 871),
 ('windows', 848),
 ('datascience', 836),
 ('nlp', 823),
 ('cybersecurity', 819),
 ('api', 802),
 ('go', 782),
 ('artificialintelligence', 738),
 ('anomalydetection', 738),
 ('raspberrypi', 736),
 ('swift', 721),
 ('convolutionalneuralnetworks', 704),
 ('cnn', 698),
 ('ai', 672),
 ('cpp', 660),
 ('neuralnetwork', 653),
 ('securitytools',

In [89]:
github_data_wtopic = np.nan

## Testset 1.3 (D_T5)
```
in: name, topics
out: name, topics (tokeinzed not lematized, but w composition)

there was an error 1.3 shouldnt be lematized, so 1.3 is now saved as lemmatized and 1.3.1 is not lemma
```

In [65]:
sgf.topics_removed = []
github_data_wtopic = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()

In [66]:
github_data_wtopic['topics'] = github_data_wtopic['topics'].apply(lambda x: ' '.join(preprocess_text(' '.join(x), lem=True, do_composition=True)))

In [67]:
Counter(sgf.topics_removed).most_common()

[('d', 10),
 ('can', 9),
 ('did', 9),
 ('re', 7),
 ('it', 7),
 ('i', 5),
 ('a', 5),
 ('all', 4),
 ('up', 4),
 ('and', 4),
 ('will', 4),
 ('for', 4),
 ('this', 3),
 ('of', 3),
 ('who', 3),
 ('s', 3),
 ('from', 2),
 ('about', 2),
 ('myself', 2),
 ('the', 2),
 ('its', 2),
 ('same', 2),
 ('any', 2),
 ('is', 2),
 ('m', 2),
 ('to', 2),
 ('other', 2),
 ('now', 2),
 ('ll', 1),
 ('nor', 1),
 ('y', 1),
 ('them', 1),
 ('over', 1),
 ('down', 1),
 ('more', 1),
 ('isn', 1),
 ('at', 1),
 ('your', 1),
 ('with', 1),
 ('on', 1),
 ('off', 1),
 ('t', 1),
 ('what', 1),
 ('does', 1),
 ('an', 1),
 ('my', 1),
 ('ve', 1),
 ('do', 1),
 ('no', 1),
 ('here', 1),
 ('why', 1)]

In [68]:
github_data_wtopic

Unnamed: 0,name,topics
6,thingsym/multi-device-switcher,wordpress wordpressplugin
8,redlink-gmbh/solr-compound-word-filter,solr lucene luceneanalyzer language
9,xswlme/Delivery-Availability-Checker,amazon chromeextension fooddelivery grocerydel...
10,eastsss/ErrorDispatching,swift ios errorhandling custom chainofresponsi...
11,ahmadhuzaifa/skinskan,cnn cnnkeras skincancer tensorflow ai machinel...
...,...,...
221952,waqasbhatti/astrobase,lightcurves astronomy python variablestars
221955,hpcc-systems/vscode-ecl,vscode typescript ecl hpccplatform
221965,MarcoPon/BlockHashLoc,bhl recovery hash block metadata undelete unfo...
221966,bhagatabhijeet/bhagatabhijeet,bhagatabhijeet abhijeetbhagat abhijeetbhagat s...


In [69]:
len(set(github_data_wtopic['topics'].apply(lambda x: x.split(' ')).explode()))

88364

In [94]:
to_disk(github_data_wtopic, 't1.3')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t1.3.pkl


In [95]:
from collections import Counter
word_cnt_t1_2 = Counter()
i = 0
for t in github_data_wtopic['topics']:
    i += 1
    word_cnt_t1_2.update(t.split(' '))


word_cnt_t1_2.most_common()

[('python', 6323),
 ('deeplearning', 5300),
 ('machinelearning', 4665),
 ('computervision', 3010),
 ('javascript', 2744),
 ('security', 2657),
 ('objectdetection', 2617),
 ('pytorch', 2166),
 ('tensorflow', 2125),
 ('opencv', 2055),
 ('hacktoberfest', 2010),
 ('python3', 1781),
 ('android', 1676),
 ('docker', 1658),
 ('java', 1523),
 ('react', 1463),
 ('nodejs', 1373),
 ('monitor', 1285),
 ('aws', 1264),
 ('golang', 1252),
 ('kubernetes', 1223),
 ('awesomelist', 1220),
 ('detection', 1214),
 ('awesome', 1139),
 ('linux', 1097),
 ('ios', 1059),
 ('php', 1015),
 ('imageprocessing', 1000),
 ('facedetection', 987),
 ('typescript', 971),
 ('keras', 871),
 ('windows', 848),
 ('datascience', 836),
 ('nlp', 823),
 ('cybersecurity', 819),
 ('api', 802),
 ('go', 783),
 ('artificialintelligence', 738),
 ('anomalydetection', 738),
 ('alert', 737),
 ('raspberrypi', 736),
 ('hack', 727),
 ('swift', 721),
 ('convolutionalneuralnetworks', 704),
 ('cnn', 698),
 ('ai', 672),
 ('cpp', 660),
 ('neuralnetw

In [70]:
github_data_wtopic = np.nan

## Testset 1.4 (D_T6)
```
in: name, topics
out: name, topics (tokeinzed not lematized, but w composition and filter)
```

In [71]:
sgf.topics_removed = []
github_data_wtopic = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()

In [72]:
github_data_wtopic['topics'] = github_data_wtopic['topics'].apply(lambda x: ' '.join(preprocess_text(' '.join(x), lem=True, do_composition=True, token_list=topics_combined)))

In [73]:
Counter(sgf.topics_removed).most_common()

[('d', 10),
 ('can', 9),
 ('did', 9),
 ('re', 7),
 ('it', 7),
 ('i', 5),
 ('a', 5),
 ('all', 4),
 ('up', 4),
 ('and', 4),
 ('will', 4),
 ('for', 4),
 ('this', 3),
 ('of', 3),
 ('who', 3),
 ('s', 3),
 ('from', 2),
 ('about', 2),
 ('myself', 2),
 ('the', 2),
 ('its', 2),
 ('same', 2),
 ('any', 2),
 ('is', 2),
 ('m', 2),
 ('to', 2),
 ('other', 2),
 ('now', 2),
 ('ll', 1),
 ('nor', 1),
 ('y', 1),
 ('them', 1),
 ('over', 1),
 ('down', 1),
 ('more', 1),
 ('isn', 1),
 ('at', 1),
 ('your', 1),
 ('with', 1),
 ('on', 1),
 ('off', 1),
 ('t', 1),
 ('what', 1),
 ('does', 1),
 ('an', 1),
 ('my', 1),
 ('ve', 1),
 ('do', 1),
 ('no', 1),
 ('here', 1),
 ('why', 1)]

In [74]:
github_data_wtopic[github_data_wtopic['topics'].str.len() == 0]

Unnamed: 0,name,topics
42,LuisKay/Spec_ResNet,
57,robo-ai/roboai-python-cli,
93,ivankunyankin/intent_suggestions,
211,usnistgov/PrivacyEngCollabSpace,
248,MadDeCoDeR/Classic-RBDOOM-3-BFG,
...,...,...
221897,HAC-2020/Cookie-Army,
221904,Snausage0x45/KapeStrike,
221909,lupyuen/nuttx,
221936,cenidetiot/ngsi-parser,


In [75]:
github_data_wtopic = github_data_wtopic[github_data_wtopic['topics'].str.len() > 0]

In [76]:
len(set(github_data_wtopic['topics'].apply(lambda x: x.split(' ')).explode()))

1567

In [103]:
to_disk(github_data_wtopic, 't1.4')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t1.4.pkl


In [104]:
from collections import Counter
word_cnt_t1_2 = Counter()
i = 0
for t in github_data_wtopic['topics']:
    i += 1
    word_cnt_t1_2.update(t.split(' '))


word_cnt_t1_2.most_common()

[('python', 6323),
 ('deeplearning', 5300),
 ('machinelearning', 4665),
 ('computervision', 3010),
 ('javascript', 2744),
 ('security', 2657),
 ('objectdetection', 2617),
 ('pytorch', 2166),
 ('tensorflow', 2125),
 ('opencv', 2055),
 ('hacktoberfest', 2010),
 ('python3', 1781),
 ('android', 1676),
 ('docker', 1658),
 ('java', 1523),
 ('react', 1463),
 ('nodejs', 1373),
 ('monitor', 1285),
 ('aws', 1264),
 ('golang', 1252),
 ('kubernetes', 1223),
 ('awesomelist', 1220),
 ('detection', 1214),
 ('awesome', 1139),
 ('linux', 1097),
 ('ios', 1059),
 ('php', 1015),
 ('imageprocessing', 1000),
 ('facedetection', 987),
 ('typescript', 971),
 ('keras', 871),
 ('windows', 848),
 ('datascience', 836),
 ('nlp', 823),
 ('cybersecurity', 819),
 ('api', 802),
 ('go', 783),
 ('artificialintelligence', 738),
 ('anomalydetection', 738),
 ('alert', 737),
 ('raspberrypi', 736),
 ('hack', 727),
 ('swift', 721),
 ('convolutionalneuralnetworks', 704),
 ('cnn', 698),
 ('ai', 672),
 ('cpp', 660),
 ('neuralnetw

In [105]:
github_data_wtopic = np.nan

## Testset 2.5 (Part of D_C1)
The best performing 1 set was 1.2; so we do the same we did with 1.2 on readme and about

we set a **min_thres** here, that means this topic has to be at least in 4 repos

In [53]:
topics_combined_full = []
min_thres = 4
for i in topic_counter.items():
    t = i[0]
    o = i[1]
    t = str(t)
    if t == 'nan':
        continue
    newt = preprocess_text(t, lem=False, do_composition=False)
    if newt not in topics_combined_full:
        if o >= min_thres:
            topics_combined_full+=newt


In [54]:
len(topics_combined_full)

23732

In [55]:
len(set(topics_combined_full))

9652

In [56]:
topics_combined_full = list(set(topics_combined_full))

In [57]:
topics_combined_full

['cloudrun',
 'angular8',
 'icloud',
 'rabbitmq',
 'iac',
 'rpi',
 'circleci',
 'manifold',
 'warframe',
 'awareness',
 'fairness',
 'cmake',
 'tooltip',
 'sqlite3',
 'lane',
 'capybara',
 'trading',
 'java',
 'gpuimage',
 'cars',
 'atomic',
 'tendermint',
 'windows11',
 'postgres',
 'nim',
 'factorization',
 'syn',
 'sdn',
 'oidc',
 'cctv',
 'listen',
 'thought',
 'menubar',
 'defect',
 'decryption',
 'smt',
 'metatrader',
 'codereview',
 'rs485',
 'godot4',
 'clahe',
 'obfuscator',
 'switzerland',
 'apikey',
 'azuread',
 'husky',
 'carthage',
 '064b0s2',
 'gamepad',
 'compilers',
 'elgato',
 'tinyml',
 'semgrep',
 'texttospeech',
 'm5stack',
 'neat',
 'rumor',
 'apriltag',
 'gsm',
 'journey',
 'assistance',
 'paddlepaddle',
 'lemmatizer',
 'nashorn',
 'redisai',
 'fuzzer',
 'interview',
 'wisconsin',
 'e2ee',
 'callback',
 'measurement',
 'feedforward',
 'invoice',
 'steganography',
 'polly',
 'jsonnet',
 'res2net',
 'paint',
 'stomp',
 'hana',
 'garden',
 'inceptionv2',
 'abac',
 'b

In [58]:
def myclnfunction(x):
    return preprocess_text(x, lem=False, token_list = topics_combined_full, do_composition=False)

In [59]:
t1 = github_data_df['name'].apply(lambda x: myclnfunction(x))

In [60]:
t1_cnt = Counter(t1.explode())
t1_cnt.most_common()

[(nan, 87911),
 ('detection', 10383),
 ('awesome', 5841),
 ('aws', 2876),
 ('learning', 2736),
 ('react', 2658),
 ('project', 1840),
 ('python', 1762),
 ('js', 1753),
 ('plugin', 1666),
 ('using', 1625),
 ('object', 1518),
 ('face', 1479),
 ('security', 1464),
 ('api', 1410),
 ('app', 1390),
 ('deep', 1389),
 ('ai', 1374),
 ('data', 1347),
 ('samples', 1308),
 ('android', 1276),
 ('docker', 1261),
 ('system', 1173),
 ('machine', 1161),
 ('native', 1139),
 ('ibm', 1134),
 ('image', 1097),
 ('recognition', 1060),
 ('interview', 1028),
 ('node', 987),
 ('azure', 955),
 ('pytorch', 933),
 ('web', 932),
 ('demo', 916),
 ('cloud', 914),
 ('lab', 890),
 ('detector', 876),
 ('io', 876),
 ('tools', 866),
 ('code', 859),
 ('terraform', 831),
 ('sdk', 819),
 ('ml', 803),
 ('github', 799),
 ('alert', 798),
 ('vision', 785),
 ('google', 771),
 ('go', 762),
 ('analysis', 761),
 ('ios', 755),
 ('opencv', 744),
 ('php', 725),
 ('dev', 705),
 ('server', 688),
 ('microsoft', 686),
 ('java', 680),
 ('bas

In [61]:
t2 = github_data_df['topics'].apply(lambda x: myclnfunction(x))

In [62]:
t2_cnt = Counter(t2.explode())
t2_cnt.most_common()

[(nan, 152648),
 ('detection', 13503),
 ('learning', 12881),
 ('python', 7897),
 ('deep', 6053),
 ('machine', 5800),
 ('security', 5645),
 ('data', 4467),
 ('image', 3795),
 ('object', 3785),
 ('api', 3718),
 ('vision', 3611),
 ('computer', 3399),
 ('analysis', 3267),
 ('javascript', 3218),
 ('android', 3063),
 ('neural', 3024),
 ('tensorflow', 2893),
 ('face', 2878),
 ('recognition', 2801),
 ('react', 2770),
 ('awesome', 2753),
 ('opencv', 2686),
 ('network', 2587),
 ('processing', 2563),
 ('aws', 2554),
 ('pytorch', 2516),
 ('docker', 2462),
 ('networks', 2454),
 ('hacktoberfest', 2133),
 ('cloud', 2131),
 ('classification', 2012),
 ('library', 1942),
 ('plugin', 1902),
 ('java', 1869),
 ('monitoring', 1847),
 ('tools', 1839),
 ('language', 1707),
 ('list', 1688),
 ('python3', 1679),
 ('kubernetes', 1672),
 ('linux', 1633),
 ('azure', 1627),
 ('segmentation', 1614),
 ('web', 1554),
 ('bot', 1538),
 ('game', 1490),
 ('server', 1483),
 ('framework', 1475),
 ('malware', 1439),
 ('window

In [63]:
t3 = github_data_df['about'].apply(lambda x: myclnfunction(x))

In [64]:
t3_cnt = Counter(t3.explode())
t3_cnt.most_common()

[(nan, 32273),
 ('detection', 26104),
 ('using', 18121),
 ('data', 13173),
 ('based', 11594),
 ('learning', 10499),
 ('1', 9512),
 ('code', 9463),
 ('project', 8616),
 ('python', 8263),
 ('library', 7875),
 ('list', 7230),
 ('system', 6696),
 ('web', 6552),
 ('set', 6463),
 ('api', 6438),
 ('object', 6329),
 ('2', 6225),
 ('implementation', 6035),
 ('tool', 6006),
 ('app', 5877),
 ('application', 5587),
 ('simple', 5552),
 ('image', 5516),
 ('use', 5510),
 ('repository', 5349),
 ('c', 5316),
 ('js', 5296),
 ('source', 5254),
 ('time', 5226),
 ('https', 5148),
 ('open', 5010),
 ('network', 4975),
 ('deep', 4941),
 ('machine', 4722),
 ('framework', 4701),
 ('security', 4604),
 ('plugin', 4600),
 ('android', 4499),
 ('model', 4196),
 ('tools', 4143),
 ('github', 4092),
 ('awesome', 3971),
 ('3', 3957),
 ('analysis', 3927),
 ('resources', 3901),
 ('server', 3898),
 ('http', 3855),
 ('class', 3853),
 ('face', 3801),
 ('javascript', 3780),
 ('software', 3736),
 ('detect', 3642),
 ('images', 

In [65]:
t4 = github_data_df['languages'].apply(lambda x: myclnfunction(x))

In [66]:
t4_cnt = Counter(t4.explode())
t4_cnt.most_common()

[('python', 85119),
 ('c', 70739),
 ('shell', 57069),
 ('javascript', 49677),
 ('html', 40935),
 ('css', 29797),
 (nan, 27217),
 ('makefile', 24513),
 ('jupyter', 19789),
 ('notebook', 19789),
 ('dockerfile', 19663),
 ('java', 17412),
 ('ruby', 10653),
 ('objective', 10539),
 ('cmake', 10454),
 ('php', 9441),
 ('typescript', 8785),
 ('go', 8276),
 ('matlab', 6311),
 ('perl', 5473),
 ('powershell', 4618),
 ('cuda', 4374),
 ('swift', 3961),
 ('assembly', 3892),
 ('scss', 3615),
 ('r', 3512),
 ('kotlin', 2567),
 ('lua', 2543),
 ('tex', 2374),
 ('hcl', 2360),
 ('rust', 2280),
 ('m4', 2167),
 ('vue', 1890),
 ('scala', 1300),
 ('dart', 1199),
 ('coffeescript', 1191),
 ('glsl', 1181),
 ('groovy', 1175),
 ('tcl', 1098),
 ('awk', 1053),
 ('lisp', 1047),
 ('script', 958),
 ('vim', 942),
 ('jinja', 907),
 ('lex', 899),
 ('cython', 895),
 ('solidity', 839),
 ('asp', 810),
 ('less', 810),
 ('emacs', 769),
 ('tsql', 760),
 ('pascal', 718),
 ('basic', 676),
 ('gdb', 627),
 ('visual', 620),
 ('clojure

In [None]:
import swifter
t5 = github_data_df['readme_clean'].swifter.apply(lambda x: myclnfunction(x))

Pandas Apply:   0%|          | 0/221971 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [69]:
t5_cnt = Counter(t5.explode())
t5_cnt.most_common()

[('1', 1552767),
 ('data', 1166646),
 ('use', 1122852),
 ('2', 1032918),
 ('using', 903495),
 ('code', 868547),
 ('file', 866248),
 ('3', 736281),
 ('run', 649704),
 ('c', 587146),
 ('http', 577440),
 ('set', 563482),
 ('name', 558450),
 ('app', 557488),
 ('python', 531215),
 ('image', 520149),
 ('build', 514153),
 ('example', 503899),
 ('default', 491547),
 ('4', 487253),
 ('server', 486985),
 ('js', 485992),
 ('user', 484510),
 ('project', 482476),
 ('api', 475479),
 ('create', 472815),
 ('react', 458568),
 ('time', 454617),
 ('files', 450335),
 ('x', 447589),
 ('5', 444194),
 ('install', 439748),
 ('function', 438118),
 ('one', 430118),
 ('get', 427697),
 ('source', 422891),
 ('based', 422627),
 ('test', 417612),
 ('add', 415379),
 ('learning', 411990),
 ('system', 406250),
 ('value', 401150),
 ('support', 382163),
 ('version', 378021),
 ('type', 377257),
 ('web', 374821),
 ('library', 374242),
 ('list', 371092),
 ('model', 369961),
 ('object', 359431),
 ('10', 357256),
 ('open', 35

In [70]:
t6 = github_data_df['tree'].apply(lambda x: myclnfunction(x))

In [71]:
t6_cnt = Counter(t6.explode())
t6_cnt.most_common()

[('md', 325429),
 ('readme', 230494),
 ('py', 158959),
 ('license', 123935),
 ('json', 77162),
 ('txt', 72139),
 ('yml', 54300),
 ('js', 48160),
 ('sh', 45563),
 ('go', 40093),
 ('package', 40029),
 ('src', 39516),
 ('github', 37108),
 ('config', 35597),
 ('c', 35520),
 ('pdf', 34528),
 ('png', 32559),
 ('ipynb', 26794),
 ('requirements', 25366),
 ('test', 24770),
 ('lock', 24198),
 ('docs', 20499),
 ('makefile', 19584),
 ('setup', 19398),
 ('contributing', 18501),
 ('data', 18420),
 ('travis', 18286),
 ('php', 17592),
 ('build', 17538),
 ('cpp', 17525),
 ('gradle', 17415),
 ('tests', 17201),
 ('changelog', 16818),
 ('yaml', 16394),
 ('xml', 15856),
 ('images', 15573),
 ('html', 15360),
 ('lib', 14766),
 ('dockerfile', 14258),
 ('scripts', 13675),
 ('index', 13056),
 ('docker', 12295),
 ('app', 11872),
 ('examples', 11669),
 ('example', 10877),
 ('1', 10608),
 ('jpg', 10521),
 ('utils', 9981),
 ('editorconfig', 9065),
 ('tools', 8999),
 ('bat', 8628),
 ('assets', 8504),
 ('cfg', 8470),

### testset 2.5.1(s)
takes all 2.5 readme_clean extracted topics as set

In [73]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t5
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.5.1')

total repos 218378
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.1.pkl


In [74]:
ghd_exp['topics'] = ghd_exp['topics'].apply(lambda x: ' '.join(set(x)))

In [75]:
to_disk(ghd_exp, 't2.5.1s')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.1s.pkl


### testset 2.5.2
takes all 2.5 about extracted topics

In [76]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t3 #+t2
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.5.2')

total repos 189698
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.2.pkl


### testset 2.5.3
takes all 2.5 tree extracted topics

In [77]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t6
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.5.3')

total repos 221805
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.3.pkl


### testset 2.5.4
takes all 2.5 topics extracted topics

In [78]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t2
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.5.4')

total repos 69323
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.4.pkl


### testset 2.5.5
takes all 2.5 languages extracted topics

In [79]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t4
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.5.5')

total repos 194754
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.5.pkl


### testset 2.5.6
takes all 2.5 title extracted topics

In [80]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t1
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.5.6')

total repos 134060
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.5.6.pkl


## Testset 2.3 (Part of D_C2)
The best performing 1 set was 1.2; so we do the same we did with 1.2 on readme and about

we set a **min_thres** here, that means this topic has to be at least in 4 repos

In [22]:
topics_combined_full = []
min_thres = 4
for i in topic_counter.items():
    t = i[0]
    o = i[1]
    t = str(t)
    if t == 'nan':
        continue
    newt = preprocess_text(t, lem=True, do_composition=True)
    if newt not in topics_combined_full:
        if o >= min_thres:
            topics_combined_full+=newt


In [23]:
len(topics_combined_full)

15491

In [24]:
len(set(topics_combined_full))

14653

In [25]:
topics_combined_full = list(set(topics_combined_full))

In [26]:
topics_combined_full

['cloudrun',
 'angular8',
 'communitydetection',
 'hadoopecosystem',
 'eventstream',
 'icloud',
 'securityautomation',
 'rabbitmq',
 'iac',
 'confusionmatrix',
 'rpi',
 'circleci',
 'warframe',
 'awareness',
 'fairness',
 'cmake',
 'antdesignvue',
 'tooltip',
 'audiosignalprocessing',
 'sqlite3',
 'lane',
 'capybara',
 'encryptionalgorithm',
 'notificationcenter',
 'contextualdata',
 'loadtesting',
 'buildautomation',
 'java',
 'gpuimage',
 'idgenerator',
 'cars',
 'atomic',
 'interpretablemachinelearning',
 'snstopic',
 'windows11',
 'tendermint',
 'postgres',
 'nim',
 'factorization',
 'stockmarket',
 'resourceconstrainedml',
 'opentelemetrycollector',
 'sdn',
 'oidc',
 'cctv',
 'listen',
 'dllinjector',
 'menubar',
 'dockersecurity',
 'webaccessibility',
 'datastreamprocessing',
 'decryption',
 'smt',
 'codereview',
 'metatrader',
 'rs485',
 'peoplecounter',
 'godot4',
 'securityscan',
 'selfdrivingcarengineer',
 'clahe',
 'obfuscator',
 'coroutinesandroid',
 'rulebased',
 'switzerl

In [27]:
def myclnfunction(x):
    return preprocess_text(x, lem=True, token_list = topics_combined_full, do_composition=True)

In [28]:
t1 = github_data_df['name'].apply(lambda x: myclnfunction(x))

In [29]:
t1_cnt = Counter(t1.explode())
t1_cnt.most_common()

[(nan, 193576),
 ('js', 873),
 ('microsoft', 482),
 ('io', 343),
 ('github', 310),
 ('ibm', 206),
 ('jquery', 194),
 ('azure', 186),
 ('linuxserver', 172),
 ('google', 165),
 ('pytorch', 154),
 ('facedetection', 151),
 ('uboot', 147),
 ('net', 138),
 ('awesome', 129),
 ('awesomepython', 117),
 ('objectdetection', 115),
 ('intel', 114),
 ('googlecloudplatform', 111),
 ('publicapis', 111),
 ('iobroker', 107),
 ('anomalydetection', 100),
 ('apache', 98),
 ('awesomemachinelearning', 96),
 ('newrelic', 88),
 ('aws', 86),
 ('1', 84),
 ('jenkinsci', 83),
 ('project', 78),
 ('machinelearning', 77),
 ('mozilla', 74),
 ('facerecognition', 74),
 ('computervision', 71),
 ('creditcardfrauddetection', 68),
 ('openpilot', 68),
 ('fakenewsdetection', 67),
 ('vim', 66),
 ('core', 65),
 ('deeplearning', 64),
 ('prometheus', 64),
 ('facemaskdetection', 63),
 ('alert', 62),
 ('docker', 59),
 ('dotfiles', 58),
 ('ai', 58),
 ('splunk', 58),
 ('infineon', 57),
 ('seanpm2001', 57),
 ('oracle', 55),
 ('homeass

In [30]:
t2 = github_data_df['topics'].apply(lambda x: myclnfunction(x))

In [31]:
t2_cnt = Counter(t2.explode())
t2_cnt.most_common()

[(nan, 153668),
 ('python', 6323),
 ('deeplearning', 5300),
 ('machinelearning', 4665),
 ('computervision', 3010),
 ('javascript', 2744),
 ('security', 2657),
 ('objectdetection', 2617),
 ('pytorch', 2166),
 ('tensorflow', 2125),
 ('opencv', 2055),
 ('hacktoberfest', 2010),
 ('python3', 1781),
 ('android', 1676),
 ('docker', 1658),
 ('java', 1523),
 ('react', 1463),
 ('nodejs', 1373),
 ('monitor', 1285),
 ('aws', 1264),
 ('golang', 1252),
 ('kubernetes', 1223),
 ('awesomelist', 1220),
 ('detection', 1214),
 ('awesome', 1139),
 ('linux', 1097),
 ('ios', 1059),
 ('php', 1015),
 ('imageprocessing', 1000),
 ('facedetection', 987),
 ('typescript', 971),
 ('keras', 871),
 ('windows', 848),
 ('datascience', 836),
 ('nlp', 823),
 ('cybersecurity', 819),
 ('api', 802),
 ('go', 783),
 ('artificialintelligence', 738),
 ('anomalydetection', 738),
 ('alert', 737),
 ('raspberrypi', 736),
 ('hack', 727),
 ('swift', 721),
 ('convolutionalneuralnetworks', 704),
 ('cnn', 698),
 ('ai', 672),
 ('cpp', 660

In [32]:
t3 = github_data_df['about'].apply(lambda x: myclnfunction(x))

In [33]:
t3_cnt = Counter(t3.explode())
t3_cnt.most_common()

[(nan, 33273),
 ('detection', 25620),
 ('learn', 11396),
 ('data', 10783),
 ('project', 10742),
 ('tool', 10131),
 ('code', 9236),
 ('image', 9042),
 ('base', 8788),
 ('1', 8014),
 ('python', 7972),
 ('library', 7828),
 ('list', 7768),
 ('network', 7280),
 ('object', 6958),
 ('set', 6935),
 ('model', 6785),
 ('system', 6600),
 ('api', 6266),
 ('web', 6129),
 ('detect', 6015),
 ('implementation', 5941),
 ('file', 5867),
 ('application', 5523),
 ('simple', 5499),
 ('app', 5473),
 ('script', 5352),
 ('repository', 5332),
 ('build', 5273),
 ('https', 5138),
 ('c', 4920),
 ('machine', 4848),
 ('deep', 4808),
 ('js', 4655),
 ('plugin', 4633),
 ('framework', 4614),
 ('paper', 4547),
 ('source', 4511),
 ('security', 4441),
 ('android', 4430),
 ('alert', 4395),
 ('create', 4391),
 ('face', 4191),
 ('support', 4170),
 ('class', 4135),
 ('monitor', 4100),
 ('make', 4050),
 ('github', 4023),
 ('open', 4004),
 ('service', 3924),
 ('resources', 3885),
 ('analysis', 3870),
 ('http', 3808),
 ('awesome

In [34]:
t4 = github_data_df['languages'].apply(lambda x: myclnfunction(x))

In [35]:
t4_cnt = Counter(t4.explode())
t4_cnt.most_common()

[('python', 85119),
 ('c', 60229),
 ('shell', 57069),
 ('javascript', 49677),
 ('html', 40935),
 ('css', 29797),
 (nan, 27065),
 ('makefile', 24513),
 ('jupyter', 19789),
 ('notebook', 19789),
 ('dockerfile', 19663),
 ('java', 17412),
 ('ruby', 10653),
 ('objectivec', 10510),
 ('cmake', 10454),
 ('php', 9441),
 ('batchfile', 8983),
 ('typescript', 8785),
 ('go', 8276),
 ('matlab', 6311),
 ('perl', 5473),
 ('powershell', 4618),
 ('cuda', 4374),
 ('swift', 3961),
 ('assembly', 3892),
 ('scss', 3615),
 ('r', 3512),
 ('kotlin', 2567),
 ('lua', 2543),
 ('tex', 2374),
 ('hcl', 2360),
 ('rust', 2280),
 ('vue', 1890),
 ('scala', 1300),
 ('dart', 1199),
 ('coffeescript', 1191),
 ('glsl', 1181),
 ('groovy', 1175),
 ('tcl', 1098),
 ('awk', 1053),
 ('lisp', 1047),
 ('script', 958),
 ('vim', 942),
 ('jinja', 907),
 ('lex', 899),
 ('cython', 895),
 ('solidity', 839),
 ('asp', 810),
 ('less', 810),
 ('emacs', 769),
 ('tsql', 760),
 ('pascal', 718),
 ('basic', 676),
 ('gdb', 627),
 ('visual', 620),
 (

In [36]:
t5 = github_data_df['readme_clean'].apply(lambda x: myclnfunction(x))

In [37]:
t5_cnt = Counter(t5.explode())
t5_cnt.most_common()

[('1', 1462653),
 ('file', 1280296),
 ('data', 1116929),
 ('code', 841991),
 ('test', 757078),
 ('image', 740513),
 ('create', 708906),
 ('build', 706714),
 ('set', 696386),
 ('name', 661142),
 ('support', 614297),
 ('function', 576514),
 ('model', 570529),
 ('project', 569795),
 ('make', 560286),
 ('http', 558639),
 ('c', 530918),
 ('app', 515640),
 ('get', 506224),
 ('learn', 502459),
 ('python', 496298),
 ('example', 491264),
 ('network', 477513),
 ('js', 467012),
 ('type', 466813),
 ('api', 446833),
 ('server', 444425),
 ('tool', 444002),
 ('user', 443096),
 ('install', 439300),
 ('object', 435509),
 ('command', 434523),
 ('work', 431994),
 ('list', 431816),
 ('time', 431029),
 ('5', 422647),
 ('service', 415736),
 ('source', 413675),
 ('change', 410782),
 ('x', 406484),
 ('system', 389516),
 ('script', 380698),
 ('include', 379772),
 ('string', 378714),
 ('feature', 372346),
 ('version', 370540),
 ('library', 368236),
 ('log', 363241),
 ('process', 361301),
 ('number', 352768),
 (

In [38]:
t6 = github_data_df['tree'].apply(lambda x: myclnfunction(x))

In [39]:
t6_cnt = Counter(t6.explode())
t6_cnt.most_common()

[('md', 325331),
 ('readme', 226553),
 ('license', 122166),
 ('json', 76702),
 ('txt', 72106),
 ('yml', 54284),
 ('js', 47756),
 ('sh', 45016),
 ('go', 39764),
 ('src', 38999),
 ('test', 38897),
 ('github', 36783),
 ('c', 35230),
 ('pdf', 34427),
 ('png', 32537),
 ('package', 31035),
 ('config', 30235),
 ('ipynb', 26781),
 ('requirements', 23195),
 ('docs', 20048),
 ('makefile', 19489),
 ('contribute', 18652),
 ('setup', 18332),
 ('travis', 18122),
 ('cpp', 17375),
 ('gradle', 17296),
 ('image', 17248),
 ('php', 17190),
 ('changelog', 16603),
 ('yaml', 16327),
 ('data', 16306),
 ('xml', 15719),
 ('build', 15489),
 ('html', 15164),
 ('script', 15031),
 ('lib', 14292),
 ('dockerfile', 13717),
 ('lock', 12860),
 ('index', 12590),
 ('model', 12531),
 ('examples', 11218),
 ('jpg', 10511),
 ('app', 10330),
 ('utils', 9517),
 ('editorconfig', 9067),
 ('example', 8789),
 ('tool', 8714),
 ('bat', 8635),
 ('assets', 8324),
 ('cfg', 8289),
 ('1', 7544),
 ('bin', 7292),
 ('toml', 7251),
 ('manifes

### testset 2.3.1(s)
takes all 2.3 readme_clean extracted topics as set

In [44]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t5
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.3.1')

total repos 218335
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.1.pkl


In [45]:
ghd_exp['topics'] = ghd_exp['topics'].apply(lambda x: ' '.join(set(x)))

In [46]:
to_disk(ghd_exp, 't2.3.1s')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.1s.pkl


### testset 2.3.2
takes all 2.3 about extracted topics

In [47]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t3 #+t2
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.3.2')

total repos 188698
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.2.pkl


### testset 2.3.3
takes all 2.5 tree extracted topics

In [48]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t6
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.3.3')

total repos 221696
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.3.pkl


### testset 2.3.4
takes all 2.5 topics extracted topics

In [49]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t2
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.3.4')

total repos 68303
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.4.pkl


### testset 2.3.5
takes all 2.5 languages extracted topics

In [50]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t4
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.3.5')

total repos 194906
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.5.pkl


### testset 2.3.6
takes all 2.3 title extracted topics

In [51]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t1
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.3.6')

total repos 28395
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.3.6.pkl


## Testset 2.4 (Part of D_C3)
The best performing 1 set was 1.2; so we do the same we did with 1.2 on readme and about

we set a **min_thres** here, that means this topic has to be at least in 4 repos

In [16]:
total_repos_with_topics*0.001

69.81700000000001

In [17]:
topics_combined_full24 = []
min_thres = total_repos_with_topics*0.001
for i in topic_counter.items():
    t = i[0]
    o = i[1]
    t = str(t)
    if t == 'nan':
        continue
    newt = preprocess_text(t, lem=True, do_composition=True)
    if newt not in topics_combined_full24:
        if o >= min_thres:
            topics_combined_full24 += newt


In [18]:
len(set(topics_combined_full24))

743

In [19]:
len(topics_combined_full24)

773

In [20]:
topics_combined_full24 = list(set(topics_combined_full24))

In [21]:
def myclnfunction(x):
    return preprocess_text(x, lem=True, token_list = topics_combined_full24, do_composition=True)

In [22]:
t1 = github_data_df['name'].apply(lambda x: myclnfunction(x))

In [23]:
t1_cnt = Counter(t1.explode())
t1_cnt.most_common()

[(nan, 212929),
 ('js', 873),
 ('microsoft', 482),
 ('github', 310),
 ('jquery', 194),
 ('azure', 186),
 ('google', 165),
 ('pytorch', 154),
 ('facedetection', 151),
 ('awesome', 129),
 ('objectdetection', 115),
 ('intel', 114),
 ('googlecloudplatform', 111),
 ('anomalydetection', 100),
 ('aws', 86),
 ('project', 78),
 ('machinelearning', 77),
 ('facerecognition', 74),
 ('computervision', 71),
 ('vim', 66),
 ('deeplearning', 64),
 ('prometheus', 64),
 ('alert', 62),
 ('docker', 59),
 ('ai', 58),
 ('splunk', 58),
 ('bootstrap', 53),
 ('paper', 53),
 ('openshift', 53),
 ('grafana', 51),
 ('tool', 50),
 ('resources', 50),
 ('tensorflow', 49),
 ('yararules', 46),
 ('script', 45),
 ('darknet', 43),
 ('frauddetection', 43),
 ('tutorials', 43),
 ('vehicledetection', 43),
 ('cnn', 41),
 ('misp', 41),
 ('monitor', 40),
 ('lanedetection', 39),
 ('opencv', 38),
 ('blog', 37),
 ('ids', 37),
 ('owasp', 37),
 ('javascript', 36),
 ('python', 36),
 ('udacity', 36),
 ('algorithms', 36),
 ('transformers

In [24]:
t2 = github_data_df['topics'].apply(lambda x: myclnfunction(x))

In [25]:
t2_cnt = Counter(t2.explode())
t2_cnt.most_common()

[(nan, 161626),
 ('python', 6323),
 ('deeplearning', 5300),
 ('machinelearning', 4665),
 ('computervision', 3010),
 ('javascript', 2744),
 ('security', 2657),
 ('objectdetection', 2617),
 ('pytorch', 2166),
 ('tensorflow', 2125),
 ('opencv', 2055),
 ('hacktoberfest', 2010),
 ('python3', 1781),
 ('android', 1676),
 ('docker', 1658),
 ('java', 1523),
 ('react', 1463),
 ('nodejs', 1373),
 ('monitor', 1285),
 ('aws', 1264),
 ('golang', 1252),
 ('kubernetes', 1223),
 ('awesomelist', 1220),
 ('detection', 1214),
 ('awesome', 1139),
 ('linux', 1097),
 ('ios', 1059),
 ('php', 1015),
 ('imageprocessing', 1000),
 ('facedetection', 987),
 ('typescript', 971),
 ('keras', 871),
 ('windows', 848),
 ('datascience', 836),
 ('nlp', 823),
 ('cybersecurity', 819),
 ('api', 802),
 ('go', 783),
 ('artificialintelligence', 738),
 ('anomalydetection', 738),
 ('alert', 737),
 ('raspberrypi', 736),
 ('hack', 727),
 ('swift', 721),
 ('convolutionalneuralnetworks', 704),
 ('cnn', 698),
 ('ai', 672),
 ('cpp', 660

In [26]:
t3 = github_data_df['about'].apply(lambda x: myclnfunction(x))

In [27]:
t3_cnt = Counter(t3.explode())
t3_cnt.most_common()

[(nan, 53413),
 ('detection', 25620),
 ('learn', 11396),
 ('data', 10783),
 ('project', 10742),
 ('tool', 10131),
 ('code', 9236),
 ('image', 9042),
 ('python', 7972),
 ('library', 7828),
 ('list', 7768),
 ('network', 7280),
 ('api', 6266),
 ('web', 6129),
 ('application', 5523),
 ('app', 5473),
 ('script', 5352),
 ('https', 5138),
 ('c', 4920),
 ('js', 4655),
 ('plugin', 4633),
 ('framework', 4614),
 ('paper', 4547),
 ('security', 4441),
 ('android', 4430),
 ('alert', 4395),
 ('face', 4191),
 ('monitor', 4100),
 ('github', 4023),
 ('resources', 3885),
 ('analysis', 3870),
 ('http', 3808),
 ('awesome', 3777),
 ('javascript', 3734),
 ('test', 3644),
 ('server', 3494),
 ('go', 3103),
 ('program', 3022),
 ('google', 3014),
 ('track', 2968),
 ('recognition', 2892),
 ('cloud', 2860),
 ('react', 2860),
 ('windows', 2792),
 ('package', 2778),
 ('dataset', 2741),
 ('module', 2738),
 ('ios', 2687),
 ('aws', 2679),
 ('video', 2661),
 ('design', 2605),
 ('node', 2539),
 ('train', 2507),
 ('game',

In [28]:
t4 = github_data_df['languages'].apply(lambda x: myclnfunction(x))

In [29]:
t4_cnt = Counter(t4.explode())
t4_cnt.most_common()

[('python', 85119),
 ('c', 60229),
 ('shell', 57069),
 ('javascript', 49677),
 ('html', 40935),
 (nan, 30353),
 ('css', 29797),
 ('jupyter', 19789),
 ('java', 17412),
 ('ruby', 10653),
 ('objectivec', 10510),
 ('cmake', 10454),
 ('php', 9441),
 ('typescript', 8785),
 ('go', 8276),
 ('matlab', 6311),
 ('perl', 5473),
 ('powershell', 4618),
 ('cuda', 4374),
 ('swift', 3961),
 ('assembly', 3892),
 ('scss', 3615),
 ('r', 3512),
 ('kotlin', 2567),
 ('lua', 2543),
 ('rust', 2280),
 ('vue', 1890),
 ('scala', 1300),
 ('dart', 1199),
 ('script', 958),
 ('vim', 942),
 ('solidity', 839),
 ('emacs', 769),
 ('clojure', 606),
 ('yara', 582),
 ('hack', 543),
 ('arduino', 454),
 ('elixir', 382),
 ('language', 374),
 ('protocol', 258),
 ('svelte', 212),
 ('ml', 203),
 ('sass', 145),
 ('markdown', 129),
 ('module', 123),
 ('management', 123),
 ('yaml', 104),
 ('nginx', 103),
 ('json', 93),
 ('webassembly', 91),
 ('api', 49),
 ('game', 40),
 ('workflow', 34),
 ('js', 28),
 ('xml', 24),
 ('template', 19),

In [None]:
t5 = github_data_df['readme_clean'].apply(lambda x: myclnfunction(x))

In [42]:
t5_cnt = Counter(t5.explode())
t5_cnt.most_common()

[('1', 1462653),
 ('file', 1280296),
 ('data', 1116929),
 ('code', 841991),
 ('test', 757078),
 ('image', 740513),
 ('create', 708906),
 ('build', 706714),
 ('set', 696386),
 ('name', 661142),
 ('support', 614297),
 ('function', 576514),
 ('model', 570529),
 ('project', 569795),
 ('make', 560286),
 ('http', 558639),
 ('c', 530918),
 ('app', 515640),
 ('get', 506224),
 ('learn', 502459),
 ('python', 496298),
 ('example', 491264),
 ('network', 477513),
 ('js', 467012),
 ('type', 466813),
 ('api', 446833),
 ('server', 444425),
 ('tool', 444002),
 ('user', 443096),
 ('install', 439300),
 ('object', 435509),
 ('command', 434523),
 ('work', 431994),
 ('list', 431816),
 ('time', 431029),
 ('5', 422647),
 ('service', 415736),
 ('source', 413675),
 ('change', 410782),
 ('x', 406484),
 ('system', 389516),
 ('script', 380698),
 ('include', 379772),
 ('string', 378714),
 ('feature', 372346),
 ('version', 370540),
 ('library', 368236),
 ('log', 363241),
 ('process', 361301),
 ('number', 352768),
 (

In [None]:
t6 = github_data_df['tree'].apply(lambda x: myclnfunction(x))

In [43]:
t6_cnt = Counter(t6.explode())
t6_cnt.most_common()

[('md', 325331),
 ('readme', 226553),
 ('license', 122166),
 ('json', 76702),
 ('txt', 72106),
 ('yml', 54284),
 ('js', 47756),
 ('sh', 45016),
 ('go', 39764),
 ('src', 38999),
 ('test', 38897),
 ('github', 36783),
 ('c', 35230),
 ('pdf', 34427),
 ('png', 32537),
 ('package', 31035),
 ('config', 30235),
 ('ipynb', 26781),
 ('requirements', 23195),
 ('docs', 20048),
 ('makefile', 19489),
 ('contribute', 18652),
 ('setup', 18332),
 ('travis', 18122),
 ('cpp', 17375),
 ('gradle', 17296),
 ('image', 17248),
 ('php', 17190),
 ('changelog', 16603),
 ('yaml', 16327),
 ('data', 16306),
 ('xml', 15719),
 ('build', 15489),
 ('html', 15164),
 ('script', 15031),
 ('lib', 14292),
 ('dockerfile', 13717),
 ('lock', 12860),
 ('index', 12590),
 ('model', 12531),
 ('examples', 11218),
 ('jpg', 10511),
 ('app', 10330),
 ('utils', 9517),
 ('editorconfig', 9067),
 ('example', 8789),
 ('tool', 8714),
 ('bat', 8635),
 ('assets', 8324),
 ('cfg', 8289),
 ('1', 7544),
 ('bin', 7292),
 ('toml', 7251),
 ('manifes

### testset 2.4.1s
takes all 2.4 readme_clean extracted topics as set

In [44]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t5
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.4.1')

total repos 217922
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.1.pkl


In [45]:
ghd_exp['topics'] = ghd_exp['topics'].apply(lambda x: ' '.join(set(x)))

In [46]:
to_disk(ghd_exp, 't2.4.1s')

/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.1s.pkl


### testset 2.4.2
takes all 2.4 about extracted topics

In [47]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t3 #+t2
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.4.2')

total repos 168558
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.2.pkl


### testset 2.4.3
takes all 2.4 tree extracted topics

In [48]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t6
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.4.3')

total repos 167102
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.3.pkl


### testset 2.4.4
takes all 2.4 topics extracted topics

In [49]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t2
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.4.4')

total repos 60345
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.4.pkl


### testset 2.4.5
takes all 2.5 languages extracted topics

In [50]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t4
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.4.5')

total repos 191618
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.5.pkl


### testset 2.4.6
takes all 2.4 title extracted topics

In [51]:
ghd_exp = pd.DataFrame()
ghd_exp['name'] = github_data_df['name'].copy()
ghd_exp['topics'] = t1
print(f"total repos {len(ghd_exp[ghd_exp['topics'].str.len() > 0])}")
ghd_exp = ghd_exp[ghd_exp['topics'].str.len() > 0]
#ghd_exp = github_data_df[github_data_df['topics'].str.len() > 0][['name', 'topics']].copy()
#ghd_exp
to_disk(ghd_exp, 't2.4.6')

total repos 9042
/home/manuel/d3tect-tools/Notebooks/raw_testdata_v1/t2.4.6.pkl
