# Generate Most Frequent Projects Subset

Goal: Include all messages from the 100 projects that have the most commit messages.

#### Load Data

In [1]:
import pandas as pd

data = pd.read_pickle("../data/02_All_Decreased_Filesize.pkl")
data.head(3)

Unnamed: 0,message,author_email,project
0,setup: Detect if wheel and twine installed,gcushen@users.noreply.github.com,gcushen_mezzanine-api
1,[Builder] Adding root page in any case,g.passault@gmail.com,Gregwar_Slidey
2,Added web.Urlencode method,hoisie@gmail.com,hoisie_web


#### Take the subset

For the final model, the 100 projects with the highest amount of commit messages are used.

In [2]:
from collections import Counter

project_count = Counter(data["project"])
included_projects_counter = project_count.most_common(100)

In [3]:
project_count.most_common(100)

[('saltstack_salt', 17501),
 ('moodle_moodle', 16754),
 ('rails_rails', 11565),
 ('gem_oq-engine', 8228),
 ('cakephp_cakephp', 5636),
 ('lxc_lxd', 5584),
 ('juju_juju', 5313),
 ('kubernetes_kubernetes', 5193),
 ('ccxt_ccxt', 5050),
 ('laravel_framework', 4829),
 ('puppetlabs_puppet', 4518),
 ('chef_chef', 4494),
 ('SeleniumHQ_selenium', 4113),
 ('symfony_symfony', 4083),
 ('spyder-ide_spyder', 4038),
 ('keybase_client', 3605),
 ('TryGhost_Ghost', 3225),
 ('koala-framework_koala-framework', 3112),
 ('moby_moby', 3070),
 ('matomo-org_matomo', 2908),
 ('bolt_bolt', 2889),
 ('google_closure-compiler', 2846),
 ('contao_contao', 2816),
 ('Automattic_wp-calypso', 2751),
 ('terraform-providers_terraform-provider-aws', 2740),
 ('concrete5_concrete5', 2670),
 ('jenkinsci_jenkins', 2656),
 ('openlayers_openlayers', 2655),
 ('vitessio_vitess', 2541),
 ('fastlane_fastlane', 2476),
 ('VoltDB_voltdb', 2403),
 ('pyviz_holoviews', 2382),
 ('influxdata_influxdb', 2379),
 ('silverstripe_silverstripe-fram

#### Transform to a list

In [4]:
included_projects = [project[0] for project in included_projects_counter]
included_projects

['saltstack_salt',
 'moodle_moodle',
 'rails_rails',
 'gem_oq-engine',
 'cakephp_cakephp',
 'lxc_lxd',
 'juju_juju',
 'kubernetes_kubernetes',
 'ccxt_ccxt',
 'laravel_framework',
 'puppetlabs_puppet',
 'chef_chef',
 'SeleniumHQ_selenium',
 'symfony_symfony',
 'spyder-ide_spyder',
 'keybase_client',
 'TryGhost_Ghost',
 'koala-framework_koala-framework',
 'moby_moby',
 'matomo-org_matomo',
 'bolt_bolt',
 'google_closure-compiler',
 'contao_contao',
 'Automattic_wp-calypso',
 'terraform-providers_terraform-provider-aws',
 'concrete5_concrete5',
 'jenkinsci_jenkins',
 'openlayers_openlayers',
 'vitessio_vitess',
 'fastlane_fastlane',
 'VoltDB_voltdb',
 'pyviz_holoviews',
 'influxdata_influxdb',
 'silverstripe_silverstripe-framework',
 'buildbot_buildbot',
 'angr_angr',
 'Katello_katello',
 'pypa_setuptools',
 'hashicorp_terraform',
 'fisharebest_webtrees',
 'Koenkk_zigbee-shepherd-converters',
 'hazelcast_hazelcast',
 'cilium_cilium',
 'fog_fog',
 'SAP_openui5',
 'pandas-dev_pandas',
 'ori

#### Filter the dataframe

In [5]:
subset = data.where(data['project'] == included_projects[0])

for project in included_projects[1:]:
    subset = pd.concat([subset, data.where(data['project'] == project)])

In [6]:
subset = subset.dropna()
subset.reset_index(drop=True, inplace=True)
subset

Unnamed: 0,message,author_email,project
0,Ensure topic as bytes when zmq_filtering enabl...,pengyao@pengyao.org,saltstack_salt
1,Fix the process_test.test_kill failure in <I>,janderson@saltstack.com,saltstack_salt
2,Add state.pkg to highstate outputters,thatch45@gmail.com,saltstack_salt
3,Fix mis-naming from pylint cleanup,jacksontj.89@gmail.com,saltstack_salt
4,restartcheck: update function doc\n\nThe doc c...,adrian.ratiu@ni.com,saltstack_salt
...,...,...,...
271268,fix test against inline resources for autoload...,bryanv@continuum.io,bokeh_bokeh
271269,FIX: forgot to save files last time,humongo.shi@gmail.com,bokeh_bokeh
271270,add test for histogram ill-defined data,almar.klein@gmail.com,bokeh_bokeh
271271,Added section reference to DEFAULT_HELP_URL,adam.subanloewen@gmail.com,bokeh_bokeh


#### Check whether the sum is correct

In [7]:
sum([count for _, count in included_projects_counter])

271312

The sum is correct.

#### Save dataset

In [8]:
subset.to_pickle('../data/03b_Projects_Subset.pkl')