### Analysis python module in each python file

In [40]:
import pandas as pd

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings("ignore")

In [2]:
py = pd.read_csv('/sfs/lustre/bahamut/scratch/yt9mh/RA-github/data/py_content.csv')
py.shape

(37232, 10)

In [3]:
py.head()

Unnamed: 0,content,function,havePandas,haveSklearn,id,language,module,path,ref,repo_name
0,"""""""\n[Datadog](http://www.datadoghq.com/) is a...","'Handler', 'deque'",False,False,d25be756c5507ba966156ec181ea354b6d4c5d95,py,"'Handler', 'logging', 'collections', 'dogapi'",src/diamond/handler/datadog.py,refs/heads/master,Netuitive/netuitive-diamond
1,"""""""\nNOTE: this API is WIP and has not yet bee...","'APIView', 'Response', 'status', 'permissions'...",False,False,7594f5345cc557e8e11e9aaeed55c06829c02b2d,py,"'rest_framework.views', 'rest_framework.respon...",openedx/core/djangoapps/user_api/preferences/v...,refs/heads/master,IndonesiaX/edx-platform
2,"""""""Utilities for writing code that runs on Pyt...",,False,False,eae31454ae697a8759dd9de6f781f9a4f09ed776,py,"'operator', 'sys', 'types', 'io', 'StringIO', ...",vendor-local/lib/python/six.py,refs/heads/master,mozilla/kitchensinkserver
3,from Tkinter import *\n\n\nclass Test(Frame):\...,'*',False,False,f55f1bee723e650c3c0b5f7712476606c42f8caa,py,'Tkinter',python/src/Demo/tkinter/matt/packer-simple.py,refs/heads/master,mozilla/stoneridge
4,# -*- coding: utf-8 -*-\n#####################...,"'models', 'api'",False,False,4113bec01e4d7cdfeb88e1c71ad99acc4e837b7a,py,"'logging', 'openerp'",document_reindex/models/ir_attachment.py,refs/heads/9.0,ClearCorp/knowledge


### Get company_name

In [4]:
py['company_name'] = py['repo_name'].apply(lambda x: str(x).split('/')[0])

### Get module usage frequency 
module reference website
https://github.com/josephmisiti/awesome-machine-learning

In [5]:
py['module'].str.split(expand=True).stack().value_counts().head(40)

'__future__',                                22469
'os',                                         6646
'sys',                                        4142
'numpy',                                      3262
'tensorflow',                                 2844
'json',                                       2263
'logging',                                    2254
're',                                         2078
'unittest',                                   2035
'time',                                       2021
'datetime',                                   1752
'collections',                                1583
'absl',                                       1397
'absl.testing',                               1182
'six',                                        1092
'mock',                                        931
'tempfile',                                    858
'django.conf',                                 846
'subprocess',                                  843
'argparse',                    

In [6]:
from itertools import chain

In [7]:
module_list = ['scipy', 'tensorflow', 'pandas', 'sklearn', 'keras', 'torch', 'numpy', 'edward', 'mxnet', 'lightgbm', 'astropy',
            'sympy', 'stats', 'math', 'nltk', 'spacy', 'bs4', 'scrapy', 'lime', 'mars', 'ggplot', 'seaborn', 'matplotlib',
             'pylab', 'theano', 'pattern', 'metric_learn', 'pyspark', 'networkx','caffe','google']

ml_index = []
for j in module_list:
    ml_module = list(py['module'].apply(lambda x: j in str(x)))

    res = list(filter(lambda i: ml_module[i], range(len(ml_module)))) 
    ml_index.append(res)

In [8]:
ml_index = list(chain.from_iterable(ml_index))
ml_index = list(set(ml_index))

In [9]:
py['repo_name'].value_counts()

tensorflow/probability              719
google/starthinker                  572
tensorflow/tensor2tensor            446
tensorflow/tfx                      442
masschallenge/django-accelerator    432
                                   ... 
douban/pymesos                        1
PolicyStat/selenium-old               1
enstrategic/django-sql-explorer       1
watchmanmonitoring/SavingThrow        1
google/yara-procdump-python           1
Name: repo_name, Length: 2165, dtype: int64

### Set 1 to rows content using ml modules and 0 for no

In [10]:
py['ml_module'] = 0
for i in ml_index:
    py['ml_module'][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  py['ml_module'][i] = 1


### Get python files using machine learning modules

In [11]:
py_ml = py.iloc[ml_index, :]
py_ml.shape

(8658, 12)

### Repo for these files

In [12]:
len(set(py_ml['repo_name']))

516

### What companies use those packages?

In [13]:
print(set(py_ml['company_name']))

{'amd', 'osuripple', 'athento', 'translate', 'caffee', 'datascopeanalytics', 'microsoft', 'TuSimple', 'franzinc', 'postmates', 'pennlabs', 'esa', 'FundersClub', 'leancloud', 'Medium', 'portify', 'freshplanet', 'Zemanta', 'indico', 'lechat', 'Clever', 'lyft', 'bosondata', 'biolab', 'svven', 'xeneta', 'infoculture', 'Kray', 'nordam', 'ironmussa', 'spoqa', 'minio', 'librato', 'intenthq', 'cbr', 'bloomberg', 'tst', 'doordash', 'kibitzr', 'TriumphLLC', 'cni', 'bileto', 'TribeMedia', 'sloe', 'onlinecity', 'PolicyStat', 'bq', 'underdogio', 'mo', 'signalfx', 'Netuitive', 'Knotis', 'Electroid', 'collectiveacuity', 'uncovertruth', 'eleme', 'gameduell', 'wakatime', 'smartfile', 'YourMD', 'google', 'vecna', 'dad', 'bigcommerce', 'tableau', 'zegami', 'GeekPark', 'makerbot', 'facebook', 'mapbox', 'vfine', 'hammerlab', 'yahoo', 'axsemantics', 'Microsoft', 'linuxfoundation', 'getyourguide', 'medallia', 'Flat', 'MinnPost', 'Capgemini', 'MateLabs', 'intel', 'andela', '6aika', 'cloudflare', 'tensorflow',

### Company repo using ml vs not using ml

In [46]:
py.groupby(['company_name', 'ml_module']).count().head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,content,function,havePandas,haveSklearn,id,language,module,path,ref,repo_name
company_name,ml_module,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18F,0,113,101,113,113,113,113,110,113,113,113
18F,1,1,1,1,1,1,1,1,1,1,1
3scale,0,3,2,3,3,3,3,3,3,3,3
500px,0,3,3,3,3,3,3,3,3,3,3
6aika,0,104,91,104,104,104,104,92,104,104,104
6aika,1,2,2,2,2,2,2,2,2,2,2
91springboard,0,1,1,1,1,1,1,1,1,1,1
99designs,0,4,4,4,4,4,4,4,4,4,4
ASOdesk,0,2,2,2,2,2,2,2,2,2,2
Accenture,0,1,0,1,1,1,1,1,1,1,1


### Get commit info for py_ml

In [14]:
ml_commit = pd.read_csv('py_commit.csv')
ml_commit.shape

(88688, 12)

In [15]:
ml_commit.sort_values(by = 'new_sha1', inplace = True)

In [16]:
# get df without committer name = github
ml_committer_not_github = ml_commit[ml_commit['committer_name'] != 'GitHub']

### Get committer name = GitHub (Merge)

In [17]:
ml_committer_github = ml_commit[ml_commit['committer_name'] == 'GitHub']
ml_committer_github.shape

(7503, 12)

In [47]:
 ml_committer_github.head()

Unnamed: 0,commit,tree,parent,committer_name,commit_time,subject,message,old_path,new_path,old_sha1,new_sha1,repo
38606,cc1168d47124a8fe26bba534738407754784af35,3a86c8eb0c198b22642bb2c9d9e11f9851969b07,"['67d3e73f6ef96ac8cf9dd038ede03f142fb22b13', '...",GitHub,2019-10-09 03:00:38+00:00,Merge pull request #199 from tensorflow/minor-...,Merge pull request #199 from tensorflow/minor-...,lucid/misc/graph_analysis/overlay_graph.py,lucid/misc/graph_analysis/overlay_graph.py,d2636507b92433a5c1d0189fa5faab3a7ca25e24,0002123f6b950404b69967f28f68aab85a8b423f,tensorflow/lucid
48176,2c76fd556b4a32566d7b673411b7622d324df443,32a68dfe314c10ef4f8913e47ac85811f3dd5789,"['388cd26efe15f10527a74b7bc175ec19de0de9eb', '...",GitHub,2019-08-06 17:38:33+00:00,Merge pull request #6 from nkrishnaswami/master,Merge pull request #6 from nkrishnaswami/maste...,samples/eurostat/population_density/transform_...,samples/eurostat/population_density/transform_...,,00191b351b5533d6c0dccc8cf4a2d91e2db253d2,google/dspl
47372,6922bf58d6161c9e0bb05650fdce8f33b3fbbfcd,8a09d02ec1dc9010d6648f7d510d11f8f114bcd6,['fffccc683984d1c2d706d9fda2fd102f6f72e544'],GitHub,2020-01-03 01:21:41+00:00,Update keras2ckpt.py,Update keras2ckpt.py,lingvo/tools/keras2ckpt.py,lingvo/tools/keras2ckpt.py,2a69c7fcd7c0723ea02aa802bb2c3fbca2cca0c7,002a4f53e107c496d5c03ce0ccb1bae04f87f715,tensorflow/lingvo
19026,5f54a117c2b031169b72368ed7ad9566673cabf9,17c63f5940e9774c98e00e3e0acb053a3c0cc684,['f70cbc00aaac4a779305bd92b825fb4dd03adf2a'],GitHub,2019-07-22 15:30:50+00:00,Enable conversion of AutoML Object Detection M...,Enable conversion of AutoML Object Detection M...,python/tensorflowjs/write_weights.py,python/tensorflowjs/write_weights.py,d8e9551a360bcfd97e6d1cb241b290190961e160,00325141eb4563e8b50d9aac07de09f2978f61d7,tensorflow/tfjs-converter
19029,5f54a117c2b031169b72368ed7ad9566673cabf9,17c63f5940e9774c98e00e3e0acb053a3c0cc684,['f70cbc00aaac4a779305bd92b825fb4dd03adf2a'],GitHub,2019-07-22 15:30:50+00:00,Enable conversion of AutoML Object Detection M...,Enable conversion of AutoML Object Detection M...,python/tensorflowjs/write_weights.py,python/tensorflowjs/write_weights.py,d8e9551a360bcfd97e6d1cb241b290190961e160,00325141eb4563e8b50d9aac07de09f2978f61d7,tensorflow/tfjs


### Get duplicate new_sha1

In [18]:
dup_new_sha1 = ml_committer_not_github[ml_committer_not_github.duplicated(['new_sha1'], keep = False)]
test = dup_new_sha1[dup_new_sha1['new_sha1'] == 'fff0ce7e0f17ce0e1355e333604435297853eb99'].sort_values(by = 'commit_time')
test[test['repo'] == 'makerbot/ReplicatorG']

Unnamed: 0,commit,tree,parent,committer_name,commit_time,subject,message,old_path,new_path,old_sha1,new_sha1,repo
71571,b60bd070901bbab27ec35cb2a996daf783eb07cc,87ee7d51f51eb523130b687ad15ee27a420e83d8,['cb01f95c1cff708af808fe2f52c4e44d57fbd69a'],Marius Kintel,2010-11-12 05:10:01+00:00,"Added skeinforge-35, added HOWTO","Added skeinforge-35, added HOWTO\n",skein_engines/skeinforge-35/fabmetheus_utiliti...,skein_engines/skeinforge-35/fabmetheus_utiliti...,,fff0ce7e0f17ce0e1355e333604435297853eb99,makerbot/ReplicatorG
11675,e3d7d94207bf5e8956761c843d1b8fb6f627a734,17116fbf5a629997fb8b2fcd1b491fb352ed6a55,"['1fe92faaa4ccbd08bb201f33f3304667a965bd0d', '...",phooky,2010-11-15 16:02:16+00:00,Merge branch 'skeinforge-35' of https://github...,Merge branch 'skeinforge-35' of https://github...,skein_engines/skeinforge-35/fabmetheus_utiliti...,skein_engines/skeinforge-35/fabmetheus_utiliti...,,fff0ce7e0f17ce0e1355e333604435297853eb99,makerbot/ReplicatorG
72619,6258bfc5caab52a0f548fa1bb3ba4c9fbb1a786c,59a5b60ebaf078c28d822c6d494b108e6411cda5,"['992b827b48096d237eb6b2a1f9046ad0c9e86a78', '...",Charles Edward Pax,2010-11-15 16:44:05+00:00,Merge remote branch 'upstream/master',Merge remote branch 'upstream/master'\n,skein_engines/skeinforge-35/fabmetheus_utiliti...,skein_engines/skeinforge-35/fabmetheus_utiliti...,,fff0ce7e0f17ce0e1355e333604435297853eb99,makerbot/ReplicatorG
6550,6ef3ba4f50c530251c67af59a6c430d8e7e84033,670a439f2bc7a49efc670c931bded8ed64a92ff8,"['848099610ed2e1f2b2148d67f1e75e4351facb5d', '...",D1plo1d,2010-11-25 03:02:25+00:00,Merge branch 'master' of git://github.com/make...,Merge branch 'master' of git://github.com/make...,skein_engines/skeinforge-35/fabmetheus_utiliti...,skein_engines/skeinforge-35/fabmetheus_utiliti...,,fff0ce7e0f17ce0e1355e333604435297853eb99,makerbot/ReplicatorG


In [19]:
py_ml[py_ml['id'] == 'fff0ce7e0f17ce0e1355e333604435297853eb99']

Unnamed: 0,content,function,havePandas,haveSklearn,id,language,module,path,ref,repo_name,company_name,ml_module
4648,"""""""\nSquare path.\n\n""""""\n\nfrom __future__ im...","'absolute_import', 'lineation', 'evaluate', 'V...",False,False,fff0ce7e0f17ce0e1355e333604435297853eb99,py,"'__future__', '__init__', 'fabmetheus_utilitie...",skein_engines/skeinforge-35/fabmetheus_utiliti...,refs/heads/master,makerbot/ReplicatorG,makerbot,1


### Merge 2 tables with new_sha1 and repo_name

In [20]:
py_merge = pd.merge(py_ml, ml_committer_not_github, left_on = ['repo_name', 'id'], right_on = ['repo', 'new_sha1'], how = 'inner',
                   copy = False, sort = True)
py_merge.shape

(9728, 24)

In [21]:
py_merge.loc[:, ['message', 'id', 'commit', 'old_sha1', 'new_sha1', 'repo', 'new_path','commit_time']].tail()

Unnamed: 0,message,id,commit,old_sha1,new_sha1,repo,new_path,commit_time
9723,Merge pull request #5253 from mitocw/bdero/fix...,bc164219c4670e42fb56841489a8b7bb5b2bbb75,115255561460f1595f1b52220f34cf6c65b6384d,2eed06a1709d2e2c0effc2afbba14ec0ce03d5ed,bc164219c4670e42fb56841489a8b7bb5b2bbb75,zadgroup/edx-platform,lms/djangoapps/psychometrics/psychoanalyze.py,2014-09-17 16:50:43+00:00
9724,Fixed problems with psychometrics\n\n- Update ...,bc164219c4670e42fb56841489a8b7bb5b2bbb75,efa4400342c9f8391ff309c39561adb3de4116be,2eed06a1709d2e2c0effc2afbba14ec0ce03d5ed,bc164219c4670e42fb56841489a8b7bb5b2bbb75,zadgroup/edx-platform,lms/djangoapps/psychometrics/psychoanalyze.py,2014-09-17 15:25:00+00:00
9725,Set default args\n\nDefault arg was not gettin...,0cf5600a0523d6ce97716bd2b7e1f5db61b58fff,0eb2c7271b652bfcb036a9319f848f22694b9634,991ee327b00c3926880af7e3f35f0e14e4ba1014,0cf5600a0523d6ce97716bd2b7e1f5db61b58fff,zegami/image-similarity-clustering,extract.py,2018-04-23 11:58:24+00:00
9726,UMAP!\n\nt-SNE is dead! Long live UMAP!!!\n,3d9f75f97f3a489d5384d36e9d2b002b91e384ea,e5fe2e71e0eb641436759feeafaed918dadd1204,,3d9f75f97f3a489d5384d36e9d2b002b91e384ea,zegami/image-similarity-clustering,umap_clustering.py,2019-10-05 15:09:14+00:00
9727,Same fix for tsne\n,dfb1742bdf2ee0cf801d27f55014e0a7403b93f3,3708917ecb39d9d8c39e43bdea8d4ab4ecefd7c8,a5935b9a551c2b99b1fd516d463d52d43f352147,dfb1742bdf2ee0cf801d27f55014e0a7403b93f3,zegami/image-similarity-clustering,tsne.py,2017-09-27 08:25:33+00:00


In [22]:
py_merge.head(20).to_csv('py_merge.csv', index = False)