# Fitting Regression Models
Test whether using machine-learning packages influences commit count, release count, and the other count features.

# Statistics

| number  | have ml |
| ------- | ------- |
| company | 150     |
| repo    | 498     |

### Check commit count

In [61]:
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from linearmodels import RandomEffects
import statsmodels.formula.api as smf
import statsmodels.api as sm


# Raise both pandas display limits to 500 so wide/long frames are rendered without truncation.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)


In [31]:
# Load the panel and display, for every repo, the sum of each activity counter.
df = pd.read_csv('df_commit_v3.csv')
count_cols = ['commit_count', 'fork_count', 'release_count', 'pull_count',
              'branch_count', 'issue_count', 'stargazer_count']
# Pick the counters by name instead of `.iloc[:, 3:10]`: the positional slice only
# landed on these columns while `sum()` silently dropped non-numeric columns, which
# pandas >= 2.0 no longer does.
df.groupby('repo')[count_cols].sum()

Unnamed: 0_level_0,commit_count,fork_count,release_count,pull_count,branch_count,issue_count,stargazer_count
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18F/rdbms-subsetter,136.0,28.0,0.0,3.0,2.0,17.0,317.0
6aika/issue-reporting,831.0,4.0,0.0,0.0,3.0,7.0,6.0
Asana/bazel,9625.0,1.0,0.0,0.0,6.0,0.0,0.0
Asana/boto,6964.0,0.0,0.0,0.0,12.0,0.0,0.0
Autodesk/molecular-design-toolkit,837.0,23.0,11.0,4.0,7.0,25.0,108.0
...,...,...,...,...,...,...,...
yahoo/TensorFlowOnSpark,569.0,879.0,19.0,0.0,1.0,2.0,3437.0
yahoo/lopq,43.0,124.0,0.0,2.0,2.0,15.0,505.0
yahoo/serviceping,95.0,12.0,0.0,1.0,3.0,2.0,32.0
zadgroup/edx-platform,31363.0,0.0,0.0,0.0,17.0,0.0,0.0


# Fit Regression

In [21]:
# Replace every missing value with 0 before deriving the helper columns.
df.fillna(value=0, inplace=True)

# Categorical copy of the commit year.
df['cat_year'] = pd.Categorical(df['commit_year'])

# Parse the timestamps, then keep a categorical version of them; it is attached
# to the frame once the index has been set.
df['datetime'] = pd.DatetimeIndex(df['datetime'])
cat_datetime = pd.Categorical(df['datetime'])

In [22]:
# Index the frame by (repo, datetime) and attach the categorical timestamps as a
# regular column; the Categorical is assigned by position, so row order must be unchanged.
df = df.set_index(['repo', 'datetime']).assign(cat_datetime=cat_datetime)

## Based on Repo - commit count
### Random Effects (same as Stata `xtset n_repo`, then `xtreg` with `i.datetime`)

In [24]:
# Random-effects model of log_commit on ml_commit, the log activity controls and
# the categorical timestamp column, with an added constant.
re_commit_regressors = [
    'ml_commit',
    'log_fork',
    'log_branch',
    'log_stargazer',
    'log_release',
    'log_issue',
    'log_pull',
    'cat_datetime',
]
x = sm.add_constant(df[re_commit_regressors])
re_model = RandomEffects(df['log_commit'], x)
re_reg = re_model.fit()
# Uncomment to show the estimation summary:
# print(re_reg)

<img src = 'https://drive.google.com/uc?id=1XvSHy3r7b4jGQFMI-8w2ywhq1G0Tu2lV' width ='700'>

### Fixed Effects

In [26]:
# Repo fixed-effects regression of log_commit (entities are the first index level, repo).
# PanelOLS defaults to entity_effects=False, which is plain pooled OLS; the entity
# effects must be requested explicitly for this to be a fixed-effects estimator.
# With them enabled the estimates no longer coincide with the pooled smf.ols fit.
# NOTE(review): a regressor that never varies within a repo is absorbed by the
# effects and makes fit() raise — confirm ml_commit changes over time.
x = ['ml_commit', 'log_fork', 'log_branch',
    'log_stargazer', 'log_release', 'log_issue', 'log_pull', 'cat_datetime']
x = sm.add_constant(df[x])
fe_model = PanelOLS(df['log_commit'], x, entity_effects=True)
fe_reg = fe_model.fit()
# print(fe_reg)

<img src = 'https://drive.google.com/uc?id=1BH1adRhuQWIepryf0g6Rfud-3rW2m0Gx' width = '700'>

### Pooled OLS via the statsmodels R-style formula API (benchmark for the PanelOLS fit)

In [37]:
# OLS through the statsmodels formula interface: log_commit on the same regressors,
# with the timestamp column wrapped in C(...) so it is treated as categorical.
ols_commit_formula = 'log_commit ~ 1 + ml_commit+ log_fork+ log_branch +log_stargazer+ log_release+ log_issue+ log_pull+ C(cat_datetime)'
FE_ols = smf.ols(formula=ols_commit_formula, data=df).fit()
# Uncomment to show the regression table:
# print(FE_ols.summary())



### Robust result

In [39]:
# Recompute the covariance of the OLS fit with the HC1 robust estimator.
robust_cov_type = 'HC1'
results_robust = FE_ols.get_robustcov_results(cov_type=robust_cov_type)
# Uncomment to show the robust regression table:
# print(results_robust.summary())

## Based on Repo - release count
### Random Effects

In [42]:
# Random-effects model of log_release; log_commit moves to the regressor side here.
re_release_regressors = [
    'ml_commit',
    'log_fork',
    'log_branch',
    'log_stargazer',
    'log_commit',
    'log_issue',
    'log_pull',
    'cat_datetime',
]
x = sm.add_constant(df[re_release_regressors])
re_model = RandomEffects(df['log_release'], x)
re_reg = re_model.fit()
# Uncomment to show the estimation summary:
# print(re_reg)

<img src = 'https://drive.google.com/uc?id=1AM_5Dhp75CoBC05F2prq-6CijQydp4Rg' width ='700'>

### Fixed Effects

In [45]:
# Repo fixed-effects regression of log_release (entities are the first index level, repo).
# PanelOLS defaults to entity_effects=False, which is plain pooled OLS; the entity
# effects must be requested explicitly for this to be a fixed-effects estimator.
# NOTE(review): a regressor that never varies within a repo is absorbed by the
# effects and makes fit() raise — confirm ml_commit changes over time.
x = ['ml_commit', 'log_fork', 'log_branch',
    'log_stargazer', 'log_commit', 'log_issue', 'log_pull', 'cat_datetime']
x = sm.add_constant(df[x])
fe_model = PanelOLS(df['log_release'], x, entity_effects=True)
fe_reg = fe_model.fit()
# print(fe_reg)

<img src = 'https://drive.google.com/uc?id=1vCdDJZuD1PZLlZxDB5E8FDmUIwLNUA07' width = '700'>

## Based on Company - commit count
### Random Effects (same as Stata `xtset n_repo`, then `xtreg` with `i.datetime`)

In [63]:
# Collapse the repo-level panel to one row per (company, datetime) by averaging
# the numeric columns.
# numeric_only=True is needed because the frame still holds non-numeric columns
# (the repo name and the categorical columns); pandas >= 2.0 raises on them
# instead of silently dropping them as older versions did.
df_company = df.reset_index().groupby(['company', 'datetime']).mean(numeric_only=True)
# Timestamp of each aggregated row, in row order, for building a categorical column.
c_datetime = df_company.reset_index()['datetime']
df_company.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,commit_year,commit_month,ml_commit,commit_count,fork_count,release_count,pull_count,branch_count,issue_count,stargazer_count,haveMatplotlib,haveScipy,haveTensorflow,havePandas,haveSklearn,haveKeras,havePyTorch,haveNumpy,haveGgplot,haveSeaborn,havePyspark,haveMath,haveNltk,haveGoogle,haveEdward,havePylab,haveTheano,haveBs4,haveScrapy,haveLime,havePattern,haveMetric_learn,haveMars,haveNetwork,haveCaffee,haveSympy,haveMxnet,haveAstropy,log_commit,log_fork,log_release,log_branch,log_issue,log_pull,log_stargazer
company,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
18F,2014-10-01,2014,10,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.410987,0.0,0.0,0.0,0.09531,0.0,0.09531
18F,2014-11-01,2014,11,0.0,20.0,0.0,0.0,0.0,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.00072,0.0,0.0,0.0,0.741937,0.0,2.208274
18F,2014-12-01,2014,12,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.312535,0.0,0.0,0.0,0.0,0.0,0.0
18F,2015-01-01,2015,1,0.0,5.0,7.0,0.0,0.0,0.0,0.0,184.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.629241,1.960095,0.0,0.0,0.0,0.0,5.215479
18F,2015-02-01,2015,2,0.0,8.0,3.0,0.0,0.0,0.0,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.091864,1.131402,0.0,0.0,0.741937,0.0,2.778819


In [64]:
# Attach the categorical version of the timestamps as a regular column
# (assigned by position, matching the row order it was extracted in).
df_company = df_company.assign(c_datetime=pd.Categorical(c_datetime))

In [68]:
# Random-effects model of log_commit on the company-level averages, with the
# categorical timestamp column and an added constant.
re_company_commit_regressors = [
    'ml_commit',
    'log_fork',
    'log_branch',
    'log_stargazer',
    'log_release',
    'log_issue',
    'log_pull',
    'c_datetime',
]
x = sm.add_constant(df_company[re_company_commit_regressors])
re_model = RandomEffects(df_company['log_commit'], x)
re_reg = re_model.fit()
# Uncomment to show the estimation summary:
# print(re_reg)

<img src = 'https://drive.google.com/uc?id=12TKzyymTEThFPdGo_0olVzVqL_slswCO' width ='700'>

### Fixed Effects

In [72]:
# Company fixed-effects regression of log_commit (entities are the first index level, company).
# PanelOLS defaults to entity_effects=False, which is plain pooled OLS; the entity
# effects must be requested explicitly for this to be a fixed-effects estimator.
# With them enabled the estimates no longer coincide with the pooled smf.ols fit.
# NOTE(review): a regressor that never varies within a company is absorbed by the
# effects and makes fit() raise — confirm ml_commit changes over time.
x = ['ml_commit', 'log_fork', 'log_branch',
    'log_stargazer', 'log_release', 'log_issue', 'log_pull', 'c_datetime']
x = sm.add_constant(df_company[x])
fe_model = PanelOLS(df_company['log_commit'], x, entity_effects=True)
fe_reg = fe_model.fit()
# print(fe_reg)

<img src = 'https://drive.google.com/uc?id=13C6tZwmDq15zM6taxS24lMqNnGsUBVNN' width = '700'>

### Pooled OLS via the statsmodels R-style formula API (benchmark for the PanelOLS fit)

In [37]:
# OLS through the statsmodels formula interface on the company-level frame, with
# the timestamp column wrapped in C(...) so it is treated as categorical.
ols_company_formula = 'log_commit ~ 1 + ml_commit+ log_fork+ log_branch +log_stargazer+ log_release+ log_issue+ log_pull+ C(c_datetime)'
FE_ols = smf.ols(formula=ols_company_formula, data=df_company).fit()
# Uncomment to show the regression table:
# print(FE_ols.summary())



### Robust result

In [39]:
# Recompute the covariance of the company-level OLS fit with the HC1 robust estimator.
robust_cov_type = 'HC1'
results_robust = FE_ols.get_robustcov_results(cov_type=robust_cov_type)
# Uncomment to show the robust regression table:
# print(results_robust.summary())

## Based on Company - release count
### Random Effects

In [77]:
# Random-effects model of log_release on the company-level averages; log_commit
# moves to the regressor side here.
re_company_release_regressors = [
    'ml_commit',
    'log_fork',
    'log_branch',
    'log_stargazer',
    'log_commit',
    'log_issue',
    'log_pull',
    'c_datetime',
]
x = sm.add_constant(df_company[re_company_release_regressors])
re_model = RandomEffects(df_company['log_release'], x)
re_reg = re_model.fit()
# Uncomment to show the estimation summary:
# print(re_reg)

<img src = 'https://drive.google.com/uc?id=1d6sREhA7nbykpG4eoZVOsqlwcTiwPPZi' width ='700'>

### Fixed Effects

In [80]:
# Company fixed-effects regression of log_release (entities are the first index level, company).
# PanelOLS defaults to entity_effects=False, which is plain pooled OLS; the entity
# effects must be requested explicitly for this to be a fixed-effects estimator.
# NOTE(review): a regressor that never varies within a company is absorbed by the
# effects and makes fit() raise — confirm ml_commit changes over time.
x = ['ml_commit', 'log_fork', 'log_branch',
    'log_stargazer', 'log_commit', 'log_issue', 'log_pull', 'c_datetime']
x = sm.add_constant(df_company[x])
fe_model = PanelOLS(df_company['log_release'], x, entity_effects=True)
fe_reg = fe_model.fit()
# print(fe_reg)

<img src = 'https://drive.google.com/uc?id=1T5db1Sp2xsToqfSFYjHXLuR7ip_UnExx' width = '700'>