In [None]:
# Modeling time series
import codecs
import datetime
import json
import logging
import sys

import pandas
import pretty_errors
from dateutil.parser import parse

# Root logger setup: INFO level, each record shown as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


def loadJson(filename):
    """Read a UTF-8 encoded JSON file and return the decoded value.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    logging.info('Loading ' + filename)
    # The context manager closes the handle even when decoding fails.
    with open(filename, 'r', encoding='utf8') as source:
        return json.load(source)


def printJson(data, filename):
    """Serialize ``data`` as JSON into a UTF-8 encoded file.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    An existing file is overwritten.

    Args:
        data: Any JSON-serializable value.
        filename: Path of the file to write.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` is not JSON-serializable.
    """
    logging.info('Printing ' + filename)
    # Serialize first so a non-serializable value does not truncate an
    # existing file before the error is raised.
    payload = json.dumps(data, ensure_ascii=False)
    # The context manager flushes and closes the handle even on error.
    with open(filename, 'w', encoding='utf8') as sink:
        sink.write(payload)


class TimeSeries:
    """Build fixed-length daily series of one change type per developer.

    Commits of ``project`` are read from
    ``Data/commits/<project>_commit_alias.json``. Each commit record is read
    for its ``sha``, ``date``, ``author`` and ``modification`` entries; every
    modification is read for its ``change_type``.
    """

    # End of the observation window: 2020-05-31 23:59:59 at UTC-12. A
    # developer only gets a series when at least ``n`` whole days lie between
    # their first commit and this instant.
    TERMINATED = datetime.datetime(
        2020, 5, 31, 23, 59, 59,
        tzinfo=datetime.timezone(datetime.timedelta(hours=-12)))

    def __init__(self, project: str, behavior: str):
        """Load and index all commits of ``project``.

        Args:
            project: Project name; selects the commit file to load.
            behavior: ``change_type`` value counted by ``GenerateForAll``
                and used in the output file names / CSV column name.
        """
        self.project = project
        logging.info('Project: %s', project)
        # "<developer> - <first commit datetime>" -> list of daily counts
        self.result = {}
        # sha -> commit record
        self.commits = {}
        # (commit datetime, sha) for every commit, sorted chronologically
        self.sorted_commits = []
        # author -> list of (commit datetime, sha)
        self.developers = {}
        self.behavior = behavior

        # load commits from file
        all_commits = loadJson('Data/commits/' + project +
                               '_commit_alias.json')
        logging.info('%s - Total commits: %s', project, len(all_commits))

        for commit in all_commits:
            sha = commit['sha']
            # Parse the date once and share the entry between both indexes.
            stamped = (parse(commit['date']), sha)
            self.commits[sha] = commit
            self.sorted_commits.append(stamped)
            self.developers.setdefault(commit['author'], []).append(stamped)

        self.sorted_commits.sort()
        logging.info('%s - Finish loading data', project)

    def GenerateTSForDeveloper(self, developer: str, behavior: str, n: int):
        """Build the ``n``-day series of ``behavior`` counts for a developer.

        Args:
            developer: Key into ``self.developers``.
            behavior: ``change_type`` value to count in each commit.
            n: Number of days (series length).

        Returns:
            ``(first, ts)`` where ``first`` is the datetime of the
            developer's earliest commit and ``ts`` is a list of ``n`` daily
            counts, or ``None`` when the observation window is too short.

        Raises:
            KeyError: If ``developer`` is unknown.
        """
        logging.info('%s - Developer: %s', self.project, developer)
        history = self.developers[developer]
        # sort developer's commits by date (in place, as before)
        history.sort()
        # one (datetime, matching-modification count) pair per commit
        trace = [
            (dt,
             sum(1 for f in self.commits[sha]['modification']
                 if f['change_type'] == behavior))
            for dt, sha in history
        ]
        return self.__generate_ts(trace, n)

    def GenerateForAll(self, n: int):
        """Generate the series of every developer and store the usable ones.

        Results are stored in ``self.result`` under the key
        ``"<developer> - <first commit datetime>"``. Developers without a
        usable series are skipped.

        Args:
            n: Number of days (series length).
        """
        logging.info('%s - Generating time series', self.project)
        for developer in self.developers:
            first, ts = self.GenerateTSForDeveloper(developer, self.behavior,
                                                    n)
            if ts:
                self.result[developer + ' - ' + str(first)] = ts

    def OutputTS(self):
        """Write ``self.result`` to JSON and CSV files under ``Data/ts/``."""
        logging.info('%s - Output time series to file', self.project)
        # output to json
        printJson(self.result,
                  'Data/ts/' + self.project + '_' + self.behavior + '_ts.json')

        # output to csv
        self.__to_csv()

    def __generate_ts(self, trace: list, n: int):
        """Bucket ``(datetime, count)`` pairs into ``n`` daily totals.

        Day 0 starts at the earliest datetime in ``trace``; entries falling
        on day ``n`` or later are ignored. ``trace`` is sorted in place.

        Args:
            trace: List of ``(datetime, count)`` tuples.
            n: Number of days (series length).

        Returns:
            ``(first_dt, series)``. ``series`` is ``None`` when fewer than
            ``n`` whole days separate ``first_dt`` from ``TERMINATED``.
            ``(None, None)`` is returned for an empty trace.
        """
        if not trace:
            # Nothing to bucket; avoid indexing into an empty list.
            logging.error('No enough data to generate time series')
            return None, None

        # sort the trace
        trace.sort()
        first_dt = trace[0][0]
        # check if there is enough data
        if (self.TERMINATED - first_dt).days < n:
            logging.error('No enough data to generate time series')
            return first_dt, None

        # initialize the time series
        result = [0] * n
        for dt, count in trace:
            index = (dt - first_dt).days
            if index < n:
                result[index] += count
        return first_dt, result

    def __to_csv(self):
        """Write ``self.result`` in long format to a CSV file.

        One row per (developer, day) with columns ``developer``, ``day`` and
        a value column named after ``self.behavior``.
        """
        # transform to DataFrame rows
        rows = []
        for developer, series in self.result.items():
            label = self.project + ' - ' + developer
            for day, value in enumerate(series):
                rows.append([label, day, value])

        frame = pandas.DataFrame(data=rows,
                                 columns=['developer', 'day', self.behavior])
        # output to csv file
        frame.to_csv(path_or_buf='Data/ts/' + self.project + '_' +
                     self.behavior + '_ts.csv',
                     header=True,
                     index=False)


# For every target project: build the 180-day 'MODIFY' series of each
# developer and write them to the JSON/CSV files under Data/ts/.
projects = loadJson('Data/Target_projects.json')
for project in projects:
    ts = TimeSeries(behavior='MODIFY', project=project['name'])
    ts.GenerateForAll(180)
    ts.OutputTS()


In [None]:
# Extract time series features
import sys
import pandas
import logging
from tsfresh import extract_features
from tsfresh.feature_extraction.feature_calculators import *
from tsfresh.feature_extraction import *

# Root logger setup: INFO level, "<timestamp> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Target ecosystem, taken from the first command-line argument.
eco = sys.argv[1]

# Load the MODIFY time series of that ecosystem.
ts_path = 'Data/ts/ecosystem_' + eco + '_MODIFY_ts.csv'
data = pandas.read_csv(ts_path, header=0)
logging.info('Ecosystem: %s', eco)
logging.info(data.shape)

# Extract the comprehensive tsfresh feature set; series are identified by
# the 'developer' column and ordered by the 'day' column.
settings = ComprehensiveFCParameters()
logging.info('Extracting features')
result = extract_features(
    data,
    column_id='developer',
    column_sort='day',
    default_fc_parameters=settings,
    n_jobs=8,
)

logging.info('Output result to file')
# NOTE(review): unlike the input file name, there is no '_' between the
# ecosystem name and 'MODIFY' here -- confirm this is intended.
features_path = 'Data/ts_features/' + eco + 'MODIFY_features.csv'
result.to_csv(path_or_buf=features_path, header=True, index=True)


In [None]:
#Extract ts features for documentation contribution
# 35 newcomers who have both code and doc contribution
import pandas
from tsfresh.feature_extraction import *
from tsfresh import extract_features

# Number of daily observations per series; the raw CSV is read for the
# columns named '0' .. str(LEN - 1).
LEN = 180

# Load the newcomers as (project, author) pairs
raw_newcomers = pandas.read_excel('Data/Newcomers with both contribution.xlsx',
                                  header=0,
                                  engine='openpyxl')
newcomers = [(row['project'], row['author'])
             for _, row in raw_newcomers.iterrows()]
print(len(newcomers))
# Hashed copy so the per-row membership test below does not scan the list.
newcomer_keys = set(newcomers)

# Load raw data
raw = pandas.read_csv('Data/doc/Doc_contribution_timeseries.csv', header=0)

# Reshape the selected rows from wide (one column per day) to long format
# with one record per (id, day). The series identifier is kept in
# `series_id` so the builtin `id` is not rebound for the rest of the session.
records = []
for _, row in raw.iterrows():
    if (row['project'], row['author_login']) not in newcomer_keys:
        continue
    series_id = row['project'] + ' - ' + row['author_login']
    records.extend({'id': series_id, 'day': day, 'Doc': row[str(day)]}
                   for day in range(LEN))
data = pandas.DataFrame(records)

# Extract features
settings = ComprehensiveFCParameters()
result = extract_features(data,
                          column_id='id',
                          column_sort='day',
                          default_fc_parameters=settings,
                          n_jobs=8)

result.to_csv(path_or_buf='Data/doc/doc_ts_filtered_features_35.csv',
              header=True,
              index=True)
# Descriptive statistics of every feature column
summary = result.describe()
summary.to_csv('Data/doc/doc_ts_filtered_features_summary_35.csv',
               header=True,
               index=True)


In [None]:
#Compiling evidence for case study
import json
import codecs
import pandas
import datetime


def loadJson(filename: str):
    """Read a UTF-8 encoded JSON file and return the decoded value.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when decoding fails.
    with open(filename, 'r', encoding='utf8') as source:
        return json.load(source)


# Load targeted projects
projects = loadJson('Data/Target_project_doc.json')

# Load commit messages: project name -> {sha: (message, author datetime)}.
# Author dates are parsed from the '%Y-%m-%dT%H:%M:%SZ' format into naive
# datetime objects. Only 'sha', 'message' and author 'date' are read from
# each record; the contributor login was computed here before but never
# used, so it is no longer looked up.
commit_messages = {}
for project in projects:
    name = project['name']
    commit_messages[name] = {}
    records = loadJson('Data/commit_info/' + name +
                       '_commit_contributor_0601.json')
    for item in records:
        authored = datetime.datetime.strptime(item['author']['date'],
                                              '%Y-%m-%dT%H:%M:%SZ')
        commit_messages[name][item['sha']] = (item['message'], authored)

# Load newcomers with both types of contributions and group them by
# project: project -> list of authors, in spreadsheet row order.
raw_newcomers = pandas.read_excel(
    'Data/Newcomers with both contribution.xlsx',
    header=0,
    engine='openpyxl',
)
newcomers = {}
for _, entry in raw_newcomers.iterrows():
    newcomers.setdefault(entry['project'], []).append(entry['author'])

# Load doc commits
raw_doc = pandas.read_csv('Data/doc/Doc_contribution_commits_targeted.csv',
                          header=0)

# Build one record per (project, newcomer, commit sha), flagging whether the
# sha appears among the newcomer's doc commits, code commits, or both.
result = []
for project in newcomers:
    # Read the project's commit file once (not once per author) and index
    # the shas by author, keeping file order.
    code_by_author = {}
    for item in loadJson('Data/commits/' + project + '_commit_alias.json'):
        code_by_author.setdefault(item['author'], []).append(item['sha'])

    for author in newcomers[project]:
        # Get list of doc commits (duplicates removed)
        doc_list = raw_doc[(raw_doc['project'] == project)
                           & (raw_doc['author_login'] == author)]['sha']
        doc_list = list(set(doc_list))

        # Get list of code commits
        code_list = code_by_author.get(author, [])

        # Merge two lists
        commits = list(set(doc_list + code_list))

        # Hashed copies so the per-commit flags below are O(1) lookups.
        doc_set = set(doc_list)
        code_set = set(code_list)

        # Formatting; date/msg stay None when the sha has no known message.
        for c in commits:
            msg, date = commit_messages[project].get(c, (None, None))
            temp = {
                'project': project,
                'author': author,
                'sha': c,
                'date': date,
                'msg': msg,
                'is_code': c in code_set,
                'is_doc': c in doc_set
            }
            result.append(temp)
result = pandas.DataFrame(result)
result.to_csv('Data/Newcomers_commit_msg_list.csv', header=True, index=False)
