# Reading the dataset

First, make sure you have:
* A MongoDB server already installed.
* A dump of the dataset loaded as a collection
* The MongoDB server running.



1. Install MongoDB server. Choose the version according to your OS. https://www.mongodb.com/download-center?jmp=nav#community

2. Unzip the mongo dump files (.json) into a directory

3. Restore/import the dump from the directory by using the command: 

	`mongorestore -d <database_name> <directory_backup>`

NOTE:
The data dump was done using:
	`mongodump -d <database_name> -o <directory_backup>`

In [1]:
import pandas as pd
from pymongo import MongoClient

from bson import json_util, ObjectId
from pandas.io.json import json_normalize
import json

def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db]


def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)
    
    sanitized = json.loads(json_util.dumps(list(cursor)))
    normalized = json_normalize(sanitized)
    
    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(normalized)
    
    # Delete the _id
    if no_id:
        del df['_id.$oid']

    #sf = gl.SFrame(df)
    return df

# Getting the data in csv format

In [2]:
project_names = ['xd', 'dnn', 'apstud', 'mesos', 'mule', 'nexus', 'timob', 'tistud' ]
project_data = {}

# reading from the db
for p in project_names:
    project_data[p] = read_mongo('jira', p)

In [16]:
all_projects = pd.DataFrame()
for p in project_names:
    df = project_data[p]
    
    print "Processing project {0} of shape {1} ...".format(p, df.shape)
    
    df0 = df[[
     'fields.assignee.name',
     'fields.components',
     'fields.created',
     'fields.creator.name',
     'fields.description', # User Story description
     'fields.fixVersions',
     'fields.issuetype.name', 
     'fields.issuetype.subtask', # true, false
     'fields.priority.name', # Minor, Major, Critical, Blocker
     'fields.reporter.name',
     'fields.resolution.description',
     'fields.resolution.name',
     'fields.resolutiondate',
     'fields.status.id',
     'fields.status.name',
     'fields.status.statusCategory.name',
     'fields.summary',
     'fields.updated',
     'fields.versions',
     'fields.watches.watchCount',
     u'key',
    ]].copy()
    
    # List of fields where the story points are stored:
    storypoints_fields = {
        'apstud':'customfield_10003',
        'dnn': 'customfield_10004',
        'mesos':'customfield_12310293',
        'mule':'customfield_10203',
        'timob':'customfield_10003',
        'tistud':'customfield_10003',
        'xd':'customfield_10142',
        'nexus': 'customfield_10132'
    }
    
    df0['storypoints'] = df[ 'fields.' + storypoints_fields[p] ]
    
    df0['project'] = p
    
    # transform components from json objet to list of string
    df0['fields.components'] = df0['fields.components'].apply(lambda x : [ v['name'] for v in x if v != [] ] )
    df0['fields.versions'] = df0['fields.versions'].apply(lambda x : [ v['name'] for v in x if v != [] ] )
    df0['fields.fixVersions'] = df0['fields.fixVersions'].apply(lambda x : [ v['name'] for v in x if v != [] ] )
  
    all_projects = all_projects.append(df0, ignore_index=True)

print "Done."

Processing project xd of shape (3691, 154) ...
Processing project dnn of shape (1894, 237) ...
Processing project apstud of shape (886, 157) ...
Processing project mesos of shape (1472, 172) ...
Processing project mule of shape (1281, 220) ...
Processing project nexus of shape (1071, 158) ...
Processing project timob of shape (1990, 180) ...
Processing project tistud of shape (2870, 168) ...
Done.


In [11]:
all_projects.to_csv("jiradataset_issues.csv", sep=',', encoding='utf-8', doublequote = True, index=False)

In [12]:
import pandas as pd
all_projects = pd.read_csv("jiradataset_issues.csv")

# Getting the changelog

In [5]:
import numpy as np

all_changelogs = pd.DataFrame()
for p in project_names:
    df = project_data[p]
    print "Processing project {0} {1} ...".format(p, df.shape)
    for i, h in df.iterrows():
        #print i
        key = df['key']
        #print h['changelog.histories']
        ch = json_normalize(data=h['changelog.histories'], record_path=['items'], meta=['created', 'author'], errors='ignore')
        
        if not ch.empty:
            ch['author'] = ch['author'].apply(lambda x : x['name'] if isinstance(x,dict) else x)
            ch['key'] = key
            all_changelogs = all_changelogs.append(ch, ignore_index=True)
print 'Done.'

Processing project xd (3691, 154) ...
Processing project dnn (1894, 237) ...
Processing project apstud (886, 157) ...
Processing project mesos (1472, 172) ...
Processing project mule (1281, 220) ...
Processing project nexus (1071, 158) ...
Processing project timob (1990, 180) ...
Processing project tistud (2870, 168) ...


In [7]:
all_changelogs.head()

Unnamed: 0,field,fieldtype,from,fromString,to,toString,author,created,key
0,status,jira,10000.0,To Do,3.0,In Progress,jvalkeal,2016-03-03T18:40:53.171+0000,XD-3753
1,Pull Request URL,custom,,,,https://github.com/spring-projects/spring-xd/p...,jvalkeal,2016-03-03T18:41:19.429+0000,XD-3752
2,status,jira,3.0,In Progress,10006.0,In PR,jvalkeal,2016-03-03T18:41:19.429+0000,XD-3751
3,summary,jira,,Can completely remove module after putting pro...,,Can completely remove custom module after putt...,aliiqbal,2016-02-29T10:00:55.086+0000,XD-3753
4,issuetype,jira,8.0,Story,1.0,Bug,aliiqbal,2016-02-29T10:01:41.067+0000,XD-3752


In [8]:
all_changelogs.to_csv("jiradataset_changelog.csv", sep=',', encoding='utf-8', doublequote = True, index=False)

# Getting developer info

In [None]:
project_names = ['xd', 'dnn', 'apstud', 'mesos', 'mule', 'nexus', 'timob', 'tistud' ]

allnames = pd.DataFrame()

for p in project_names:
    dataset = project_data[p]

    # assignees 
    assignee_name = {
        "name" : dataset['fields.assignee.name'],
        "displayName" : dataset['fields.assignee.displayName'],
        "emailAddress" : dataset['fields.assignee.emailAddress'] if 'fields.assignee.emailAddress' in dataset.columns else '',
        "timeZone" : dataset['fields.assignee.timeZone' ] if 'fields.assignee.timeZone' in dataset.columns else '',
    }
    assignee_name['project'] = p
    assignee_name['role'] = "assignee"
    
    aname = pd.DataFrame(assignee_name).drop_duplicates()
    
    # get all the creators
    creator_name = {
    "name": dataset['fields.creator.name'] ,
    "displayName": dataset['fields.creator.displayName'],
    "emailAddress" : dataset['fields.creator.emailAddress'] if 'fields.creator.emailAddress' in dataset.columns else '',
    "timeZone" : dataset['fields.creator.timeZone'] if 'fields.creator.timeZone' in dataset.columns else ''     
    }
    creator_name['project'] = p
    creator_name['role'] = "creator"
    
    cname = pd.DataFrame(creator_name).drop_duplicates()
    
    # get all the reporters
    reporter_name = {
    "name": dataset['fields.reporter.name'] ,
    "displayName": dataset['fields.reporter.displayName'],
    "emailAddress" : dataset['fields.reporter.emailAddress'] if 'fields.reporter.emailAddress' in dataset.columns else '',
    "timeZone" : dataset['fields.reporter.timeZone'] if 'fields.reporter.timeZone' in dataset.columns else ''     
    }
    reporter_name['project'] = p
    reporter_name['role'] = "reporter"
    
    rname = pd.DataFrame(reporter_name).drop_duplicates()
    
    allnames = pd.concat([allnames, aname, cname, rname])

print len(allnames)
allnames.head()