# Reading the dataset

First, make sure you have:
* A MongoDB server already installed.
* A dump of the dataset loaded as a collection
* The MongoDB server running.



1. Install MongoDB server. Choose the version according to your OS. https://www.mongodb.com/download-center?jmp=nav#community

2. Unzip the mongo dump files (.json) into a directory

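3. Restore/import the dump from the directory by using the command:

	`mongorestore -d <database_name> <directory_backup>`

	The code below reads from a database named `jira`, so use `jira` as `<database_name>` (or change the database name passed to `read_mongo`).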

NOTE:
The data dump was created with:
	`mongodump -d <database_name> -o <directory_backup>`

In [1]:
import pandas as pd
from pymongo import MongoClient

from bson import json_util, ObjectId
from pandas.io.json import json_normalize
import json

def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to one database.

    Args:
        host: Hostname or address of the MongoDB server.
        port: Port the server listens on.
        username: User name for authentication, or a falsy value for none.
        password: Password for authentication, or a falsy value for none.
        db: Name of the database to return. When credentials are given it is
            also placed in the connection URI.

    Returns:
        The database object obtained with ``conn[db]``.

    An authenticated connection is made only when both ``username`` and
    ``password`` are truthy; otherwise the client connects without credentials.
    """
    if username and password:
        # Works on both Python 2 and Python 3.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        # Credentials embedded in a MongoDB URI must be percent-escaped,
        # otherwise characters such as ':', '@', '/' or '%' in the user name
        # or password make the URI unparseable.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(username), quote_plus(password), host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db]


def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Query a MongoDB collection and return the documents as a flat DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside that database.
        query: Filter passed to ``find``. ``None`` (the default) is treated as
            an empty filter ``{}``.
        host: MongoDB host.
        port: MongoDB port.
        username: Optional user name (used only together with ``password``).
        password: Optional password (used only together with ``username``).
        no_id: When true, drop the ``_id.$oid`` column from the result if it
            is present.

    Returns:
        A DataFrame with one row per document; nested fields are flattened
        into dot-separated column names by ``json_normalize``.
    """
    # Use None as the default instead of a shared mutable dict.
    if query is None:
        query = {}

    # Connect to MongoDB (kept under a separate name so the `db` argument is
    # not rebound to a different kind of object).
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = database[collection].find(query)

    # Round-trip through bson.json_util so BSON-specific values become plain
    # JSON structures (an ObjectId `_id` ends up as the `_id.$oid` column).
    sanitized = json.loads(json_util.dumps(list(cursor)))

    # json_normalize already returns a DataFrame with nested keys flattened.
    df = json_normalize(sanitized)

    # Delete the _id. The column is absent when the query matched nothing (or
    # `_id` is not an ObjectId), so only drop it when it exists.
    if no_id and '_id.$oid' in df.columns:
        del df['_id.$oid']

    return df

# Getting the data in csv format

In [2]:
# JIRA projects to export; each one is read from a collection of the same name.
project_names = ['xd', 'dnn', 'apstud', 'mesos', 'mule', 'nexus', 'timob', 'tistud' ]

# Load every project's collection from the 'jira' database into its own
# DataFrame, keyed by project name.
project_data = {name: read_mongo('jira', name) for name in project_names}

In [16]:
# Name of the custom field holding the story points, per project.
# Defined once, outside the loop, because it does not change per iteration.
storypoints_fields = {
    'apstud':'customfield_10003',
    'dnn': 'customfield_10004',
    'mesos':'customfield_12310293',
    'mule':'customfield_10203',
    'timob':'customfield_10003',
    'tistud':'customfield_10003',
    'xd':'customfield_10142',
    'nexus': 'customfield_10132'
}

# Columns copied from every project's raw frame into the combined issue table.
issue_columns = [
    'fields.assignee.name',
    'fields.components',
    'fields.created',
    'fields.creator.name',
    'fields.description', # User Story description
    'fields.fixVersions',
    'fields.issuetype.name',
    'fields.issuetype.subtask', # true, false
    'fields.priority.name', # Minor, Major, Critical, Blocker
    'fields.reporter.name',
    'fields.resolution.description',
    'fields.resolution.name',
    'fields.resolutiondate',
    'fields.status.id',
    'fields.status.name',
    'fields.status.statusCategory.name',
    'fields.summary',
    'fields.updated',
    'fields.versions',
    'fields.watches.watchCount',
    u'key',
]

# Columns holding a list of JSON objects that is reduced to a list of names.
name_list_columns = ('fields.components', 'fields.versions', 'fields.fixVersions')


def _names_of(entries):
    """Return the 'name' of every entry in a list of JSON objects.

    Anything that is not a list/tuple (e.g. a NaN left where the field was
    absent) yields an empty list instead of raising.
    """
    if not isinstance(entries, (list, tuple)):
        return []
    return [entry['name'] for entry in entries if entry != []]


# Collect one frame per project and concatenate once at the end: growing a
# DataFrame with append() inside the loop copies all previous rows on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
project_frames = []
for p in project_names:
    df = project_data[p]

    # Single parenthesised argument: prints the same on Python 2 and Python 3.
    print("Processing project {0} of shape {1} ...".format(p, df.shape))

    df0 = df[issue_columns].copy()

    # Story points are stored under a project-specific custom field.
    df0['storypoints'] = df[ 'fields.' + storypoints_fields[p] ]

    df0['project'] = p

    # transform components/versions from JSON objects to lists of strings
    for column in name_list_columns:
        df0[column] = df0[column].apply(_names_of)

    project_frames.append(df0)

all_projects = pd.concat(project_frames, ignore_index=True) if project_frames else pd.DataFrame()

print("Done.")

Processing project xd of shape (3691, 154) ...
Processing project dnn of shape (1894, 237) ...
Processing project apstud of shape (886, 157) ...
Processing project mesos of shape (1472, 172) ...
Processing project mule of shape (1281, 220) ...
Processing project nexus of shape (1071, 158) ...
Processing project timob of shape (1990, 180) ...
Processing project tistud of shape (2870, 168) ...
Done.


In [18]:
# Write the combined issue table to disk as UTF-8 CSV without the row index.
all_projects.to_csv(
    "jiradataset_issues.csv",
    sep=',',
    encoding='utf-8',
    doublequote=True,
    index=False,
)

In [None]:
# Optional: reload the exported issues from disk instead of querying MongoDB again.
#import pandas as pd
#all_projects = pd.read_csv("jiradataset_issues.csv")

# Getting the changelog

In [61]:
def get_changelog(data):
    """Flatten the change histories of all issues into one row per changed item.

    Args:
        data: DataFrame with a ``key`` column and a ``changelog.histories``
            column. Each histories cell is expected to be a list of history
            dicts carrying ``author``, ``created`` and ``items``.

    Returns:
        A DataFrame with one row per entry of every history's ``items`` list.
        Each row has the item's own fields plus:
          * ``key``     - key of the issue the change belongs to
          * ``created`` - the history's ``created`` value
          * ``author``  - the author's ``name`` ('' when the author has none)
        The result is empty when no issue has any history items.
    """
    # Collect plain dicts and build the DataFrame once. Appending to a
    # DataFrame inside the nested loop copies every previous row each time
    # (quadratic), and DataFrame.append was removed in pandas 2.x.
    records = []
    for _, issue in data.iterrows():
        issue_key = issue['key']

        histories = issue['changelog.histories']
        # Skip issues without a usable history list (e.g. NaN for a missing field).
        if not isinstance(histories, (list, tuple)):
            continue

        for history in histories:
            author = history.get('author')
            author_name = author.get('name', '') if isinstance(author, dict) else ''
            created = history.get('created')

            for change in history.get('items') or []:
                record = dict(change)
                record['key'] = issue_key
                record['created'] = created
                record['author'] = author_name
                records.append(record)

    return pd.DataFrame(records)

In [None]:
# Build one changelog frame per project and concatenate them once at the end.
# Appending to a DataFrame inside the loop re-copies all accumulated rows on
# every iteration, and DataFrame.append no longer exists in pandas 2.x.
changelog_frames = []
for p in project_names:
    df = project_data[p]

    # Single parenthesised argument: prints the same on Python 2 and Python 3.
    print("Processing project {0} {1} ...".format(p, df.shape))

    changelog = get_changelog(df)

    # Tag every change with the project it came from.
    changelog['project'] = p

    changelog_frames.append(changelog)

all_changelogs = pd.concat(changelog_frames, ignore_index=True) if changelog_frames else pd.DataFrame()
print("Done.")

Processing project xd (3691, 154) ...


In [None]:
# Preview the first five rows of the combined changelog.
all_changelogs.head(n=5)

In [None]:
# Write the combined changelog to disk as UTF-8 CSV without the row index.
all_changelogs.to_csv(
    "jiradataset_changelog.csv",
    sep=',',
    encoding='utf-8',
    doublequote=True,
    index=False,
)