# Reading the dataset

In [40]:
import pandas as pd
import numpy as np

from __future__ import division 

# reading issues
df = pd.read_csv("jiradataset_issues.csv")
# Reading the changelog
changelog = pd.read_csv("jiradataset_changelog.csv")

print 'Dataset size: {0}'.format(len(df))

Dataset size: 15155


In [41]:
summary = df.pivot_table(index='project', columns=['fields.issuetype.name'], values='key', aggfunc='count', fill_value=0, margins=True)
summary

fields.issuetype.name,Bug,Documentation,Epic,Improvement,New Feature,Patch submission,Story,Sub-task,Task,Technical Debt,Technical task,Wish,All
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
apstud,456,0,7,159,1,0,202,0,0,0,61,0,886
dnn,1129,0,0,315,10,0,278,92,70,0,0,0,1894
mesos,517,82,9,377,0,0,21,0,462,0,0,4,1472
mule,595,0,2,252,121,2,33,2,274,0,0,0,1281
nexus,705,0,0,302,0,0,31,0,25,8,0,0,1071
timob,1143,0,17,268,218,0,289,34,0,0,21,0,1990
tistud,1450,0,0,536,37,0,618,20,0,0,209,0,2870
xd,598,0,111,307,0,0,2590,0,0,0,85,0,3691
All,6593,82,146,2516,387,2,4062,148,831,8,376,4,15155


# Pre-processing

0 - Remove all the issues whose assignee is null

In [42]:
xd1 = df[(df['fields.assignee.name'].notnull())]

print len(xd1)

12586


1- Story points must have been assigned once and never updated afterward. In fact, if the story points estimate gets updated, it may mean that the initial version of the issue report had misleading information, which would confuse the classifier. This explains why we filter out such issue reports. (Porru et al.)

In [43]:
# Filtering all the user stories that have been updated in the story points field
print "Original size: {0}".format(len(xd1))

remove = changelog[((changelog['field'] == 'Story Points') | ( changelog['field'] == 'Actual Story Points' )
                       | ( changelog['field'] == 'Story Size' ) | ( changelog['field'] == 'QA Story Points' ) 
                       | ( changelog['field'] == 'Effort points' ) | ( changelog['field'] == 'Value/Effort' )
                       | ( changelog['field'] == 'Effort' ) | ( changelog['field'] == 'Points' ) )  
                       & (changelog['fromString'].notnull()) ]

xd2 = xd1[ ~xd1['key'].isin(remove['key']) ]

print "After removing: {0} ({1:4.2f}%)".format(len(xd2), len(xd2)/len(xd1)*100)

Original size: 12586
After removing: 12357 (98.18%)


2- The issue must be addressed. We consider an issue addressed when its Status is set to Closed (or similar, e.g. Fixed, Completed) and its resolution field is set to Fixed (or similar, e.g. Done, Completed). Note that fields such as Story Points and Description may be adjusted or updated at any given time. However, once the issue is addressed updates rarely happen. For instance, in the industrial project this event happens for less than 4% (49/1368) of the issues. Here as for the other projects, we filter out issue reports not ad- dressed, because they are likely to be unstable, hence they might confuse the classifier.

In [44]:
xd3 = xd2[((xd2['fields.status.name'] == 'Done') | (xd2['fields.status.name'] == 'Closed') 
           | (xd2['fields.status.name'] == 'Resolved') | (xd2['fields.status.name'] == 'Accepted') ) 
          & ((xd2['fields.resolution.name'] == 'Complete') | (xd2['fields.resolution.name'] == 'Fixed') 
          | (xd2['fields.resolution.name'] == 'Done') | (xd2['fields.resolution.name'] == 'Resolved') 
          | (xd2['fields.resolution.name'] == 'Completed'))]

print "After removing: {0} ({1:4.2f}%)".format(len(xd3), len(xd3)/len(xd1)*100)

After removing: 10257 (81.50%)


3- Once the story points are assigned, the informative fields of the issue (i) must be already set and (ii) their value must not have been changed afterward. We define informative fields: Issue Type, Description, Summary, and Component/s. We filter out issues whose informative fields are updated after story points initialization because they, again, are likely to represent unstable issues.

In [45]:
# (i) the informative fields of the issue must be already set

# check if the fields are null or empty
xd4 = xd3[(xd3['fields.issuetype.name'] != '') &
    (xd3['fields.description'].notnull()) &
    (xd3['fields.summary'].notnull())]

# only US with components
keys = []
for i in range(len(xd4)):
    components = xd4.iloc[i]['fields.components']
    
    if (components != '[]'):
        keys.append(xd4.iloc[i]['key'])

len(keys)

xd5 = xd4[xd4['key'].isin(keys)]

print "After removing: {0} ({1:4.2f}%)".format(len(xd5), len(xd5)/len(xd1)*100)

After removing: 7280 (57.84%)


In [46]:
# We filter out issues whose informative fields are updated after story points initialization
#  ['fields.issuetype.name', 'fields.description', 'fields.summary', 'fields.components']

# get story points initialization date

sp = changelog[((changelog['field'] == 'Actual Story Points') | (changelog['field'] == 'Story Points')) 
          & (changelog['fromString'].isnull())]

ifields = changelog[ (changelog['field'] == 'issuetype') |
                    (changelog['field'] == 'description') |
                    (changelog['field'] == 'summary') |
                    (changelog['field'] == 'Component') ]

to_remove = []
for i in range(len(ifields)):
    key = ifields.iloc[i]['key']
    #print key
    
    original_date = pd.to_datetime(xd5[xd5.key == key]['fields.created'])
    
    #print original_date
    
    #print sp[ sp.key == key ]
    # story points initialization date
    spinit = pd.to_datetime(sp[ sp.key == key ].created)
    
    # update date of the informative field
    updatedate = pd.to_datetime(ifields.iloc[i]['created'])
    
    if not spinit.empty:
        if updatedate > pd.to_datetime(spinit.iloc[0]):
            to_remove.append(key)
    elif not original_date.empty:
        if updatedate > pd.to_datetime(original_date.iloc[0]):
            to_remove.append(key)

xd6 = xd5[~xd5['key'].isin(to_remove)]

print "After removing: {0} ({1:4.2f}%)".format(len(xd6), len(xd6)/len(xd1)*100)

After removing: 7222 (57.38%)


4- Take the user stories which have points according to the fibonacci series

In [47]:
fibonacci = [0.5, 1, 2, 3, 5, 8, 13, 20, 40, 100]

xd7 = xd6[ xd6['storypoints'].isin(fibonacci)]

print "After removing: {0} ({1:4.2f}%)".format(len(xd7), len(xd7)/len(xd1)*100)

After removing: 6757 (53.69%)


## Choosing the filter

In [48]:
# Choose the filters we want to consider xd0, xd1, xd3... xd6
xdf = xd7

print 'Filtered dataset size: {0}'.format(len(xdf))

Filtered dataset size: 6757


# Adding new features
We add features from three categories: features from the developer, from the issues, and from text.

## Features from the developer
Developers' features depend on the dataset since they are mostly percentages (the total number of issues for the project is used). Then, I defined the functions and call them later. 

In [49]:
# Reporter reputation 

def get_reputation(developer, dataset):
    opened = len(dataset[dataset['fields.creator.name'] == developer])
    opened_and_fixed = len(dataset[(dataset['fields.creator.name'] == developer) 
                            & ((dataset['fields.status.name'] == 'Done') | (dataset['fields.status.name'] == 'Closed') 
           | (dataset['fields.status.name'] == 'Resolved') | (dataset['fields.status.name'] == 'Accepted') ) 
                            & (dataset['fields.assignee.name'] == developer)])
    return opened_and_fixed/(opened+1)   

In [50]:
def get_reputations(dataset):
    devs = dataset['fields.creator.name'].unique()
    devs = np.append(devs, dataset['fields.assignee.name'].unique())
    devs = np.append(devs, dataset['fields.reporter.name'].unique())

    # remove dupplicates
    devs = np.unique(devs)

    print "Total number of devs: ", len(devs)

    reputations = []
    for d in devs:
        reputations.append(get_reputation(d, dataset))

    reputations_df = pd.DataFrame({"developer": devs, "reputation": reputations})
    return reputations_df

#reputations_df[reputations_df.reputation > 0].sort_values(['reputation'], ascending=False).head()

In [51]:
# Total developer workload
from __future__ import division 
    
#     
def get_dev_workload(developer, dataset, percentual=True):
    if percentual:
        df = dataset[(dataset['fields.assignee.name'] == developer)]
        return len(df)/len(dataset)
    else:
        df = dataset[(dataset['fields.assignee.name'] == developer)]
        return len(df)

def get_devs_workload(dataset, percentual=True):
    ws = []
    
    devs = dataset['fields.creator.name'].unique()
    devs = np.append(devs, dataset['fields.assignee.name'].unique())
    devs = np.append(devs, dataset['fields.reporter.name'].unique())

    # remove dupplicates
    devs = np.unique(devs)
    
    for d in devs:
        ws.append(get_dev_workload(d, dataset, percentual))
    return pd.DataFrame({"developer":devs, "workload": ws})

def get_workload(dataset, developer="", percentual=True):
    if developer == "":
        return get_devs_workload(dataset, percentual)
    else:
        return get_dev_workload(developer, dataset, percentual)

In [52]:
# current workload 
def get_current_workload(dataset):
    undone_issues = dataset[((dataset['fields.status.name'] == 'Done') | (dataset['fields.status.name'] == 'Closed') 
           | (dataset['fields.status.name'] == 'Resolved') | (dataset['fields.status.name'] == 'Accepted') )]
    grouped = undone_issues.groupby('fields.assignee.name').size().reset_index(name='workload')
    grouped['workload'] = grouped['workload']/sum(grouped['workload'])
    #developers = developers.merge(grouped, on='fields.assignee.name')
    #developers.head()
    grouped.columns = ['developer', 'current_workload']
    return grouped

In [53]:
# Number of developer's comments
def get_comment_number(dataset):
    
    ch = changelog[changelog['key'].isin(dataset['key'])]
    comments_times = ch[ (ch['field'] == 'Comment')].groupby(['author']).size()

    comments_times = comments_times.reset_index()
    comments_times['comments'] = comments_times[0]/sum(comments_times[0]) 

    comments_times.columns = ['developer', 'comment_absolute', 'comments_relative']
    #print "number of developers with comments: ", len(comments_times)
    return comments_times

In [54]:
def get_velocity(dataset):
    velocity = dataset[['fields.assignee.name', 'storypoints']].groupby(['fields.assignee.name']).sum()
    
    velocity = velocity.reset_index()
    velocity.columns = ['developer', 'velocity']
    velocity['velocity'] = velocity['velocity'] / sum(velocity['velocity'])
    return velocity
    
#velocity.sort_values(ascending=False).head()

## Features from the issues
The issue features do not depend on the total number of issues of the project, so they can be computed for the entire dataset.

In [55]:
# Discussion time
def get_discussiontime(dataset):
    discussiontime = pd.to_datetime(dataset['fields.resolutiondate']).subtract(pd.to_datetime(dataset['fields.created']))
    return discussiontime

In [56]:
# Number of times the issue was reopened
reopened_times = changelog[ (changelog['field'] == 'status') 
            & (changelog['fromString'] == 'Done') 
            & (changelog['toString'] != 'Done')].groupby(['key']).size()
reopened_times = reopened_times.reset_index()
reopened_times.columns = ['key', 'reopened_times']

In [57]:
# Number of times the priority was changed
priority_times = changelog[ changelog['field'] == 'priority' ].groupby(['key']).size()
priority_times = priority_times.reset_index()
priority_times.columns = ['key', 'priority_times']

In [58]:
# Number of times the fix version was changed
fixversion_times = changelog[ changelog['field'] == 'Fix Version' ].groupby(['key']).size()
fixversion_times = fixversion_times.reset_index() 
fixversion_times.columns = ['key', 'fixversion_times']

In [59]:
# Number of fix versions
d = []
for i in range(len(xdf['fields.fixVersions'])):
    d.append({ 'key' : xdf['key'].iloc[i] , 'fix_versions' : len(pd.Series(xdf['fields.fixVersions'].iloc[i]))})
    
fix_versions = pd.DataFrame(d)

In [60]:
# Number of affect versions
# at least one version is affected
d = []
for i in range(len(xdf['fields.versions'])):
    d.append({ 'key' : xdf['key'].iloc[i], 
              'affect_versions' : 1 if len(pd.Series(xdf['fields.versions'].iloc[i])) == 0 else len(pd.Series(xdf['fields.versions'].iloc[i]))})
    
affect_versions = pd.DataFrame(d)

### Features from component and issue type

#### Issue type dummies

In [61]:
# issue type 
issue_type = pd.get_dummies(xdf[['key', 'fields.issuetype.name']], columns=['fields.issuetype.name'])

#### Components dummies

In [62]:
xdf['fields.components'] = xdf['fields.components'].apply(lambda x: [v.replace('[', '').replace(']', '').strip() for v in x.split(',')])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [64]:
#new_df.head()
components = pd.get_dummies(xdf['fields.components'].apply(pd.Series).stack()).sum(level=0)
components['key'] = xdf['key']
components.head()

Unnamed: 0,360,Acceptance Testing,Acegi,Admin - Event Viewer,Admin - Extensions,Admin - File Manager,Admin - Google Analytics,Admin - Languages,Admin - Newsletters,Admin - Pages,...,security,slave,statistics,stout,technical debt,test,testing,tests,webui,key
37,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,XD-3716
44,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,XD-3709
62,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,XD-3691
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,XD-3690
68,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,XD-3685


In [65]:
# components + issue_type
print len(issue_type)
print len(components)

components_issuetype = pd.merge(issue_type, components, on='key',  how='left')

6757
6757


### Putting together all the features

In [66]:
# Issue-related features
usfeatures = pd.DataFrame(xdf['key'])

usfeatures['discussion_time'] = pd.to_datetime(xdf['fields.resolutiondate']).subtract(pd.to_datetime(xdf['fields.created']))
usfeatures = pd.merge(usfeatures, reopened_times, on='key', how='left')
usfeatures = pd.merge(usfeatures, priority_times, on='key', how='left')
usfeatures = pd.merge(usfeatures, fixversion_times, on='key', how='left')
usfeatures = pd.merge(usfeatures, fix_versions, on='key', how='left')
usfeatures = pd.merge(usfeatures, affect_versions, on='key', how='left')
#usfeatures = pd.merge(usfeatures, context[['key', 'context_characters', 'context_code_characters']], on='key', how='left')
usfeatures = usfeatures.fillna(0)
usfeatures['discussion_time'] = usfeatures['discussion_time'].dt.total_seconds()

### Full issue features ( issue + components + issuetype )

In [67]:
print len(usfeatures)
print len(components_issuetype)
fullissuefeatures = pd.merge(usfeatures, components_issuetype, on='key', how='left')

6757
6757


## Text features
I create a new variable context to store only the textual info.

In [68]:
# Summary and description merged into one text (Porru 2014)

context = xdf[['key', 'fields.summary', 'fields.description']].copy()
context["context"] = context["fields.summary"] + ". " + context["fields.description"]

print len(context)

context.head()

6757


Unnamed: 0,key,fields.summary,fields.description,context
37,XD-3716,Support Configuring the RabbitMessageBus Messa...,http://stackoverflow.com/questions/34053997/pa...,Support Configuring the RabbitMessageBus Messa...
44,XD-3709,Duplicate MBean Names With router Sink,"For some reason, the Integration {{MBeanExport...",Duplicate MBean Names With router Sink. For so...
62,XD-3691,Ensure Job definitions are escaped in UI,If using the definition <aaa || bbb> where the...,Ensure Job definitions are escaped in UI. If u...
63,XD-3690,"Improve ""Server Configuration - Database Confi...",Make it more clear what drivers need to be cop...,"Improve ""Server Configuration - Database Confi..."
68,XD-3685,Job Definitions page fails to display definiti...,In this scenario we created 30 jobs that can b...,Job Definitions page fails to display definiti...


In [69]:
# Separate natural language and the code in context
import re

for ix, line in context.iterrows():
    m = re.search('{code}(.*){code}', line.context, flags=re.DOTALL)
    if m:
        context.loc[ix, 'context_code'] = line.context[m.start(0):m.end(0)]
        context.loc[ix, 'context'] = line.context[:m.start(0)] + line.context[m.end(0):]
    else:
        context.loc[ix, 'context_code'] = ""
        
    context.loc[ix, 'context'] = re.sub(r"\s+", " ", context.loc[ix, 'context'])
    context.loc[ix, 'context_code'] = re.sub(r"\s+", " ", context.loc[ix, 'context_code'])

In [None]:
# remove stop words
"""
from nltk.corpus import stopwords

for ix, line in context.iterrows():
    word_list = re.sub("[^\w]", " ",  context.loc[ix, 'context']).split()
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    context.loc[ix, 'context'] = ' '.join(word for word in filtered_words)
    
    word_list = re.sub("[^\w]", " ",  context.loc[ix, 'context_code']).split()
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    
    context.loc[ix, 'context_code'] = ' '.join(word for word in filtered_words)
"""

In [70]:
# Number of characters in the code
context['context_code_characters'] = context['context_code'].str.len()

# Number of characters in context
context['context_characters'] = context['context'].str.len()

Thi is just an example on how to extract the terms and weights from TfidfVectorizer

http://www.ultravioletanalytics.com/2016/11/18/tf-idf-basics-with-pandas-scikit-learn/

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(min_df=1, max_df=.5, stop_words='english', ngram_range=(1,2))
tvec_weights = tvec.fit_transform(context['context'])
print tvec_weights.shape

weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()

weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

(6757, 248927)


Unnamed: 0,term,weight
213226,studio,0.013522
174963,project,0.012305
34386,android,0.011513
224220,titanium,0.010896
35984,app,0.010117
89882,error,0.010009
148531,module,0.009492
98440,file,0.008973
30383,add,0.008911
153329,new,0.008896


# Machine learning

## Metrics

In [72]:
# MMRE : difference between the actual effort and the estimated effort divided by the actual effort
import numpy as np
from numpy import inf

def mmre(labels, predictions):
    assert len(labels) == len(predictions)
    
    mre = np.abs(labels - predictions) / labels
    mre[mre == inf] = 0
    return np.sum(mre) / len(labels)

## Cross-validation SVM

In [77]:
# Obtaining predictions by cross-validation

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import metrics

from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC 
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

def SVM(X, Y, comment, results):
    clf = Pipeline([
      #('feature_selection', SelectFromModel(LinearSVC())),
      ('classification', LinearSVC())
    ])
    scores = cross_val_score(clf, X, Y, cv=10)

    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    predicted = cross_val_predict(clf, X, Y, cv=10)
    
    #print "len of zeros in actual: ", len(Y[Y==0])
    #print "len of zeros in predicted: ", len(predicted[predicted==0])   
    
    #print 'Predicted: ', len(predicted)
    #print "MMRE ", mmre(Y, predicted)
    
    diff = np.abs(Y.astype(float) - predicted.astype(float))
    mre = diff / Y.astype(float)
    
    #print "Average of MRE: ", np.sum(mre)/len(Y)
    
    both = pd.DataFrame({'Actual': Y, 'Predicted': predicted, 'diff': diff, 'mre': mre })
    
    #print "ACC: ", len(diff[diff==0])/len(diff)
    
    result = {
        'Classifier': comment, 
        'Rows': X.shape[0], 
        'Features': X.shape[1],
        'Accuracy': scores.mean(),
        'Accuracy SD': scores.std()*2,
        'MAE' : metrics.mean_absolute_error(Y.astype(float), predicted.astype(float)),
        'MMRE': mmre(Y.astype(float), predicted.astype(float)),
    }
    
    results = results.append(result, ignore_index=True)
    return results

# Run SVM for all the projects

In [78]:
"""
For all the projects...
"""
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

fullresults = pd.DataFrame()

for p in xdf['project'].unique():
    results = pd.DataFrame()
    
    # Set the project
    df = xdf[xdf['project'] == p]
    
    # Cross validation is not working if there are few instances
    if (len(df) < 14):
        continue
    print "Project: ", p
    
    # Compute the dev features
    print "Getting reputations..."
    reps = get_reputations(df)
    print "Getting workload..."
    workload_df = get_workload(df)
    print "Getting current workload..."
    current_workload = get_current_workload(df)
    print "Getting comments..."
    comments = get_comment_number(df)
    print "Getting velocity..."
    velocity = get_velocity(df)
    
    # Put together all the dev features
    devfeatures = pd.merge(reps, workload_df, on='developer', how='left')
    devfeatures = pd.merge(devfeatures, current_workload, on='developer', how='left')
    devfeatures = pd.merge(devfeatures, comments[['developer', 'comments_relative']], on='developer', how='left')
    devfeatures = pd.merge(devfeatures, velocity, on='developer', how='left')

    devfeatures = devfeatures.fillna(0)
    
    print "Getting text features..."
    # Text features using the context
    # TF-IDF
    # min_df = 0.001
    ctx = context[context['key'].isin(df['key'])]
    
    print "ctx size", len(ctx)
    #print ctx
    
    v = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', min_df=.0025, max_df=.1, stop_words='english')
    x = v.fit_transform(ctx['context'])
    #print "X shape: ", (x.shape)

    if ( np.all(ctx['context_code']<>'') ):
        v2 = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', min_df=.0025, max_df=.1, stop_words='english')
        y = v2.fit_transform(ctx['context_code'])
        textfeatures = sp.hstack((x, y))
    else:
        textfeatures = x
    #print(y.shape)

    #textfeatures = sp.hstack((x, y))
    #assert x.shape[1] + y.shape[1] == textfeatures.shape[1]
    print "Text features: ", textfeatures.shape
    
    print "Getting issue features..."
    # Issue+Dev features
    
    # link btw key and developer
    usdev = df[['key', 'fields.assignee.name']]
    usdev.columns = ['key', 'developer']
    
    us_dev = pd.merge(usdev, devfeatures, on='developer', how='left')
    #print "Dev features: ", us_dev.shape
    
    issue_features = fullissuefeatures[fullissuefeatures['key'].isin(df['key'])]
    
    #print "Issue features: ", issue_features.shape
    
    dev_issue_features = pd.merge(us_dev, fullissuefeatures, on='key', how='left')
    dev_issue_features = dev_issue_features.fillna(0)
    #print "Dev+issue features: ", dev_issue_features.shape
    
    # text + dev + issue features
    text_dev_issue_features = sp.hstack((dev_issue_features.drop(['key', 'developer'], axis=1), textfeatures))
    print "Dev+text+issues features: ", text_dev_issue_features.shape

    # Issue + text features
    text_issue_features = sp.hstack((issue_features.drop(['key'], axis=1), textfeatures))
    #print "Text+Issue features", text_issue_features.shape
    
    # dev + text features
    text_dev_features = sp.hstack((us_dev.drop(['key', 'developer'], axis=1), textfeatures))
    #print "Text+Dev features", text_dev_features.shape
    
    print "Training SVMs..."
    # Train the SVM
    Y = df['storypoints'].astype(str)

    results = SVM(text_dev_issue_features, Y, "Issue+Dev+Text", results)
    results = SVM(text_dev_features, Y, "Text+Dev", results)
    results = SVM(issue_features.drop(['key'], axis=1), Y, "Issue", results)
    results = SVM(textfeatures, Y, "Text", results)
    results = SVM(us_dev.drop(['key', 'developer'], axis=1), Y, "Dev", results)
    results = SVM(dev_issue_features.drop(['key','developer'], axis=1), Y, "Dev+Issue", results)
    results = SVM(text_issue_features, Y, "Text+Issue", results)
    
    results['project'] = p
    fullresults = fullresults.append(results, ignore_index=True)
    
    #print results.sort_values(['Accuracy', 'MMRE'], ascending=[0,1])
    print "Done."
    print

Project:  xd
Getting reputations...
Total number of devs:  63
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 587
Text features:  (587, 4964)
Getting issue features...
Dev+text+issues features:  (587, 5320)
Training SVMs...
Done.

Project:  dnn
Getting reputations...
Total number of devs:  114
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 586
Text features:  (586, 5604)
Getting issue features...
Dev+text+issues features:  (586, 5960)
Training SVMs...
Done.

Project:  apstud
Getting reputations...
Total number of devs:  116
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 386
Text features:  (386, 18160)
Getting issue features...
Dev+text+issues features:  (386, 18516)
Training SVMs...




Done.

Project:  mesos
Getting reputations...
Total number of devs:  87
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 555
Text features:  (555, 5243)
Getting issue features...
Dev+text+issues features:  (555, 5599)
Training SVMs...




Done.

Project:  mule
Getting reputations...
Total number of devs:  91


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 772
Text features:  (772, 4341)
Getting issue features...
Dev+text+issues features:  (772, 4697)
Training SVMs...
Done.

Project:  nexus
Getting reputations...
Total number of devs:  61
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 539
Text features:  (539, 7853)
Getting issue features...
Dev+text+issues features:  (539, 8209)
Training SVMs...




Done.

Project:  timob
Getting reputations...
Total number of devs:  275
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 1312
Text features:  (1312, 3336)
Getting issue features...
Dev+text+issues features:  (1312, 3692)
Training SVMs...




Done.

Project:  tistud
Getting reputations...
Total number of devs:  163
Getting workload...
Getting current workload...
Getting comments...
Getting velocity...
Getting text features...
ctx size 2020
Text features:  (2020, 2662)
Getting issue features...
Dev+text+issues features:  (2020, 3018)
Training SVMs...
Done.



# Results

In [79]:
print "Accuracy:"
print "Dev: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Dev', 'Accuracy'])
print "Text: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Text', 'Accuracy'])
print "Text+Dev: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Text+Dev', 'Accuracy'])
print
print "MMRE:"
print "Dev: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Dev', 'MMRE'])
print "Text: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Text', 'MMRE'])
print "Text+Dev: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Text+Dev', 'MMRE'])
print
print "MAE:"
print "Dev: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Dev', 'MAE'])
print "Text: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Text', 'MAE'])
print "Text+Dev: ", np.mean(fullresults.loc[fullresults['Classifier'] == 'Text+Dev', 'MAE'])

Accuracy:
Dev:  0.385366850943
Text:  0.352134860868
Text+Dev:  0.360825380758

MMRE:
Dev:  0.581136730256
Text:  0.668009697402
Text+Dev:  0.642347672201

MAE:
Dev:  1.91528126523
Text:  2.04129752721
Text+Dev:  2.01827134062


In [80]:
fr = fullresults[(fullresults['Classifier'] == 'Dev') | (fullresults['Classifier'] == 'Text') | (fullresults['Classifier'] == 'Text+Dev') ]
rrr = fr.pivot( index='project', columns='Classifier')[['Accuracy', 'MMRE', 'MAE']]
rrr.loc['Average']= rrr.mean()

rrr.round(3)

Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,MMRE,MMRE,MMRE,MAE,MAE,MAE
Classifier,Dev,Text,Text+Dev,Dev,Text,Text+Dev,Dev,Text,Text+Dev
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
apstud,0.348,0.343,0.348,0.852,0.772,0.768,3.738,3.671,3.681
dnn,0.504,0.467,0.471,0.335,0.364,0.37,0.661,0.719,0.718
mesos,0.279,0.324,0.322,0.634,0.552,0.559,1.476,1.432,1.431
mule,0.333,0.244,0.282,0.848,1.141,0.967,2.74,3.096,2.916
nexus,0.572,0.515,0.547,0.298,0.373,0.342,0.473,0.509,0.491
timob,0.334,0.26,0.267,0.669,0.861,0.809,2.371,2.873,2.794
tistud,0.406,0.379,0.373,0.444,0.519,0.526,1.931,2.116,2.155
xd,0.307,0.285,0.277,0.57,0.763,0.797,1.932,1.913,1.961
Average,0.385,0.352,0.361,0.581,0.668,0.642,1.915,2.041,2.018


# Random guessing

In [81]:
import random
from scipy import stats

rs = pd.DataFrame()

for p in xdf['project'].unique():
    
    df = xdf[xdf['project'] == p]
    
    df.loc[df['storypoints'] == 0, 'storypoints'] = 0.5
    
    mae_mean = np.sum(np.abs(df['storypoints'] - df['storypoints'].mean()))/len(df)
    #mre_mean = np.sum(np.abs(df['storypoints'] - df['storypoints'].mean())/df['storypoints'])/len(df)
    
    d1 = {
        "project": p,
        "Classifier": "Mean",
      #  "MMRE": mre_mean,
        "MAE": mae_mean,
        "Accuracy": None
    }
    rs = rs.append(d1, ignore_index=True)
    
    mae_median = np.sum(np.abs(df['storypoints'] - df['storypoints'].median()))/len(df)
    #mre_median = np.sum(np.abs(df['storypoints'] - df['storypoints'].median())/df['storypoints'])/len(df)
    acc_median = len(df[df['storypoints'] == df['storypoints'].median()])/len(df)
    
    d2 = {
        "project": p,
        "Classifier": "Median",
       # "MMRE": mre_median,
        "MAE": mae_median,
        "Accuracy": acc_median
    }
    rs = rs.append(d2, ignore_index=True)
    # Random Guess baseline 
    
    rguess = []
    for i in range(len(df)):
        rguess.append( random.choice(fibonacci) )

    mae_rguess = np.sum(np.abs(df['storypoints'] - rguess))/len(df)
    #mre_rguess = np.sum(np.abs(df['storypoints'] - rguess)/df['storypoints'])/len(df)
    acc_rguess = len(df[df['storypoints'] == rguess])/len(df)
    
    d3 = {
        "project": p,
        "Classifier": "Random Guess",
       # "MMRE": mre_rguess,
        "MAE": mae_rguess,
        "Accuracy": acc_rguess
    }
    rs = rs.append(d3, ignore_index=True)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [82]:
rsp = rs.pivot(index='project', columns='Classifier')
rsp = rsp.drop(columns=[('Accuracy', 'Mean')])
rsp.round(3)

Unnamed: 0_level_0,Accuracy,Accuracy,MAE,MAE,MAE
Classifier,Median,Random Guess,Mean,Median,Random Guess
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
apstud,0.334197,0.0958549,3.84,3.642,17.807
dnn,0.503413,0.0887372,0.759,0.661,16.38
mesos,0.232432,0.111712,1.427,1.425,16.947
mule,0.255181,0.101036,2.586,2.57,17.798
nexus,0.478664,0.0834879,0.593,0.495,19.72
timob,0.317073,0.10747,2.551,2.413,18.423
tistud,0.405941,0.0985149,2.161,1.931,16.774
xd,0.240204,0.102215,1.807,1.716,16.596


In [83]:
# rs + rrr
frr = pd.concat([rs,fr])[[ 'Classifier', 'project', 'Accuracy', 'MAE' ]]

frrr = frr.pivot(index='project', columns='Classifier')

#frrr.loc['Average'] = frr.pivot(index='project', columns='Classifier').mean()

frrr = frrr.astype(float)

frrr

Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,MAE,MAE,MAE,MAE,MAE,MAE
Classifier,Dev,Mean,Median,Random Guess,Text,Text+Dev,Dev,Mean,Median,Random Guess,Text,Text+Dev
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
apstud,0.348279,,0.334197,0.095855,0.342798,0.348226,3.738342,3.840372,3.642487,17.806995,3.670984,3.681347
dnn,0.503693,,0.503413,0.088737,0.467456,0.470961,0.661263,0.758515,0.661263,16.379693,0.719283,0.717577
mesos,0.279059,,0.232432,0.111712,0.323799,0.322283,1.475676,1.426826,1.425225,16.946847,1.432432,1.430631
mule,0.333043,,0.255181,0.101036,0.244141,0.281963,2.739637,2.586392,2.569948,17.797927,3.095855,2.915803
nexus,0.571589,,0.478664,0.083488,0.514946,0.546889,0.473098,0.592666,0.495362,19.719852,0.509276,0.490724
timob,0.334142,,0.317073,0.10747,0.25986,0.26662,2.371189,2.551204,2.41311,18.423399,2.873095,2.793826
tistud,0.406459,,0.405941,0.098515,0.378833,0.372897,1.931188,2.160927,1.931188,16.774257,2.116337,2.155446
xd,0.306672,,0.240204,0.102215,0.285247,0.276764,1.931857,1.806558,1.715503,16.596252,1.913118,1.960818


In [84]:
d = {
    "Dev" : (1 - frrr['MAE']['Dev']/rsp['MAE']['Random Guess'])*100,
    "Text" : (1 - frrr['MAE']['Text']/rsp['MAE']['Random Guess'])*100,
    "Text+Dev" : (1 - frrr['MAE']['Text+Dev']/rsp['MAE']['Random Guess'])*100,
    "Median" : (1 - frrr['MAE']['Median']/rsp['MAE']['Random Guess'])*100,
    "Mean" : (1 - frrr['MAE']['Mean']/rsp['MAE']['Random Guess'])*100
    }
sa = pd.DataFrame(d)

In [85]:
sa

Unnamed: 0_level_0,Dev,Mean,Median,Text,Text+Dev
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
apstud,79.006329,78.433354,79.544628,79.384593,79.326398
dnn,95.962911,95.369175,95.962911,95.608689,95.619107
mesos,91.292329,91.580583,91.590027,91.547499,91.558131
mule,84.606987,85.468018,85.560408,82.605531,83.617176
nexus,97.600903,96.99457,97.488005,97.417443,97.511525
timob,87.129471,86.15237,86.90193,84.405188,84.835447
tistud,88.487192,87.117597,88.487192,87.383426,87.150277
xd,88.35968,89.114661,89.663313,88.472593,88.185178


In [86]:
w = pd.concat([frrr, sa], axis=1)

In [87]:
w[('SA','Dev')] = w['Dev']
w[('SA','Text')] = w['Text']
w[('SA','Text+Dev')] = w['Text+Dev']
w[('SA','Mean')] = w['Mean']
w[('SA','Median')] = w['Median']

In [88]:
evalresults = w[[('Accuracy', 'Dev'),('Accuracy', 'Text'),('Accuracy', 'Text+Dev'),('Accuracy', 'Median'),('Accuracy', 'Random Guess'),
  ('MAE', 'Dev'),('MAE', 'Text'),('MAE', 'Text+Dev'),('MAE', 'Mean'),('MAE', 'Median'),('MAE', 'Random Guess'),
  ('SA','Dev'),('SA', 'Text'),('SA', 'Text+Dev'),('SA', 'Mean'),('SA', 'Median')
  ]]

In [89]:
evalresults.loc['Average'] = evalresults.mean()
evalresults.round(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,MAE,MAE,MAE,MAE,MAE,MAE,SA,SA,SA,SA,SA
Unnamed: 0_level_1,Dev,Text,Text+Dev,Median,Random Guess,Dev,Text,Text+Dev,Mean,Median,Random Guess,Dev,Text,Text+Dev,Mean,Median
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
apstud,0.348,0.343,0.348,0.334,0.096,3.738,3.671,3.681,3.84,3.642,17.807,79.006,79.385,79.326,78.433,79.545
dnn,0.504,0.467,0.471,0.503,0.089,0.661,0.719,0.718,0.759,0.661,16.38,95.963,95.609,95.619,95.369,95.963
mesos,0.279,0.324,0.322,0.232,0.112,1.476,1.432,1.431,1.427,1.425,16.947,91.292,91.547,91.558,91.581,91.59
mule,0.333,0.244,0.282,0.255,0.101,2.74,3.096,2.916,2.586,2.57,17.798,84.607,82.606,83.617,85.468,85.56
nexus,0.572,0.515,0.547,0.479,0.083,0.473,0.509,0.491,0.593,0.495,19.72,97.601,97.417,97.512,96.995,97.488
timob,0.334,0.26,0.267,0.317,0.107,2.371,2.873,2.794,2.551,2.413,18.423,87.129,84.405,84.835,86.152,86.902
tistud,0.406,0.379,0.373,0.406,0.099,1.931,2.116,2.155,2.161,1.931,16.774,88.487,87.383,87.15,87.118,88.487
xd,0.307,0.285,0.277,0.24,0.102,1.932,1.913,1.961,1.807,1.716,16.596,88.36,88.473,88.185,89.115,89.663
Average,0.385,0.352,0.361,0.346,0.099,1.915,2.041,2.018,1.965,1.857,17.556,89.056,88.353,88.475,88.779,89.4
