# Contagem* de modificações de campos por tipo de usuário (relator ou outros usuários)
\* Considerando apenas os 39 campos que não são *custom fields*.

## Importações

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Use the ggplot look for every figure and render floats with two decimals in tables.
plt.style.use('ggplot')
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Load the pre-processed bug records.
# A forward slash is accepted by open() on Windows and POSIX alike, whereas the
# previous hard-coded backslash only resolved on Windows. The handle is no longer
# called `input`, which shadowed the builtin, and the encoding is explicit so the
# result does not depend on the platform's default locale.
with open('data/processed_bugs_counting_changes_users_final.json', encoding='utf-8') as bugs_file:
  bugs = json.load(bugs_file)

## Pré-processamento

In [2]:
# Per-field change counts of every bug, split by who made the change.
# Shallow copies are taken so the frames are not built from the loaded records themselves.
bugs_reporter = [dict(bug['reporter']) for bug in bugs]
bugs_others = [dict(bug['others']) for bug in bugs]

# Per-bug totals: all field changes summed for each user type.
bugs_with_total_changes = [
  {'reporter': sum(by_reporter.values()), 'others': sum(by_others.values())}
  for by_reporter, by_others in zip(bugs_reporter, bugs_others)
]

df_total_changes = pd.DataFrame(bugs_with_total_changes)
df_reporter = pd.DataFrame(bugs_reporter)
df_others = pd.DataFrame(bugs_others)

# The intermediate lists are not needed once the frames exist; empty them to release memory.
for consumed in (bugs, bugs_with_total_changes, bugs_reporter, bugs_others):
  consumed.clear()

# Field names, in the column order of the reporter frame.
fields = df_reporter.columns.to_list()

## 1. Estatísticas sobre o total de modificações por tipo de usuário

In [5]:
# Summary statistics of the per-bug total change counts, one column per user type
# ('reporter' and 'others').
df_total_changes.describe()

Unnamed: 0,reporter,others
count,690817.0,690817.0
mean,3.1,9.15
std,6.74,12.07
min,0.0,0.0
25%,0.0,3.0
50%,2.0,6.0
75%,4.0,11.0
max,1480.0,1218.0


## 2. Estatísticas sobre a porcentagem de modificações por tipo de usuário

In [6]:
# Reporter's share of each bug's changes, as a percentage of all changes on that bug.
# A bug with no changes at all would give 0/0, i.e. NaN, in both columns.
changes_per_bug = df_total_changes['reporter'] + df_total_changes['others']
df_total_changes['Pct relator'] = df_total_changes['reporter'].div(changes_per_bug).mul(100)

# Whatever the reporter did not change was changed by the other users.
df_total_changes['Pct others'] = df_total_changes['Pct relator'].rsub(100)

df_total_changes[['Pct relator', 'Pct others']].describe()

Unnamed: 0,Pct relator,Pct others
count,690817.0,690817.0
mean,27.18,72.82
std,27.13,27.13
min,0.0,0.0
25%,0.0,57.14
50%,20.0,80.0
75%,42.86,100.0
max,100.0,100.0


## 3. Os 10 campos com maior percentual de mudança por parte do relator

In [None]:

# Percentage of bugs in which the reporter changed each field at least once.
# All fields are computed in one vectorized pass instead of appending rows one at a
# time with `.loc`, which reallocates the frame on every insert and leaves the column
# dtypes to be inferred from an empty frame.
# A missing count (NaN) is treated as "no change"; NaN would otherwise cast to True.
reporter_changed = df_reporter[fields].fillna(0).astype(bool)
reporter_change_pct = (reporter_changed.sum(axis=0) / len(df_reporter)) * 100

# One row per field, in the order of `fields`, with a default 0..n-1 index.
df_reporter_field_change_percents = (
  reporter_change_pct
  .rename_axis('field')
  .reset_index(name='percentage')
)

# Top 10 fields by share of bugs changed by the reporter.
df_reporter_field_change_percents.sort_values(by='percentage', ascending=False).head(10).reset_index(drop=True)

Unnamed: 0,field,percentage
0,flagtypes.name,34.48
1,cc,28.5
2,status,25.12
3,resolution,21.32
4,assigned_to,12.6
5,blocks,12.34
6,comment_tag,11.46
7,depends_on,7.91
8,summary,6.04
9,attachments.isobsolete,5.23


## 4. Os 10 campos com maior percentual de mudança por parte dos outros usuários

In [7]:

# Percentage of bugs in which users other than the reporter changed each field at
# least once.
# All fields are computed in one vectorized pass instead of appending rows one at a
# time with `.loc`, which reallocates the frame on every insert and leaves the column
# dtypes to be inferred from an empty frame.
# A missing count (NaN) is treated as "no change"; NaN would otherwise cast to True.
others_changed = df_others[fields].fillna(0).astype(bool)
others_change_pct = (others_changed.sum(axis=0) / len(df_others)) * 100

# One row per field, in the order of `fields`, with a default 0..n-1 index.
df_others_field_change_percents = (
  others_change_pct
  .rename_axis('field')
  .reset_index(name='percentage')
)

# Top 10 fields by share of bugs changed by the other users.
df_others_field_change_percents.sort_values(by='percentage', ascending=False).head(10).reset_index(drop=True)

Unnamed: 0,field,percentage
0,status,81.54
1,resolution,81.0
2,cc,68.13
3,flagtypes.name,47.45
4,target_milestone,28.92
5,product,24.68
6,assigned_to,23.35
7,comment_tag,20.48
8,component,19.81
9,whiteboard,14.34
