### <a id='Getting Data Ready2'>1. Getting Data Ready</a>

In [1]:
import pandas as pd
import json
import re

In [2]:
try:
    data = pd.read_csv('E:\yearproject.csv')
    print('File successfully loaded into the DataFrame..')
except:
    print("File not found...")

File successfully loaded into the DataFrame..


In [3]:
data.columns

Index(['action', 'url', 'title', 'body', 'labels'], dtype='object')

In [4]:
data.count()

action    7964
url       7964
title     7964
body      7920
labels    7964
dtype: int64

In [5]:
data.head(3)

Unnamed: 0,action,url,title,body,labels
0,"""closed""","""https://github.com/Microsoft/vcpkg/issues/4548""","""Error: Building package zlib:x64-windows-stat...","""\r\nPackage: zlib:x64-windows-static\r\nVcpkg...","[{""id"":455857191,""node_id"":""MDU6TGFiZWw0NTU4NT..."
1,"""closed""","""https://github.com/primefaces/primeng/issues/...","""Pagination bug when removing an item""","""Reported by a PRO user;\r\n\r\n> If for exemp...","[{""id"":312160661,""node_id"":""MDU6TGFiZWwzMTIxNj..."
2,"""closed""","""https://github.com/zen-kernel/zen-kernel/issu...","""bfq error""","""Hi, Steven/\r\nAfter last update today >git f...",[]


In [6]:
data.shape

(7964, 5)

### <a id='Data Cleaning and Munging'>2. Data Cleaning and Munging</a>

In [7]:
print("Null values in:")
data.isnull().sum()

Null values in:


action     0
url        0
title      0
body      44
labels     0
dtype: int64

In [8]:
data.dropna(inplace=True)

In [9]:
data.shape

(7920, 5)

### 3.2 Dealing with duplicates in the data set

In [10]:
print('The no. of duplicates in the data are',data.duplicated().sum())

The no. of duplicates in the data are 3


In [11]:
data.drop_duplicates(inplace=True)

In [12]:
data.isnull().sum()

action    0
url       0
title     0
body      0
labels    0
dtype: int64

In [13]:
data.shape

(7917, 5)

### <a id='Data Analysis'>2. Data Analysis</a>

In [14]:
closed=data[data['action']=="\"closed\""]
len(closed)

7917

**ALL CLOSED ISSUES**

In [15]:
#Removing url and action columns
data=data[["title","body","labels"]]

In [16]:
labeled=data[data['labels']!='[]']
unlabeled=data[data['labels']=='[]']

In [17]:
labeled.shape

(4063, 3)

In [18]:
unlabeled.shape

(3854, 3)

In [19]:
labeled.head()

Unnamed: 0,title,body,labels
0,"""Error: Building package zlib:x64-windows-stat...","""\r\nPackage: zlib:x64-windows-static\r\nVcpkg...","[{""id"":455857191,""node_id"":""MDU6TGFiZWw0NTU4NT..."
1,"""Pagination bug when removing an item""","""Reported by a PRO user;\r\n\r\n> If for exemp...","[{""id"":312160661,""node_id"":""MDU6TGFiZWwzMTIxNj..."
4,"""Helm podAntiAffinity templating error""","""**Describe the bug**\r\n\r\nHelm charts fail ...","[{""id"":670442116,""node_id"":""MDU6TGFiZWw2NzA0ND..."
5,"""[0.5][Tradingview] Quick Chart resize does no...","""**Describe the bug**\r\nThe up and down arrow...","[{""id"":1089646255,""node_id"":""MDU6TGFiZWwxMDg5N..."
6,"""Upgrade isort""","""We are using 4.3.4 but [isort[(https://pypi.o...","[{""id"":717069763,""node_id"":""MDU6TGFiZWw3MTcwNj..."


In [20]:
json.loads(labeled['labels'][5])

[{'id': 1089646255,
  'node_id': 'MDU6TGFiZWwxMDg5NjQ2MjU1',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[1b]%20User%20Story',
  'name': '[1b] User Story',
  'color': 'b8e4fc',
  'default': False},
 {'id': 272058720,
  'node_id': 'MDU6TGFiZWwyNzIwNTg3MjA=',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[3]%20Bug',
  'name': '[3] Bug',
  'color': 'b60205',
  'default': False},
 {'id': 522016380,
  'node_id': 'MDU6TGFiZWw1MjIwMTYzODA=',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[4c]%20High%20Priority',
  'name': '[4c] High Priority',
  'color': 'fbca04',
  'default': False},
 {'id': 1089668332,
  'node_id': 'MDU6TGFiZWwxMDg5NjY4MzMy',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[5a]%20Tiny',
  'name': '[5a] Tiny',
  'color': 'f9aed0',
  'default': False},
 {'id': 1133046734,
  'node_id': 'MDU6TGFiZWwxMTMzMDQ2NzM0',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[6]%20

In [21]:
#converting string to dict
[x['name'] for x in json.loads(labeled['labels'][5])]

['[1b] User Story',
 '[3] Bug',
 '[4c] High Priority',
 '[5a] Tiny',
 '[6] RC Blockage',
 '[7] Estimated']

In [22]:
labels=[]
for i in labeled['labels']:
    labels.append([x['name'] for x in json.loads(i)])
labeled['labels']=labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [23]:
import string
from nltk.corpus import stopwords  # corpus means collection of writings

In [24]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
print(len(stopwords.words('english')))

179


In [26]:
stopword=stopwords.words('english')
stopword[:5]

['i', 'me', 'my', 'myself', 'we']

In [27]:
newStopWords = ['www','com','r','n','c','e','f','b','g','h','l','http','https','u','v','x','z','de','non','rb','ef']
stopword.extend(newStopWords)

In [28]:
def clean_text(s):
    s=re.sub("[\'\"]","",s) #remove '' and ""
    s=re.sub("\\\\r","",s)  #remove \r
    s=re.sub("\\\\n","",s)  #remove \n
    s=re.sub("[^a-zA-Z]"," ",s) #remove everything but letters
    s=''.join([char for char in s if char not in string.punctuation]) #remove all punctuations
    l=[word.lower() for word in s.split() if not word.lower() in stopword]  #remove stopwords
    return ' '.join(l)

In [29]:
labeled['title'] = labeled['title'].apply(lambda x: clean_text(x))
labeled['body'] = labeled['body'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
labeled.head()

Unnamed: 0,title,body,labels
0,error building package zlib windows static fai...,package zlib windows staticvcpkg version debug,[needs-repro]
1,pagination bug removing item,reported pro user exemple table items per page...,[enhancement]
4,helm podantiaffinity templating error,describe bug helm charts fail generate proper ...,[area/environments]
5,tradingview quick chart resize honor limit values,describe bug arrows tradingview chart size don...,"[[1b] User Story, [3] Bug, [4c] High Priority,..."
6,upgrade isort,using isort pypi org project isort latest migh...,"[feature-refactoring, good first issue, needs ..."


In [31]:
def clean_label(l):
    s=','.join(l)
    s=re.sub("[\[\\]]",'',s)
    s=re.sub("['0123456789']",'',s)
    s=''.join([char.lower() for char in s])
    st=[]
    for com in s.split(','):
        l=[]
        for wo in com.split():
            if not wo in ['r','c','b','rc','a','l','x','z',':','p:']:
                l.append(wo)
        st.append(' '.join(l))
    return ','.join(st) 

In [32]:
labeled['try']=labeled['labels'].apply(lambda x: clean_label(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
labels=[]
for i in labeled['try']:
    for com in i.split(','):
        labels.append(''.join(com))

In [34]:
import collections
count=dict()
for x in labels:
    if x not in count:
        count[x]=1
    else:
        count[x]+=1
d = collections.Counter(count)

In [35]:
i=0
for w,c in d.most_common(2767):
    i+=1
print("unique labels =",i)

unique labels = 2767


In [36]:
freq=dict()
for w,c in d.most_common(2767):
    freq[w]=c

In [37]:
freq

{'bug': 731,
 'enhancement': 606,
 'question': 157,
 'help wanted': 106,
 'good first issue': 89,
 'feature': 80,
 'wontfix': 75,
 'type: bug': 50,
 'stale': 50,
 'p': 48,
 'greenkeeper': 47,
 'in progress': 43,
 'triaged': 42,
 'documentation': 38,
 'task': 36,
 'medium': 36,
 'duplicate': 35,
 'invalid': 30,
 'kind/bug': 29,
 'feature request': 26,
 'docs': 25,
 'user-submission': 25,
 'high priority': 23,
 'priority: high': 23,
 'ui': 23,
 'assigned-to-author': 23,
 'type: enhancement': 22,
 'priority: medium': 22,
 'review': 21,
 '': 21,
 'design': 19,
 'high': 19,
 'n: adguard browser extension': 18,
 'frontend': 18,
 'epic': 18,
 'support': 17,
 'fixed': 17,
 'browser-firefox': 17,
 'backend': 15,
 'priority/p': 15,
 'new-version': 14,
 'cxp': 14,
 'triage': 14,
 'critical': 14,
 'browser-firefox-mobile': 14,
 'discussion': 13,
 'sprint': 13,
 'version/.': 13,
 '.': 13,
 'doc-enhancement': 12,
 'approved': 12,
 'eps': 11,
 'in scope (june )': 11,
 'backlog': 11,
 'target/..': 11,

In [38]:
freq['bug']+=freq.pop('type: bug')
freq['bug']+=freq.pop('kind/bug')
freq['bug']+=freq.pop('type_bug')
freq['bug']+=freq.pop('type:bug')
freq['bug']+=freq.pop('type bug')
freq['bug']+=freq.pop('type.bug')
freq['bug']+=freq.pop('type-bug')
freq['enhancement']+=freq.pop('type: enhancement')
freq['enhancement']+=freq.pop('type:enhancement')
freq['enhancement']+=freq.pop('kind/enhancement')
freq['feature']+=freq.pop('feature request')
freq['feature']+=freq.pop('new feature')
freq['feature']+=freq.pop('type: feature')
freq['documentation']+=freq.pop('docs')
freq['high priority']+=freq.pop('priority: high')
freq['high priority']+=freq.pop('priority - high')
freq['high priority']+=freq.pop('priority/high')
freq['high priority']+=freq.pop('priority.high')
freq['priority: medium']+=freq.pop('priority.medium')
freq['priority: medium']+=freq.pop('medium priority')
freq['question']+=freq.pop('type: question')
freq['question']+=freq.pop('kind/question')
freq['enhancement']+=freq.pop('improvement')
freq['support']+=freq.pop('type:support')
freq['in progress']+=freq.pop('in-progress')
freq['good first issue']+=freq.pop('good-first-issue')

In [39]:
freq

{'bug': 841,
 'enhancement': 646,
 'question': 173,
 'help wanted': 106,
 'good first issue': 93,
 'feature': 122,
 'wontfix': 75,
 'stale': 50,
 'p': 48,
 'greenkeeper': 47,
 'in progress': 51,
 'triaged': 42,
 'documentation': 63,
 'task': 36,
 'medium': 36,
 'duplicate': 35,
 'invalid': 30,
 'user-submission': 25,
 'high priority': 64,
 'ui': 23,
 'assigned-to-author': 23,
 'priority: medium': 39,
 'review': 21,
 '': 21,
 'design': 19,
 'high': 19,
 'n: adguard browser extension': 18,
 'frontend': 18,
 'epic': 18,
 'support': 26,
 'fixed': 17,
 'browser-firefox': 17,
 'backend': 15,
 'priority/p': 15,
 'new-version': 14,
 'cxp': 14,
 'triage': 14,
 'critical': 14,
 'browser-firefox-mobile': 14,
 'discussion': 13,
 'sprint': 13,
 'version/.': 13,
 '.': 13,
 'doc-enhancement': 12,
 'approved': 12,
 'eps': 11,
 'in scope (june )': 11,
 'backlog': 11,
 'target/..': 11,
 'lang/zh': 11,
 'priority: low': 11,
 'resolved': 11,
 'kind/feature': 11,
 'todo': 11,
 'a: resolved': 11,
 'gitalk':

In [40]:
len(freq)

2741