### <a id='Getting Data Ready2'>1. Getting Data Ready</a>

In [0]:
import pandas as pd
import json
import re

In [0]:
try:
    data = pd.read_csv('E:\yearproject.csv')
    print('File successfully loaded into the DataFrame..')
except:
    print("File not found...")

File successfully loaded into the DataFrame..


In [0]:
data.columns

Index(['action', 'url', 'title', 'body', 'labels'], dtype='object')

In [0]:
data.count()

action    7964
url       7964
title     7964
body      7920
labels    7964
dtype: int64

In [0]:
data.head(3)

Unnamed: 0,action,url,title,body,labels
0,"""closed""","""https://github.com/Microsoft/vcpkg/issues/4548""","""Error: Building package zlib:x64-windows-stat...","""\r\nPackage: zlib:x64-windows-static\r\nVcpkg...","[{""id"":455857191,""node_id"":""MDU6TGFiZWw0NTU4NT..."
1,"""closed""","""https://github.com/primefaces/primeng/issues/...","""Pagination bug when removing an item""","""Reported by a PRO user;\r\n\r\n> If for exemp...","[{""id"":312160661,""node_id"":""MDU6TGFiZWwzMTIxNj..."
2,"""closed""","""https://github.com/zen-kernel/zen-kernel/issu...","""bfq error""","""Hi, Steven/\r\nAfter last update today >git f...",[]


In [0]:
data.shape

(7964, 5)

### <a id='Data Cleaning and Munging'>2. Data Cleaning and Munging</a>

In [0]:
print("Null values in:")
data.isnull().sum()

Null values in:


action     0
url        0
title      0
body      44
labels     0
dtype: int64

In [0]:
data.dropna(inplace=True)

In [0]:
data.shape

(7920, 5)

### 3.2 Dealing with duplicates in the data set

In [0]:
print('The no. of duplicates in the data are',data.duplicated().sum())

The no. of duplicates in the data are 3


In [0]:
data.drop_duplicates(inplace=True)

In [0]:
data.isnull().sum()

action    0
url       0
title     0
body      0
labels    0
dtype: int64

In [0]:
data.shape

(7917, 5)

### <a id='Data Analysis'>2. Data Analysis</a>

In [0]:
closed=data[data['action']=="\"closed\""]
len(closed)

7917

**ALL CLOSED ISSUES**

In [0]:
#Removing url and action columns
data=data[["title","body","labels"]]

In [0]:
labeled=data[data['labels']!='[]']
unlabeled=data[data['labels']=='[]']

In [0]:
labeled.shape

(4063, 3)

In [0]:
unlabeled.shape

(3854, 3)

In [0]:
labeled.head()

Unnamed: 0,title,body,labels
0,"""Error: Building package zlib:x64-windows-stat...","""\r\nPackage: zlib:x64-windows-static\r\nVcpkg...","[{""id"":455857191,""node_id"":""MDU6TGFiZWw0NTU4NT..."
1,"""Pagination bug when removing an item""","""Reported by a PRO user;\r\n\r\n> If for exemp...","[{""id"":312160661,""node_id"":""MDU6TGFiZWwzMTIxNj..."
4,"""Helm podAntiAffinity templating error""","""**Describe the bug**\r\n\r\nHelm charts fail ...","[{""id"":670442116,""node_id"":""MDU6TGFiZWw2NzA0ND..."
5,"""[0.5][Tradingview] Quick Chart resize does no...","""**Describe the bug**\r\nThe up and down arrow...","[{""id"":1089646255,""node_id"":""MDU6TGFiZWwxMDg5N..."
6,"""Upgrade isort""","""We are using 4.3.4 but [isort[(https://pypi.o...","[{""id"":717069763,""node_id"":""MDU6TGFiZWw3MTcwNj..."


In [0]:
json.loads(labeled['labels'][5])

[{'id': 1089646255,
  'node_id': 'MDU6TGFiZWwxMDg5NjQ2MjU1',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[1b]%20User%20Story',
  'name': '[1b] User Story',
  'color': 'b8e4fc',
  'default': False},
 {'id': 272058720,
  'node_id': 'MDU6TGFiZWwyNzIwNTg3MjA=',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[3]%20Bug',
  'name': '[3] Bug',
  'color': 'b60205',
  'default': False},
 {'id': 522016380,
  'node_id': 'MDU6TGFiZWw1MjIwMTYzODA=',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[4c]%20High%20Priority',
  'name': '[4c] High Priority',
  'color': 'fbca04',
  'default': False},
 {'id': 1089668332,
  'node_id': 'MDU6TGFiZWwxMDg5NjY4MzMy',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[5a]%20Tiny',
  'name': '[5a] Tiny',
  'color': 'f9aed0',
  'default': False},
 {'id': 1133046734,
  'node_id': 'MDU6TGFiZWwxMTMzMDQ2NzM0',
  'url': 'https://api.github.com/repos/bitshares/bitshares-ui/labels/[6]%20

In [0]:
#converting string to dict
[x['name'] for x in json.loads(labeled['labels'][5])]

['[1b] User Story',
 '[3] Bug',
 '[4c] High Priority',
 '[5a] Tiny',
 '[6] RC Blockage',
 '[7] Estimated']

In [0]:
labels=[]
for i in labeled['labels']:
    labels.append([x['name'] for x in json.loads(i)])
labeled['labels']=labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [0]:
import string
from nltk.corpus import stopwords  # corpus means collection of writings

In [0]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
print(len(stopwords.words('english')))

179


In [0]:
stopword=stopwords.words('english')
stopword[:5]

['i', 'me', 'my', 'myself', 'we']

In [0]:
newStopWords = ['www','com','r','n','c','e','f','b','g','h','l','http','https','u','v','x','z','de','non','rb','ef']
stopword.extend(newStopWords)

In [0]:
def clean_text(s):
    s=re.sub("[\'\"]","",s) #remove '' and ""
    s=re.sub("\\\\r","",s)  #remove \r
    s=re.sub("\\\\n","",s)  #remove \n
    s=re.sub("[^a-zA-Z]"," ",s) #remove everything but letters
    s=''.join([char for char in s if char not in string.punctuation]) #remove all punctuations
    l=[word.lower() for word in s.split() if not word.lower() in stopword]  #remove stopwords
    return ' '.join(l)

In [0]:
labeled['title'] = labeled['title'].apply(lambda x: clean_text(x))
labeled['body'] = labeled['body'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
labeled.head()

Unnamed: 0,title,body,labels
0,error building package zlib windows static fai...,package zlib windows staticvcpkg version debug,[needs-repro]
1,pagination bug removing item,reported pro user exemple table items per page...,[enhancement]
4,helm podantiaffinity templating error,describe bug helm charts fail generate proper ...,[area/environments]
5,tradingview quick chart resize honor limit values,describe bug arrows tradingview chart size don...,"[[1b] User Story, [3] Bug, [4c] High Priority,..."
6,upgrade isort,using isort pypi org project isort latest migh...,"[feature-refactoring, good first issue, needs ..."


In [0]:
labels=[]
for i in labeled['labels']:
    labels.append(' '.join(i))

In [0]:
labels[:10]

['needs-repro',
 'enhancement',
 'area/environments',
 '[1b] User Story [3] Bug [4c] High Priority [5a] Tiny [6] RC Blockage [7] Estimated',
 'feature-refactoring good first issue needs PR type-code health',
 'feature',
 'Feature:Search Feature:Telemetry P1 a-c',
 'A-partitioning A-sql-syntax C-enhancement',
 'enhancement priority:low',
 'ApiTestUat']

In [0]:
import collections
count={}
for x in labels:
    if x not in count:
        count[x]=1
    else:
        count[x]+=1
d = collections.Counter(count)
freq=[]
i=0
for w,c in d.most_common(2502):
    freq.append([w,c])

In [0]:
freq[:10]

[['bug', 356],
 ['enhancement', 353],
 ['question', 97],
 ['greenkeeper', 45],
 ['wontfix', 37],
 ['feature', 32],
 ['help wanted', 25],
 ['duplicate', 21],
 ['good first issue', 20],
 ['task', 18]]