In [1]:
import re
# import nltk
import textstat
import pandas as pd
import numpy as np
import datetime
import statsmodels.api as sm
from collections import Counter
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
import pymongo
from sklearn import model_selection
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, KFold, HalvingGridSearchCV

import xgboost

# nltk.download("wordnet")
# nltk.download("omw-1.4")

In [2]:
def connect_mongo(query=None, host='localhost', port=27017, username=None, password=None, db='test'):
    """Create a ``pymongo.MongoClient``.

    Parameters
    ----------
    query : unused
        Not read by this function; kept only so existing calls keep working.
    host, port :
        Server address. Passed straight to ``MongoClient`` when no
        credentials are given.
    username, password :
        When BOTH are truthy, a ``mongodb://user:pass@host:port/db`` URI is
        built and used instead. If only one of them is given, both are ignored
        and an unauthenticated connection is made.
    db :
        Database name placed in the URI path (only used with credentials).

    Returns
    -------
    pymongo.MongoClient
    """
    if username and password:
        # Credentials must be percent-escaped before they are embedded in a
        # MongoDB URI, otherwise characters such as '@', ':' or '/' corrupt it.
        from urllib.parse import quote_plus

        mongo_uri = "mongodb://%s:%s@%s:%s/%s" % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        client = pymongo.MongoClient(mongo_uri)
    else:
        client = pymongo.MongoClient(host, port)
    return client
    
# Connect with the defaults and bind the two collections used by the cells below.
client = connect_mongo()
collect = client["issues"]["first_issues"]        # iterated and loaded into `df` below
tmp_collect = client["gfibot"]["resolved_issue"]  # looked up per owner/name/resolver below
# Earlier, narrower query kept for reference:
# db = collect.find({"resolver_commit_num":{"$gte": 0, "$lte": 2}}).sort("closed_at",pymongo.ASCENDING)

In [3]:
# Materialise every document of `collect` into a single DataFrame.
df = pd.DataFrame(list(collect.find()))

In [4]:
# Sort "owner/name/resolver" keys into two sets, one lookup per issue:
#   otc  - `tmp_collect` holds at most one document for that key, or the
#          `.days` of (latest resolved_at - this issue's closed_at) is below 7;
#   notc - every other case.
# NOTE(review): the 7-day test uses each issue's own closed_at, so the same
# key can end up in both sets.
otc = set()
notc = set()
cnt = 0  # never updated in this cell
for data in collect.find():
    resolver_filter = {"owner": data["owner"], "name": data["name"], "resolver": data["resolver"]}
    if tmp_collect.count_documents(resolver_filter) <= 1:
        otc.add("/".join((data["owner"], data["name"], data["resolver"])))
        continue
    # Most recently resolved document for this key (cursor limited to one item).
    res = tmp_collect.find(resolver_filter).sort("resolved_at", pymongo.DESCENDING).limit(1)
    for item in res:
        gap_days = (item["resolved_at"] - data["closed_at"]).days
        target = otc if gap_days < 7 else notc
        target.add("/".join((item["owner"], item["name"], item["resolver"])))

In [5]:
# Number of distinct owner/name/resolver keys collected in each set.
len(otc),len(notc)

(8779, 3078)

In [6]:
# Split the issue rows by whether their owner/name/resolver key is in `otc`.
# NOTE(review): `notc_iss` is the complement of `otc`; the `notc` set itself is
# not consulted here - confirm that this is intended.
resolver_key = df["owner"] + "/" + df["name"] + "/" + df["resolver"]
in_otc = resolver_key.isin(otc)
otc_iss, notc_iss = df[in_otc].copy(), df[~in_otc].copy()

In [7]:
# Row counts of the two issue subsets.
len(otc_iss), len(notc_iss)

(11308, 3633)

In [8]:
# Class label: 0 for rows whose key is in `otc`, 1 for all remaining rows.
otc_iss['label'], notc_iss['label'] = 0, 1

In [9]:
# Stack both labelled subsets into one frame; row labels from `df` are kept
# (pd.concat is called without ignore_index).
iss_dataset = pd.concat([otc_iss,notc_iss])

In [10]:
# Columns copied unchanged from the issue frame into the training frame.
metrics = [
    # body size and links
    "len_body",
    "n_urls",
    # readability scores
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    # counts and open-issue ratio (presumably repository-level - verify)
    "n_stars",
    "n_commits",
    "n_contributors",
    "n_closed_issues",
    "n_open_issues",
    "r_open_issues",
]

In [11]:
# Start the training frame from the selected metric columns (independent copy).
dataset = iss_dataset[metrics].copy()

In [12]:
def language_n(language:str)->int:
    """Encode a language name as a small integer code.

    'JavaScript' -> 1, 'Python' -> 2, 'TypeScript' -> 3. Every other value
    (including non-string input such as None) -> 0. Matching is exact and
    case-sensitive.
    """
    coded_languages = (
        ('JavaScript', 1),
        ('Python', 2),
        ('TypeScript', 3),
        # Disabled codes, currently folded into 0:
        # ('C++', 4), ('C#', 5), ('PHP', 6), ('java', 7), ('HTML', 8),
    )
    for name, code in coded_languages:
        if language == name:
            return code
    return 0
# Integer language code per issue, derived from the `language` column.
dataset['language_n'] = iss_dataset.language.map(language_n)

In [13]:
def changed_lines(n:int)->int:
    """Bucket a changed-line count into an ordinal size class.

    n <= 10 -> 1, n <= 100 -> 2, n <= 500 -> 3, anything else -> 5.
    A value that fails every comparison (e.g. NaN) also yields 5.

    NOTE(review): the top bucket is 5, not 4 - confirm the gap is intentional.
    """
    return 1 if n <= 10 else 2 if n <= 100 else 3 if n <= 500 else 5
# Bucket the mean of additions and deletions for every issue.
dataset['changed_lines'] = (
    (iss_dataset['additions'] + iss_dataset['deletions']) / 2
).map(changed_lines)


def iss_type(row)-> int:
    """Classify an issue row as bug (1), feature (2) or other (0).

    Rules are tried in this order and the first match wins:
      1. row["label_category"]["bug"] >= 1                      -> 1
      2. lower-cased title contains "bug"/"fail"/"error"/"wrong" -> 1
      3. label_category "feature" >= 1 or "enhance" >= 1         -> 2
      4. lower-cased title contains "feat"/"add"/"new"           -> 2
      5. otherwise                                               -> 0

    Title checks are plain substring tests, so e.g. "address" matches "add".
    """
    categories = row["label_category"]
    if categories["bug"] >= 1:
        return 1

    # The title is only read once the bug-label rule has not matched.
    title = row["title"].lower()
    if any(word in title for word in ("bug", "fail", "error", "wrong")):
        return 1

    if categories["feature"] >= 1 or categories["enhance"] >= 1:
        return 2
    if any(word in title for word in ("feat", "add", "new")):
        return 2
    return 0
# Row-wise issue type (0 = other, 1 = bug, 2 = feature) computed by iss_type.
dataset['type'] = iss_dataset.apply(iss_type, axis=1)

In [14]:
# Lengths of the `comments` and `labels` values of each issue.
dataset['n_comment'] = iss_dataset['comments'].map(len)
dataset['n_label'] = iss_dataset['labels'].map(len)

# Pull the n_commits / n_issues / n_pulls entries out of `reporter_feat`.
for _stat in ("commits", "issues", "pulls"):
    dataset['reporter_' + _stat] = iss_dataset['reporter_feat'].map(
        lambda feat, key='n_' + _stat: feat[key]
    )

In [15]:
# Carry the 0/1 class label over to the training frame.
dataset['label'] = iss_dataset['label']

In [16]:
# Display the assembled training frame (features plus `label`).
dataset

Unnamed: 0,len_body,n_urls,flesch_reading_ease,flesch_kincaid_grade,n_stars,n_commits,n_contributors,n_closed_issues,n_open_issues,r_open_issues,language_n,changed_lines,type,n_comment,n_label,reporter_commits,reporter_issues,reporter_pulls,label
0,118,2,46.37,10.9,145,681,24,20,102,0.836066,2,2,2,0,2,217,96,6,0
2,118,2,46.37,10.9,145,681,29,85,45,0.346154,2,2,2,0,3,284,100,9,0
3,11,0,68.77,6.4,282,355,61,27,24,0.470588,0,2,0,9,2,209,33,62,0
4,118,2,46.37,10.9,145,681,30,90,41,0.312977,2,2,2,0,3,297,101,9,0
5,48,1,31.89,12.3,1040,1003,235,857,214,0.199813,0,2,0,22,2,42,3,22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14919,29,0,89.45,2.6,40019,41974,686,52150,3829,0.068401,3,2,1,6,2,5583,2010,209,1
14921,106,0,70.84,7.7,40019,43167,755,54564,4349,0.073821,3,1,1,2,2,0,51,0,1
14923,20,0,51.18,11.1,40019,41974,721,53260,4037,0.070457,3,1,1,12,4,57,84,84,1
14929,239,2,67.04,7.1,40019,43167,758,54780,4438,0.074943,3,2,0,9,3,2,5,1,1


In [34]:
# Store the dataset used for model training in MongoDB (issues.train_data).
# NOTE: re-running this cell inserts the same documents again.
import json
traindata_collect = client.issues.train_data
# Serialise row by row with orient="records". Going through `dataset.T` first
# would coerce the mixed int/float columns to one common dtype, so integer
# features and the label would be stored as floats, and it would also require
# unique row labels. The JSON round trip turns numpy scalars into plain Python
# values that pymongo can encode.
tmp_data = json.loads(dataset.to_json(orient="records"))
res = traindata_collect.insert_many(tmp_data)