In [1]:
# libraries
import pandas as pd
import numpy as np
from pymongo import MongoClient

In [4]:
# Load the two CSV exports: commit-level rows into `git`, issue-level rows
# into `issues` (read in that order).
git, issues = (pd.read_csv(csv_path) for csv_path in ('gitData.csv', 'gitIssues.csv'))

In [3]:
# Show (rows, columns) of each frame, commits first and issues second.
print(git.shape, issues.shape, sep="\n")

(60996, 30)
(32329, 22)


In [4]:
# Data inspection: column names, non-null counts and dtypes of each frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None"
# line after each report.
git.info()
issues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60996 entries, 0 to 60995
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          60996 non-null  int64  
 1   hash                60996 non-null  object 
 2   msg                 60996 non-null  object 
 3   author_name         60996 non-null  object 
 4   committer_name      60996 non-null  object 
 5   author_date         60996 non-null  object 
 6   author_timezone     60996 non-null  int64  
 7   committer_date      60996 non-null  object 
 8   committer_timezone  60996 non-null  int64  
 9   branches            60996 non-null  object 
 10  in_main_branch      60996 non-null  bool   
 11  merge               60996 non-null  bool   
 12  parents             60996 non-null  object 
 13  project_name        60996 non-null  object 
 14  deletions           60996 non-null  int64  
 15  insertions          60996 non-null  int64  
 16  line

In [5]:
# Open a client with MongoClient's default connection settings.
client = MongoClient()

# Handle to the `github` database (item access is equivalent to client.github).
db = client["github"]

# Handle to the collection that stores the git issues.
gitIssues = db["gitIssues"]

# Show the database and collection handles that were obtained.
connection_report = """
Database
==========
{}

Collection
==========
{}
"""
print(connection_report.format(db, gitIssues), flush=True)


Database
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github')

Collection
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github'), 'gitIssues')



In [5]:
%%time 
##
# slow loading of data
d_gitissue = {}
# pass data 
for i in issues.index:
    d_gitissue = {
        "Questions": {
            "Title": issues.loc[i,"title"],
            "Body": issues.loc[i,"body"]
        },
        "User" : {
            "Username" : issues.loc[i, "user"],
            "User_ID" :  issues.loc[i,"user_id"]
        },
        "State" : {
            "State" : issues.loc[i, "state"],
            "Created_at" :  issues.loc[i,"created_at"],
            "Updated_at" :  issues.loc[i,"updated_at"],
            "Closed_at" : issues.loc[i, "closed_at"]
        },
        "Assignees" : issues.loc[i, "assignees"],
        "Closed_by" : issues.loc[i, "closed_by"],
        "Labels" : issues.loc[i, "labels"],
        "Reactions" : issues.loc[i, "reactions"],
        "N_comments" : issues.loc[i, "n_comments"].astype(str),
        "Projects" : issues.loc[i, "project"]
    }
    if issues.loc[i,"title"] == issues.loc[i-1,"title"]:
        continue
    else:
        gitIssues.insert_one(d_gitissue)

CPU times: user 1.4 ms, sys: 211 µs, total: 1.61 ms
Wall time: 3.31 ms


In [8]:
# Handle to the collection that stores one document per issue comment.
IssueComment = db["IssueComment"]

# Show the database and collection handles that were obtained.
connection_report = """
Database
==========
{}

Collection
==========
{}
"""
print(connection_report.format(db, IssueComment), flush=True)


Database
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github')

Collection
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github'), 'IssueComment')



In [6]:
%%time 
##
# slow loading of data
d_comment = {}
# pass data 
for i in issues.index:
    d_comment = {
        "Title": issues.loc[i,"title"],
        "Comment_created_at": issues.loc[i,"comment_created_at"],
        "Labels" : issues.loc[i, "labels"],
        "Reactions" : issues.loc[i, "reactions"],
        "N_comments" : issues.loc[i, "n_comments"].astype(str),
        "Projects" : issues.loc[i, "project"]
    }
    IssueComment.insert_one(d_comment)

CPU times: user 327 µs, sys: 49 µs, total: 376 µs
Wall time: 385 µs


In [4]:
# Open a client with MongoClient's default connection settings.
client = MongoClient()

# Handle to the `github` database (item access is equivalent to client.github).
db = client["github"]

In [5]:
# Handle to the collection that receives the commit data.
# NOTE(review): a later cell rebinds gitData to db.gitData rather than
# db.gitData_new - confirm which collection the cleaning steps should target.
gitData = db["gitData_new"]

# Show the database and collection handles that were obtained.
connection_report = """
Database
==========
{}

Collection
==========
{}
"""
print(connection_report.format(db, gitData), flush=True)


Database
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github')

Collection
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github'), 'gitData_new')



In [6]:
# Load the commit data into MongoDB.
# Slow loading of data: one insert_one round trip per row of `git`.
def build_commit_document(frame, label):
    """Return the nested commit document for the row of `frame` labelled `label`.

    Fields read through text() are stored as strings (via .astype(str));
    fields read through cell() are copied from the frame unchanged.
    """
    def cell(column):
        return frame.loc[label, column]

    def text(column):
        return cell(column).astype(str)

    return {
        "Commit": {
            "hash": cell("hash"),
            "Msg": {
                "msg": cell('msg'),
                "Author": {
                    "Author_name": cell('author_name'),
                    "Author_date": cell('author_date'),
                    "Author_timezone": text('author_timezone'),
                },
            },
            "Committer": {
                "Committer_name": cell('committer_name'),
                "Committer_date": cell('committer_date'),
                "Committer_timezone": text('committer_timezone'),
            },
        },
        "Branch": {
            "Branches": cell('branches'),
            "In_main_branch": text('in_main_branch'),
            "Merge": text('merge'),
            "Parents": cell('parents'),
        },
        "Project_name": cell('project_name'),
        "File": {
            "Filename": cell('filename'),
            "Change_type": cell('change_type'),
            "Commit_change": {
                "Deletions": text('deletions'),
                "Insertions": text('insertions'),
                "Files": text('files'),
                "Lines": text('lines'),
            },
        },
        "Code_change": {
            "Path": {
                "old_path": cell('old_path'),
                "new_path": cell('new_path'),
            },
            "Diff": {
                "Diff": cell('diff'),
                "Diff_parse": {
                    "Diff_parsed": cell('diff_parsed'),
                    "Deleted_lines": text('deleted_lines'),
                },
            },
            "Source_code": {
                "Source_code": cell('source_code'),
                "Source_code_before": cell('source_code_before'),
            },
        },
        "Nloc": cell('nloc'),
        "Complexity": cell('complexity'),
        "Token_count": cell('token_count'),
    }


# Most recently built document; remains at notebook scope after the loop.
d_gitcommit = {}
for i in git.index:
    d_gitcommit = build_commit_document(git, i)
    gitData.insert_one(d_gitcommit)

NameError: name 'git' is not defined

In [2]:
# Open a client with MongoClient's default connection settings and take a
# handle to the `github` database.
client = MongoClient()
db = client["github"]

# Handles to the three collections used by the cleaning cells.
# NOTE(review): the insertion cell writes commits through db.gitData_new;
# confirm that db.gitData is the intended collection here.
gitData = db["gitData"]
gitIssues = db["gitIssues"]
IssueComment = db["IssueComment"]



In [8]:
# data cleaning
# Collect the dotted field paths of an issue document so that NaN values can
# be unset field by field.  The most recently built issue document
# (d_gitissue) serves as the template: a nested dict contributes one
# "Parent.Child" path per child key, any other value contributes its own key.
# The nested case is detected explicitly instead of with a bare `except:`,
# which would also have hidden unrelated errors and interrupts.
key_list_issue = []
for field, value in d_gitissue.items():
    if isinstance(value, dict):
        key_list_issue.extend(str(field) + '.' + str(child) for child in value)
    else:
        key_list_issue.append(field)

In [10]:
# Collect the dotted field paths of a comment document, using the most
# recently built one (d_comment) as the template: a nested dict contributes
# one "Parent.Child" path per child key, any other value contributes its own
# key.  The nested case is detected explicitly instead of with a bare
# `except:`, which would also have hidden unrelated errors and interrupts.
key_list_comment = []
for field, value in d_comment.items():
    if isinstance(value, dict):
        key_list_comment.extend(str(field) + '.' + str(child) for child in value)
    else:
        key_list_comment.append(field)

In [11]:
# Collect the dotted path of every leaf field of a commit document, using the
# most recently built one (d_gitcommit) as the template.
#
# Commit documents nest several levels deep (for example
# "Code_change.Diff.Diff_parse.Deleted_lines").  Flattening only one level
# produced paths such as "Commit.Msg" that name whole sub-documents, and a
# sub-document never equals NaN, so the NaN clean-up could not reach the
# leaves underneath.  The walk below therefore descends until it reaches a
# non-dict value.
def _leaf_paths(document, prefix=""):
    """Return the dotted paths of all non-dict values inside `document`.

    `prefix` is prepended to every path; it is used for the recursion and
    should be left at its default by callers.
    """
    paths = []
    for field, value in document.items():
        path = prefix + str(field)
        if isinstance(value, dict):
            paths.extend(_leaf_paths(value, path + '.'))
        else:
            paths.append(path)
    return paths


key_list_commit = _leaf_paths(d_gitcommit)

In [13]:
# For each issue field path, remove the field from every gitIssues document
# in which its value is NaN, then report the matched / modified counts.
unset_report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
for i in key_list_issue:
    field_path = str(i)
    update = gitIssues.update_many(
        {field_path: np.nan},
        {"$unset": {field_path: ""}},
    )
    print(unset_report.format(i, update.matched_count, update.modified_count), flush=True)


    Key: Questions.Title
    Matched: 0
    Modified: 0
    ------------
    

    Key: Questions.Body
    Matched: 461
    Modified: 461
    ------------
    

    Key: User.Username
    Matched: 0
    Modified: 0
    ------------
    

    Key: User.User_ID
    Matched: 0
    Modified: 0
    ------------
    

    Key: State.State
    Matched: 0
    Modified: 0
    ------------
    

    Key: State.Created_at
    Matched: 0
    Modified: 0
    ------------
    

    Key: State.Updated_at
    Matched: 0
    Modified: 0
    ------------
    

    Key: State.Closed_at
    Matched: 0
    Modified: 0
    ------------
    

    Key: Assignees
    Matched: 0
    Modified: 0
    ------------
    

    Key: Closed_by
    Matched: 19088
    Modified: 19088
    ------------
    

    Key: Labels
    Matched: 0
    Modified: 0
    ------------
    

    Key: Reactions
    Matched: 0
    Modified: 0
    ------------
    

    Key: N_comments
    Matched: 0
    Modified: 0
    ------------
    



In [14]:
# For each comment field path, remove the field from every IssueComment
# document in which its value is NaN, then report the matched / modified
# counts.
unset_report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
for i in key_list_comment:
    field_path = str(i)
    update = IssueComment.update_many(
        {field_path: np.nan},
        {"$unset": {field_path: ""}},
    )
    print(unset_report.format(i, update.matched_count, update.modified_count), flush=True)


    Key: Title
    Matched: 0
    Modified: 0
    ------------
    

    Key: Comment_created_at
    Matched: 0
    Modified: 0
    ------------
    

    Key: Labels
    Matched: 0
    Modified: 0
    ------------
    

    Key: Reactions
    Matched: 0
    Modified: 0
    ------------
    

    Key: N_comments
    Matched: 0
    Modified: 0
    ------------
    

    Key: Projects
    Matched: 0
    Modified: 0
    ------------
    


In [None]:
# For each commit field path, remove the field from every gitData document in
# which its value is NaN, then report the matched / modified counts.
unset_report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
for i in key_list_commit:
    field_path = str(i)
    update = gitData.update_many(
        {field_path: np.nan},
        {"$unset": {field_path: ""}},
    )
    print(unset_report.format(i, update.matched_count, update.modified_count), flush=True)

In [3]:
# Date fields: convert stored values to dates.
# Apply $toDate to State.Created_at on every gitIssues document.  The update
# is an aggregation pipeline, so "$State.Created_at" reads the current value.
pipeline = [
    {"$set": {"State.Created_at": {"$toDate": "$State.Created_at"}}},
]
update = gitIssues.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Created_at", update.matched_count, update.modified_count), flush=True)


    Key: Created_at
    Matched: 5513
    Modified: 5513
    ------------
    


In [4]:
# Apply $toDate to State.Updated_at on every gitIssues document.  The update
# is an aggregation pipeline, so "$State.Updated_at" reads the current value.
pipeline = [
    {"$set": {"State.Updated_at": {"$toDate": "$State.Updated_at"}}},
]
update = gitIssues.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Updated_at", update.matched_count, update.modified_count), flush=True)


    Key: Updated_at
    Matched: 5513
    Modified: 5513
    ------------
    


In [5]:
# Apply $toDate to State.Closed_at on every gitIssues document.  The update
# is an aggregation pipeline, so "$State.Closed_at" reads the current value.
pipeline = [
    {"$set": {"State.Closed_at": {"$toDate": "$State.Closed_at"}}},
]
update = gitIssues.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Closed_at", update.matched_count, update.modified_count), flush=True)


    Key: Closed_at
    Matched: 5513
    Modified: 5513
    ------------
    


In [6]:
# Apply $toDate to Commit.Msg.Author.Author_date on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "Commit.Msg.Author.Author_date": {"$toDate": "$Commit.Msg.Author.Author_date"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Commit.Msg.Author.Author_date", update.matched_count, update.modified_count), flush=True)


    Key: Commit.Msg.Author.Author_date
    Matched: 60996
    Modified: 60996
    ------------
    


In [7]:
# Apply $toDate to Commit.Committer.Committer_date on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "Commit.Committer.Committer_date": {"$toDate": "$Commit.Committer.Committer_date"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Commit.Committer.Committer_date", update.matched_count, update.modified_count), flush=True)


    Key: Commit.Committer.Committer_date
    Matched: 60996
    Modified: 60996
    ------------
    


In [8]:
# Apply $toDate to Comment_created_at on every IssueComment document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {"Comment_created_at": {"$toDate": "$Comment_created_at"}}},
]
update = IssueComment.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Comment_created_at", update.matched_count, update.modified_count), flush=True)


    Key: Comment_created_at
    Matched: 32329
    Modified: 32329
    ------------
    


In [9]:
# Numeric fields: convert values that were stored as text back to integers.
# Apply $toInt to N_comments on every gitIssues document (aggregation-pipeline
# update; "$N_comments" reads the current value).
pipeline = [
    {"$set": {"N_comments": {"$toInt": "$N_comments"}}},
]
update = gitIssues.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("N_comments", update.matched_count, update.modified_count), flush=True)


    Key: N_comments
    Matched: 5513
    Modified: 5513
    ------------
    


In [10]:
# change the data type of "Author_timezone" in gitData into integer
update = IssueComment.update_many(
    {},
    [{"$set": {
        "N_comments": {
            "$toInt":"$N_comments" } 
        }
    }
]
)

print("""
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """.format("Author_timezone", update.matched_count, update.modified_count), flush=True)


    Key: Author_timezone
    Matched: 32329
    Modified: 32329
    ------------
    


In [11]:
# Apply $toInt to Commit.Msg.Author.Author_timezone on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "Commit.Msg.Author.Author_timezone": {"$toInt": "$Commit.Msg.Author.Author_timezone"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Author_timezone", update.matched_count, update.modified_count), flush=True)


    Key: Author_timezone
    Matched: 60996
    Modified: 60996
    ------------
    


In [12]:
# Apply $toInt to Commit.Committer.Committer_timezone on every gitData
# document (aggregation-pipeline update; the "$..." string reads the current
# value).
pipeline = [
    {"$set": {
        "Commit.Committer.Committer_timezone": {"$toInt": "$Commit.Committer.Committer_timezone"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Commit.Committer.Committer_timezone", update.matched_count, update.modified_count), flush=True)


    Key: Commit.Committer.Committer_timezone
    Matched: 60996
    Modified: 60996
    ------------
    


In [13]:
# Apply $toInt to File.Commit_change.Deletions on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "File.Commit_change.Deletions": {"$toInt": "$File.Commit_change.Deletions"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Deletions", update.matched_count, update.modified_count), flush=True)


    Key: Deletions
    Matched: 60996
    Modified: 60996
    ------------
    


In [14]:
# Apply $toInt to File.Commit_change.Insertions on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "File.Commit_change.Insertions": {"$toInt": "$File.Commit_change.Insertions"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Insertions", update.matched_count, update.modified_count), flush=True)


    Key: Insertions
    Matched: 60996
    Modified: 60996
    ------------
    


In [15]:
# Apply $toInt to File.Commit_change.Files on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "File.Commit_change.Files": {"$toInt": "$File.Commit_change.Files"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Files", update.matched_count, update.modified_count), flush=True)


    Key: Files
    Matched: 60996
    Modified: 60996
    ------------
    


In [16]:
# Apply $toInt to File.Commit_change.Lines on every gitData document
# (aggregation-pipeline update; the "$..." string reads the current value).
pipeline = [
    {"$set": {
        "File.Commit_change.Lines": {"$toInt": "$File.Commit_change.Lines"},
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Lines", update.matched_count, update.modified_count), flush=True)


    Key: Lines
    Matched: 60996
    Modified: 60996
    ------------
    


In [17]:
# Apply $toInt to Code_change.Diff.Diff_parse.Deleted_lines on every gitData
# document (aggregation-pipeline update; the "$..." string reads the current
# value).
pipeline = [
    {"$set": {
        "Code_change.Diff.Diff_parse.Deleted_lines": {
            "$toInt": "$Code_change.Diff.Diff_parse.Deleted_lines",
        },
    }},
]
update = gitData.update_many({}, pipeline)

report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
print(report.format("Deleted Lines", update.matched_count, update.modified_count), flush=True)


    Key: Deleted Lines
    Matched: 60996
    Modified: 60996
    ------------
    
