In [1]:
# libraries
import pandas as pd
import numpy as np
from pymongo import MongoClient

In [2]:
#read data in csv format
git = pd.read_csv('gitData.csv')
issues = pd.read_csv('gitIssues.csv')

In [3]:
# check the shape of the data
print(git.shape)
print(issues.shape)

(60996, 30)
(32329, 22)


In [4]:
#data inspection
print(git.info())
print(issues.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60996 entries, 0 to 60995
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          60996 non-null  int64  
 1   hash                60996 non-null  object 
 2   msg                 60996 non-null  object 
 3   author_name         60996 non-null  object 
 4   committer_name      60996 non-null  object 
 5   author_date         60996 non-null  object 
 6   author_timezone     60996 non-null  int64  
 7   committer_date      60996 non-null  object 
 8   committer_timezone  60996 non-null  int64  
 9   branches            60996 non-null  object 
 10  in_main_branch      60996 non-null  bool   
 11  merge               60996 non-null  bool   
 12  parents             60996 non-null  object 
 13  project_name        60996 non-null  object 
 14  deletions           60996 non-null  int64  
 15  insertions          60996 non-null  int64  
 16  line

In [5]:
# let's connect to the localhost
client = MongoClient()

# let's create a database 
db = client.github

# collection for git issues
gitIssues = db.gitIssues

# print connection
print("""
Database
==========
{}

Collection
==========
{}
""".format(db, gitIssues), flush=True
)


Database
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github')

Collection
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github'), 'gitIssues')



In [7]:
%%time 
##
# slow loading of data
d = {}
# pass data 
for i in issues.index:
    d = {
        "Questions": {
            "Title": issues.loc[i,"title"],
            "Body": issues.loc[i,"body"]
        },
        "User" : {
            "Username" : issues.loc[i, "user"],
            "User_ID" :  issues.loc[i,"user_id"]
        },
        "State" : {
            "State" : issues.loc[i, "state"],
            "Created_at" :  issues.loc[i,"created_at"],
            "Updated_at" :  issues.loc[i,"updated_at"],
            "Closed_at" : issues.loc[i, "closed_at"]
        },
        "Assignees" : issues.loc[i, "assignees"],
        "Closed_by" : issues.loc[i, "closed_by"],
        "Labels" : issues.loc[i, "labels"],
        "Reactions" : issues.loc[i, "reactions"],
        "N_comments" : issues.loc[i, "n_comments"].astype(str),
        "Projects" : issues.loc[i, "project"]
    }
    gitIssues.insert_one(d)

CPU times: user 11.5 s, sys: 796 ms, total: 12.3 s
Wall time: 16.9 s


In [8]:
# new collection for issue comment
IssueComment = db.IssueComment

# print connection
print("""
Database
==========
{}

Collection
==========
{}
""".format(db, IssueComment), flush=True
)


Database
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github')

Collection
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github'), 'IssueComment')



In [9]:
%%time 
##
# slow loading of data
d = {}
# pass data 
for i in issues.index:
    d = {
        "Title": issues.loc[i,"title"],
        "Comment_created_at": issues.loc[i,"comment_created_at"],
        "Labels" : issues.loc[i, "labels"],
        "Reactions" : issues.loc[i, "reactions"],
        "N_comments" : issues.loc[i, "n_comments"].astype(str),
        "Projects" : issues.loc[i, "project"]
    }
    IssueComment.insert_one(d)

CPU times: user 8.84 s, sys: 620 ms, total: 9.46 s
Wall time: 13 s


In [10]:
# new collection for git data
gitData = db.gitData

# print connection
print("""
Database
==========
{}

Collection
==========
{}
""".format(db, gitData), flush=True
)


Database
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github')

Collection
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'github'), 'gitData')



In [11]:
# import the git data to mongo compass
#%%time 
##
# slow loading of data
d = {}
#pass data
for i in git.index:
    d = {
        "Commit": {
            "hash":git.loc[i, "hash"],
            "Msg":{
                "msg":git.loc[i, 'msg'],
                "Author":{
                    "Author_name": git.loc[i, 'author_name'],
                    "Author_date": git.loc[i, 'author_date'],
                    "Author_timezone": git.loc[i, 'author_timezone'].astype(str)
                }
            },
            "Committer":{
                "Committer_name": git.loc[i, 'committer_name'],
                "Committer_date": git.loc[i, 'committer_date'],
                "Committer_timezone": git.loc[i, 'committer_timezone'].astype(str)
            }
        },
        "Branch":{
            "Branches": git.loc[i, 'branches'],
            "In_main_branch": git.loc[i, 'in_main_branch'].astype(str),
            "Merge": git.loc[i, 'merge'].astype(str),
            "Parents":git.loc[i, 'parents']
        },
        "Project_name": git.loc[i, 'project_name'],
        "File":{
            "Filename": git.loc[i, 'filename'],
            "Change_type": git.loc[i, 'change_type'],
            "Commit_change":{
                "Deletions": git.loc[i, 'deletions'].astype(str),
                "Insertions": git.loc[i, 'insertions'].astype(str),
                "Files": git.loc[i, 'files'].astype(str),
                "Lines":git.loc[i, 'lines'].astype(str)
            }
        },
        "Code_change":{
            "Path":{
                "old_path":git.loc[i, 'old_path'],
                "new_path": git.loc[i, 'new_path']
            },
            "Diff":{
                "Diff": git.loc[i, 'diff'],
                "Diff_parse":{
                    "Diff_parsed":git.loc[i, 'diff_parsed'],
                    "Deleted_lines":git.loc[i, 'deleted_lines'].astype(str)
                }
            },
            "Source_code":{
                "Source_code":git.loc[i, 'source_code'],
                "Source_code_before":git.loc[i, 'source_code_before']
            }
        },
        "Nloc":git.loc[i, 'nloc'],
        "Complexity":git.loc[i, 'complexity'],
        "Token_count":git.loc[i, 'token_count']
        
    }
    gitData.insert_one(d)