# md2json

In [1]:
import re
import copy
import json
import os

In [2]:
def repo_process(repo):
    """Parse one repository entry line of the trending markdown into a dict.

    The patterns below expect a line shaped like
    ``[name](url) (**N stars today**): description``.

    Args:
        repo: A single line of markdown text (with or without a trailing
            newline).

    Returns:
        A dict with the keys ``"name"``, ``"url"``, ``"star"`` and
        ``"description"``. ``"star"`` is an int, or ``None`` when the line
        has no ``(**...**)`` marker or the marker holds no leading number.
        ``"description"`` is an empty string when no ``**)`` marker exists.

    Raises:
        ValueError: If the line contains no ``[name]`` or no ``(url)`` part.
    """
    name_match = re.search(r'\[(.*?)\]', repo)
    url_match = re.search(r'\((.*?)\)', repo)
    if name_match is None or url_match is None:
        raise ValueError("not a repository entry line: %r" % (repo,))

    res = {}
    res["name"] = name_match.group(1)
    res["url"] = url_match.group(1)

    # The star marker may be missing or empty ("(****)"); both mean "unknown".
    star = None
    star_match = re.search(r'\(\*\*(.*?)\*\*\)', repo)
    if star_match is not None:
        # Read only the leading number so that both "1 star today" and
        # "1,234 stars today" are accepted.
        count_match = re.match(r'\s*(\d[\d,]*)', star_match.group(1))
        if count_match is not None:
            star = int(count_match.group(1).replace(",", ""))
    res["star"] = star

    # Everything after "**)" up to the end of the line, minus the two
    # separator characters (": ") that follow the marker.
    desc_match = re.search(r'\*\*\)([^\n]*)', repo)
    res["description"] = desc_match.group(1)[2:] if desc_match else ""
    return res

In [3]:
def saveJson(md_path, json_path):
    """Convert one daily markdown file into a JSON file.

    The date is taken from the markdown file name (without directory and
    extension); characters 5-6 of it are stored as the month and the last
    two characters as the day. Every ``## <heading>`` line except
    ``## WordCloud`` starts a section whose entries are parsed with
    ``repo_process`` and stored under the heading text.

    Args:
        md_path: Path of the markdown file to read. The file name is
            presumably of the form ``YYYY-MM-DD.md`` (verify against the
            data directory).
        json_path: Path of the JSON file to write.

    Raises:
        ValueError: If the month or day part of the file name is not numeric.
    """
    # Derive the date from the file name itself instead of assuming a fixed
    # 8-character directory prefix and 3-character extension.
    date = os.path.splitext(os.path.basename(md_path))[0]
    res = {}
    res["date"] = date
    res["month"] = int(date[5:7])
    res["day"] = int(date[-2:])

    with open(md_path, encoding="utf8") as f:
        content = f.readlines()

    for index, line in enumerate(content):
        if not line.startswith("## "):
            continue
        lan = line[3:].rstrip("\n")
        if lan == "WordCloud":
            continue
        repos = []
        # A section contributes at most the 10 lines that follow its heading.
        for entry in content[index + 1:index + 11]:
            if entry.startswith("## "):
                # Section was shorter than 10 lines; do not read into the next one.
                break
            if not entry.strip():
                continue
            repos.append(repo_process(entry))
        res[lan] = repos

    with open(json_path, 'w', encoding='utf-8') as fw:
        json.dump(res, fw, ensure_ascii=False, indent=4)

In [4]:
# Convert markdown files from data-md into JSON files under
# data-json/<sub>/, where <sub> is characters 5-6 of the path after its
# first 8 characters and last 3 characters are removed.
md_dir = "data-md"
json_dir = "data-json"
md_list = os.listdir(md_dir)
# NOTE(review): the slice step of 3 handles only every third entry of
# os.listdir, whose order is arbitrary -- confirm this sampling is intended.
for md_name in md_list[::3]:
    md_path = os.path.join(md_dir, md_name)
    date_part = md_path[8:-3]
    json_subdir = os.path.join(json_dir, date_part[5:7])
    if not os.path.exists(json_subdir):
        os.makedirs(json_subdir)
    json_path = os.path.join(json_subdir, md_name[:-3] + ".json")
    # Directory entries that are not regular files are skipped.
    if os.path.isfile(md_path):
        saveJson(md_path, json_path)

In [5]:
import os
# Merge the JSON files found in each month directory data-json/<MM>/ into a
# single summary file data-json/<MM>/<MM>.json. Within one language, a
# repository that appears in several files is kept once (first occurrence)
# and its integer "star" values are added up.
json_dir = "data-json"
lan_list = ["python", "java", "unknown", "javascript", "html", "dart", "go"]

# NOTE(review): range(1, 12) covers months 1-11 only; confirm that month 12
# is meant to be left out.
for num in range(1, 12):
    num_str = "%02d" % num

    month_res = {}
    month_res["month"] = num
    # Per language: repository name -> the dict stored in month_res[lan],
    # so duplicates are found without rescanning the list.
    repo_by_name = {}
    for lan in lan_list:
        month_res[lan] = []
        repo_by_name[lan] = {}

    json_subdir = os.path.join(json_dir, num_str)
    month_res_name = num_str + ".json"
    month_res_path = os.path.join(json_subdir, month_res_name)

    # Sorted so the output does not depend on os.listdir's arbitrary order.
    for json_name in sorted(os.listdir(json_subdir)):
        # The summary is written into this same directory; skip it so that
        # re-running the cell does not count a previous summary again.
        if json_name == month_res_name:
            continue
        json_path = os.path.join(json_subdir, json_name)
        if not os.path.isfile(json_path):
            continue
        with open(json_path, encoding="utf8") as f:
            content = json.load(f)
        for lan in lan_list:
            for repo in content.get(lan) or []:
                exist_repo = repo_by_name[lan].get(repo["name"])
                if exist_repo is None:
                    repo_by_name[lan][repo["name"]] = repo
                    month_res[lan].append(repo)
                elif isinstance(repo["star"], int):
                    if isinstance(exist_repo["star"], int):
                        exist_repo["star"] += repo["star"]
                    else:
                        # First occurrence had no star count; adopt this one.
                        exist_repo["star"] = repo["star"]

    with open(month_res_path, 'w', encoding='utf-8') as fw:
        json.dump(month_res, fw, ensure_ascii=False, indent=4)

# json2csv

In [2]:
import os
import json
import pandas as pd

In [3]:
# Turn each monthly summary data-json/<MM>/<MM>.json into data-csv/<MM>.csv
# with one row per repository that has a truthy "star" value.
json_dir = "data-json"
csv_dir = "data-csv"
lan_list = ["python", "java", "unknown", "javascript", "html", "dart", "go"]
csv_columns = ["Name", "Language", "Description", "URL", "Stars", "Month"]

# Make sure the output directory exists before the first to_csv call.
os.makedirs(csv_dir, exist_ok=True)

# NOTE(review): range(1, 12) covers months 1-11 only; confirm that month 12
# is meant to be left out.
for num in range(1, 12):
    num_str = "%02d" % num

    json_subdir = os.path.join(json_dir, num_str)
    month_res_path = os.path.join(json_subdir, num_str + ".json")
    with open(month_res_path, encoding='utf-8') as f:
        content = json.load(f)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for lan in lan_list:
        for repo in content.get(lan) or []:
            # Repositories with a missing (None) or zero star count are skipped.
            if repo["star"]:
                rows.append({
                    "Name": repo["name"],
                    "Language": lan,
                    "Description": repo["description"],
                    "URL": repo["url"],
                    "Stars": repo["star"],
                    "Month": content["month"],
                })

    df = pd.DataFrame(rows, columns=csv_columns)
    csv_path = os.path.join(csv_dir, num_str + ".csv")
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')