In [1]:
import os
from hashlib import sha256
import pandas as pd

In [2]:
repo_folder = "book/website"

print(os.listdir(os.getcwd() + "/" + repo_folder))

['community-handbook', 'collaboration', 'LICENSE.md', 'requirements.txt', 'ethical-research', 'runtime.txt', '_bibliography', 'welcome.md', '_static', '_redirects', '_toc.yml', 'project-design', 'figures', '_config.yml', 'scripts', 'communication', 'afterword', 'reproducible-research', 'analytics']


In [3]:
# https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python
# https://github.com/earnestt1234/seedir

import seedir as sd
sd.seedir(repo_folder, style='emoji')

📁 website/
├─📁 community-handbook/
│ ├─📁 coworking/
│ │ ├─📄 coworking-organisation.md
│ │ ├─📄 coworking-motivation.md
│ │ ├─📄 coworking-weekly.md
│ │ └─📄 coworking-collabcafe.md
│ ├─📄 acknowledgement.md
│ ├─📄 coworking.md
│ ├─📄 templates.md
│ ├─📁 newsletters/
│ │ ├─📄 newsletters-style.md
│ │ ├─📄 newsletters-template.md
│ │ └─📄 newsletters-process.md
│ ├─📁 contributing/
│ │ ├─📄 contributing-templates.md
│ │ └─📄 contributing-workflow.md
│ ├─📄 bookdash.md
│ ├─📄 coc.md
│ ├─📁 style/
│ │ ├─📄 style-figures.md
│ │ ├─📄 style-crossref.md
│ │ ├─📄 style-more-styling.md
│ │ ├─📄 style-custom-styling.md
│ │ └─📄 style-citing.md
│ ├─📄 community-handbook.md
│ ├─📁 consistency/
│ │ ├─📄 consistency-structure.md
│ │ ├─📄 consistency-formatting.md
│ │ └─📄 consistency-language.md
│ ├─📄 style.md
│ ├─📄 contributing.md
│ ├─📁 templates/
│ │ ├─📄 template-bookdash-github.md
│ │ ├─📄 template-coworking-collabcafe.md
│ │ ├─📄 template-fireside-chat.md
│ │ ├─📄 template-newsletter-draft.md
│ │ ├─📄 template-bookdash-feedba

In [4]:
%%time
file_type = ".md"
license = "CC-BY"
git_url_base = "https://github.com/alan-turing-institute/the-turing-way/blob/main/"
web_url_base = "https://the-turing-way.netlify.app/"

file_list, file_path_list, guidebook_list, git_url_list, web_url_list = [], [], [], [], []

for path, dirs, files in os.walk(repo_folder):
    for f in files:
        if f.endswith(file_type) and not(path.endswith("website")):
            file_list.append(f)
            file_path = path + "/" + f
            file_path_list.append(file_path)
            guidebook_list.append(path.split("/")[2])
            git_url_list.append(git_url_base + file_path.replace("the-turing-way/", ""))
            web_url_list.append(web_url_base + file_path.split('book/website/')[-1].replace('.md', '.html'))

CPU times: user 3.23 ms, sys: 5.43 ms, total: 8.66 ms
Wall time: 8.64 ms


In [5]:
pd.set_option('max_colwidth', -1)

df = pd.DataFrame({'file': file_list,
                   'file_path': file_path_list,
                   'guidebook': guidebook_list,
                   'git_url': git_url_list,
                   'web_url': web_url_list 
                  })

df['file_id'] = df['file'].apply(lambda x: sha256(x.encode('utf-8')).hexdigest())
df['file_type'] = file_type
df['license'] = license

df.head()

  pd.set_option('max_colwidth', -1)


Unnamed: 0,file,file_path,guidebook,git_url,web_url,file_id,file_type,license
0,acknowledgement.md,book/website/community-handbook/acknowledgement.md,community-handbook,https://github.com/alan-turing-institute/the-turing-way/blob/main/book/website/community-handbook/acknowledgement.md,https://the-turing-way.netlify.app/community-handbook/acknowledgement.html,aa9f86989c9ad3187ce5c585b0b458f640b4e09a1e4f04abbe32ff1bcb69b525,.md,CC-BY
1,coworking.md,book/website/community-handbook/coworking.md,community-handbook,https://github.com/alan-turing-institute/the-turing-way/blob/main/book/website/community-handbook/coworking.md,https://the-turing-way.netlify.app/community-handbook/coworking.html,2b460c305634fdb6e37a334c955f5f90428936623ae0ac2171c3bf0653c6b85f,.md,CC-BY
2,templates.md,book/website/community-handbook/templates.md,community-handbook,https://github.com/alan-turing-institute/the-turing-way/blob/main/book/website/community-handbook/templates.md,https://the-turing-way.netlify.app/community-handbook/templates.html,1be6c5b917888c1ac8a84de28ee5381f9531a71e87ae3bb19f64c2101ba3f31c,.md,CC-BY
3,bookdash.md,book/website/community-handbook/bookdash.md,community-handbook,https://github.com/alan-turing-institute/the-turing-way/blob/main/book/website/community-handbook/bookdash.md,https://the-turing-way.netlify.app/community-handbook/bookdash.html,8b40cf2f250a750b743a4935dcd9ce5e885c3b145e39a644eb897b03ea387188,.md,CC-BY
4,coc.md,book/website/community-handbook/coc.md,community-handbook,https://github.com/alan-turing-institute/the-turing-way/blob/main/book/website/community-handbook/coc.md,https://the-turing-way.netlify.app/community-handbook/coc.html,9cc84f91bc433e58c6453082eb60c0920c667eb16a32f153071fd912d351b2a8,.md,CC-BY


In [6]:
# TODO: programmatically add contributor info to dataframe

# Git Logs

In [7]:
! git log --pretty=format:'"%h","%an","%ae","%aD","%s"' book/website/community-handbook/community-handbook.md > output.txt
    

In [8]:
column_names = ["commit", "name", "email", "timestamp", "description"]
df_user = pd.read_csv("output.txt", header=None, names=column_names)
df_user['changes'] = df_user['timestamp'].apply(lambda x: x.split(" ")[-1])
df_user['timestamp'] = df_user['timestamp'].apply(lambda x: x[:-6])

In [9]:
# df_user['email'].unique().tolist()

In [10]:
# df_user.groupby('name')['email'].apply(lambda x: x.unique().tolist()).to_dict()