# Create points table and author list

In [1]:
from __future__ import annotations

import os
import sys
from pathlib import Path

project_path = Path(os.getcwd()) / ".."

sys.path.append(str(project_path / "docs" / "mmteb"))

In [2]:
from create_points_table import load_data

## Point table

In [3]:
df = load_data()
df = df.groupby("GitHub").sum().astype(int)
# create a new column with the sum of the points
df["Total"] = df.sum(axis=1)
df = df.sort_values("Total", ascending=False)
# total as first column
df = df[["Total"] + [col for col in df.columns if col != "Total"]]

In [4]:
df

Unnamed: 0_level_0,Total,Bug fixes,Review PR,New dataset,Dataset annotations,Paper writing,New task,Coordination,Running Models
GitHub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
KennethEnevoldsen,593,85,324,68,35,0,0,81,0
isaac-chung,433,50,194,120,1,12,2,54,0
imenelydiaker,358,24,144,120,0,0,0,70,0
awinml,302,0,2,300,0,0,0,0,0
x-tabdeveloping,239,10,32,144,0,0,12,41,0
...,...,...,...,...,...,...,...,...,...
antoniolanza1996,2,2,0,0,0,0,0,0,0
cslizc,2,0,0,2,0,0,0,0,0
hanhainebula,2,0,0,2,0,0,0,0,0
hongjin-su,2,0,2,0,0,0,0,0,0


In [5]:
print(
    df.to_latex(
        longtable=True,
        caption="Contributions by GitHub users. See \autoref{tab:authors} for the mapping between authors and GitHub handles.",
        label="tab:contributions",
    )
)

\begin{longtable}{lrrrrrrrrr}
\caption{Contributions by GitHub users. See utoref{tab:authors} for the mapping between authors and GitHub handles.} \label{tab:contributions} \\
\toprule
 & Total & Bug fixes & Review PR & New dataset & Dataset annotations & Paper writing & New task & Coordination & Running Models \\
GitHub &  &  &  &  &  &  &  &  &  \\
\midrule
\endfirsthead
\caption[]{Contributions by GitHub users. See utoref{tab:authors} for the mapping between authors and GitHub handles.} \\
\toprule
 & Total & Bug fixes & Review PR & New dataset & Dataset annotations & Paper writing & New task & Coordination & Running Models \\
GitHub &  &  &  &  &  &  &  &  &  \\
\midrule
\endhead
\midrule
\multicolumn{10}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
KennethEnevoldsen & 593 & 85 & 324 & 68 & 35 & 0 & 0 & 81 & 0 \\
isaac-chung & 433 & 50 & 194 & 120 & 1 & 12 & 2 & 54 & 0 \\
imenelydiaker & 358 & 24 & 144 & 120 & 0 & 0 & 0 & 70 & 0 \\
awinml & 302 & 0 & 2

# Contributor affiliations

In [6]:
points_to_authors = project_path / "docs" / "mmteb" / "points.md"

# extract table from markdown file
with open(points_to_authors) as f:
    lines = f.readlines()

table = False
table_lines = []
colnames = []
head_skipped = False
for line in lines:
    if not table and line.startswith("|"):
        table = True
        colnames = [c.strip() for c in line.strip().split("|")[1:-1]]
        continue
    if colnames and table and not head_skipped:
        head_skipped = True
        continue
    if table:
        table_lines.append([c.strip() for c in line.strip().split("|")[1:-1]])
    if table and line.strip() == "":
        break

In [7]:
# create a dataframe from the table
import pandas as pd

author_df = pd.DataFrame(table_lines, columns=colnames)

In [8]:
author_df

Unnamed: 0,GitHub,First name,Last name,Email,User on openreview,Affiliations
0,KennethEnevoldsen,Kenneth,Enevoldsen,kennethcenevoldsen@gmail.com,~Kenneth_Enevoldsen1,Aarhus University
1,x-tabdeveloping,Márton,Kardos,martonkardos@cas.au.dk,~Márton_Kardos1,Aarhus University
2,imenelydiaker,Imene,Kerboua,,~Imene_Kerboua1,"INSA Lyon, LIRIS"
3,wissam-sib,Wissam,Siblini,wissam.siblini92@gmail.com,~Wissam_Siblini1,Individual Contributor
4,GabrielSequeira,Gabriel,Sequeira,,,Individual Contributor
...,...,...,...,...,...,...
80,john-b-yang,John,Yang,johnby@stanford.edu,~John_Yang3,Stanford University
81,thakur-nandan,Nandan,Thakur,,~Nandan_Thakur1,University of Waterloo
82,loicmagne,Loic,Magne,~Loïc_Magne1,Individual Contributor,
83,sarahooker,Sara,Hooker,,~Sara_Hooker2,Cohere For AI


In [9]:
print(
    author_df[["GitHub", "First name", "Last name", "Affiliations"]].to_latex(
        index=False
    )
)

\begin{tabular}{llll}
\toprule
GitHub & First name & Last name & Affiliations \\
\midrule
KennethEnevoldsen & Kenneth & Enevoldsen & Aarhus University \\
x-tabdeveloping & Márton & Kardos & Aarhus University \\
imenelydiaker & Imene & Kerboua & INSA Lyon, LIRIS \\
wissam-sib & Wissam & Siblini & Individual Contributor \\
GabrielSequeira & Gabriel & Sequeira & Individual Contributor \\
schmarion & Marion & Schaeffer & Wikit \\
MathieuCiancone & Mathieu & Ciancone & Wikit \\
MartinBernstorff & Martin & Bernstorff & Aarhus University \\
staoxiao & Shitao & Xiao & Beijing Academy of Artificial Intelligence \\
ZhengLiu101 & Zheng & Liu & Beijing Academy of Artificial Intelligence \\
achibb & Aaron & Chibb & Individual Contributor \\
cassanof & Federico & Cassano & Northeastern University && Cursor AI \\
taidnguyen & Nguyen & Tai & University of Pennsylvania \\
xu3kev & Wen-Ding & Li & Cornell University \\
Rysias & Jonathan & Rystrøm & University of Oxford \\
taeminlee & Taemin & Lee & Kore

# Author list

In [10]:
github = set(author_df["GitHub"])

not_10 = []

df = df.reset_index()
# check if all github users are in the points table and has 10 total point
for gh in github:
    if gh not in set(df["GitHub"]):
        print(f"{gh} not in points table")

    if df[df["GitHub"] == gh]["Total"].values[0] < 10:
        print(f"{gh} has less than 10 points")
        not_10.append(gh)

izhx has less than 10 points
achibb has less than 10 points


In [11]:
gh

'guangyusong'

In [12]:
missing_users = [user for user in df[df["Total"] >= 10]["GitHub"] if user not in github]
print(missing_users)

['Sakshamrzt']


In [13]:
# sort author_df by total points
author_df = pd.merge(author_df, df[["GitHub", "Total"]], on="GitHub", how="left")
author_df = author_df.sort_values("Total", ascending=False)

In [14]:
# create a latex author list
# \textbf{First Last \textsuperscript{1}},
# \textbf{First Last \textsuperscript{1}},
# [if too long add \\]
# ...
# \\
# \\
# \textsuperscript{1}Aarhus University, Denmark,
# ...
# [if too long add \\]

In [15]:
author_list = []
affiations = {}

aff_id = 1
for i, row in author_df.iterrows():
    author = row["First name"] + " " + row["Last name"]
    if row["GitHub"] in not_10:
        continue
    author_str = f"\\textbf{{{author}"

    if row["Affiliations"]:
        affiliations = row["Affiliations"].split("&&")

        aff_string = ""
        for aff in affiliations:
            aff = aff.strip()
            if "N/A" in aff:
                continue
            if aff not in affiations:
                affiations[aff] = aff_id
                aff_id += 1
            aff_string += f"{affiations[aff]},"

        # remove last comma
        aff_string = aff_string[:-1]

        if aff_string:
            author_str += f"\\textsuperscript{{{aff_string}}}"
        else:
            author_str += ""

    # if row["Affiliations"] not in affiations and row["Affiliations"]:
    #     affiations[row["Affiliations"]] = aff_id
    #     aff_id += 1
    #     author_str += f"\\textsuperscript{{{affiations[row['Affiliations']]}}}"
    author_str += "}"
    author_list.append(author_str)

In [16]:
# Move last author to the end
last_author1 = "Niklas Muennighoff"
last_author_ = [a for a in author_list if last_author1 in a][0]
last_author2 = "Siva"
last_author__ = [a for a in author_list if last_author2 in a][0]
# remove from author list
author_list = [
    a for a in author_list if last_author1 not in a and last_author2 not in a
]

author_list.append(last_author__)
author_list.append(last_author_)

In [17]:
# create the latex string

latex = ""
line_length = 0
max_line_length = 85

for i, author in enumerate(author_list):
    _line_length = len(author.split("\\textsuperscript")[0])
    if line_length + _line_length > max_line_length:
        latex += "\\\\\n"
        line_length = 0
    latex += author + ", \n"
    line_length += _line_length

# add the affiliations
line_length = 0
latex += "\\\\\n"
latex += "\\\\\n"
for aff, id in affiations.items():
    if "N/A" in aff:
        continue
    _line_length = len(aff)
    if line_length + _line_length > max_line_length:
        latex += "\\\\\n"
        line_length = 0
    latex += "\\textsuperscript{" + str(id) + "}" + aff + ", \n"
    line_length += _line_length

In [18]:
print(latex)

\textbf{Kenneth Enevoldsen\textsuperscript{1}}, 
\textbf{Isaac Chung\textsuperscript{2}}, 
\textbf{Imene Kerboua\textsuperscript{3}}, 
\\
\textbf{Ashwin Mathur\textsuperscript{2}}, 
\textbf{Márton Kardos\textsuperscript{1}}, 
\textbf{David Stap\textsuperscript{4}}, 
\textbf{Jay Gala\textsuperscript{5}}, 
\\
\textbf{Wissam Siblini\textsuperscript{2}}, 
\textbf{Dominik Krzemiński\textsuperscript{8}}, 
\textbf{Genta Indra Winata\textsuperscript{2}}, 
\\
\textbf{Saba Sturua\textsuperscript{9}}, 
\textbf{Saiteja Utpala\textsuperscript{10}}, 
\textbf{Orion Weller\textsuperscript{11}}, 
\textbf{Mathieu Ciancone\textsuperscript{12}}, 
\\
\textbf{Marion Schaeffer\textsuperscript{12}}, 
\textbf{Gabriel Sequeira\textsuperscript{2}}, 
\textbf{Diganta Misra\textsuperscript{13,14}}, 
\\
\textbf{Vaibhav Adlakha\textsuperscript{15,16}}, 
\textbf{Shreeya Dhakal\textsuperscript{2}}, 
\textbf{Jonathan Rystrøm\textsuperscript{17}}, 
\\
\textbf{Roman Solomatin\textsuperscript{18}}, 
\textbf{Chenghao Xiao\t

In [19]:
# authors with 10 points or more

{g for g in github if g not in not_10}

# to a string @author

for g in github:
    if g in not_10:
        continue
    print(f"@{g} ")

@orionw 
@ZhengLiu101 
@staoxiao 
@xiamengzhou 
@xu3kev 
@swj0419 
@Art3mis07 
@sted97 
@vaibhavad 
@isaac-chung 
@taeminlee 
@Samoed 
@mrshu 
@Muennighoff 
@kwojtasi 
@jankounchained 
@imenelydiaker 
@AlexeyVatolin 
@mariyahendriksen 
@KennethEnevoldsen 
@akshita-sukhlecha 
@awinml 
@jaygala24 
@digantamisra98 
@gentaiscool 
@Rysias 
@MartinBernstorff 
@jupyterjazz 
@davidstap 
@Alenush 
@MathieuCiancone 
@xhluca 
@rafalposwiata 
@x-tabdeveloping 
@ab1992ao 
@artemsnegirev 
@jphme 
@slvnwhrl 
@hgissbkh 
@HLasse 
@Ruqyai 
@bp-high 
@ljvmiranda921 
@violenil 
@malteos 
@rasdani 
@asparius 
@simon-clematide 
@dokato 
@mmhamdy 
@john-b-yang 
@henilp105 
@dwzhu-pku 
@tomaarsen 
@sarahooker 
@manandey 
@ManuelFay 
@sivareddyg 
@thakur-nandan 
@Akash190104 
@shreeya-dhakal 
@PranjalChitale 
@schmarion 
@ShawonAshraf 
@loicmagne 
@KranthiGV 
@gowitheflow-1998 
@dipam7 
@rbroc 
@ABorghini 
@jordiclive 
@Andrian0s 
@bjoernpl 
@taidnguyen 
@MariyaTikhonova 
@wissam-sib 
@cassanof 
@SaitejaUtpala

In [20]:
# get openreview ids for author_df

# filter out authors with less than 10 points
tt = author_df[author_df["GitHub"].isin({g for g in github if g not in not_10})]

t = tt[["First name", "Last name", "User on openreview"]]

for row in t.iterrows():
    print(row[1]["First name"], row[1]["Last name"], row[1]["User on openreview"])

Kenneth Enevoldsen ~Kenneth_Enevoldsen1
Isaac Chung ~Isaac_Kwan_Yin_Chung1
Imene Kerboua ~Imene_Kerboua1
Ashwin Mathur ~Ashwin_Mathur1
Márton Kardos ~Márton_Kardos1
David Stap ~David_Stap
Jay Gala ~Jay_Gala1
Wissam Siblini ~Wissam_Siblini1
Niklas Muennighoff ~Niklas_Muennighoff1
Dominik Krzemiński ~Dominik_Krzemiński1
Genta Indra Winata ~Genta_Indra_Winata1
Saba Sturua ~Saba_Sturua1
Saiteja Utpala ~Saiteja_Utpala1
Orion Weller ~Orion_Weller1
Mathieu Ciancone ~Mathieu_Ciancone1
Marion Schaeffer ~Marion_Schaeffer1
Gabriel Sequeira 
Diganta Misra ~Diganta_Misra1
Vaibhav Adlakha ~Vaibhav_Adlakha1
Shreeya Dhakal 
Jonathan Rystrøm ~Jonathan_Rystrøm1
Roman Solomatin ~Roman_Solomatin1
Siva Reddy ~Siva_Reddy1
Chenghao Xiao ~Chenghao_Xiao1
Ömer Çağatan ~Ömer_Veysel_Çağatan1
Akash Kundu ~Akash_Kundu2
Martin Bernstorff ~Martin_Bernstorff1
Shitao Xiao ~Shitao_Xiao1
Akshita Sukhlecha ~Akshita_Sukhlecha1
Bhavish Pahwa ~Bhavish_Pahwa1
Rafał Poświata ~Rafał_Poświata1
Kranthi Kiran GV ~Kranthi_Kiran_GV1

In [21]:
row[1]

First name                      Federico
Last name                        Cassano
User on openreview    ~Federico_Cassano1
Name: 11, dtype: object