In [36]:
from os import listdir
from os.path import isfile, join
from os import environ
import _pickle as pkl
from metadata import Gender
import pandas as pd
from itertools import chain
from _name_classification.nametools import process_str
from _name_classification.classifyname import NC
from _storage.storage import FileDir

In [37]:
train_dirpath = join(environ["ARXIV_DIR"], "data.arxiv.io", "data", "cs")
train_files = []
fd = FileDir()
for y in range(1994,2019):
    file_name = join(train_dirpath, str(y) + ".txt")
    train_files.append(file_name)

In [38]:
years_data = []
for file in train_files:
    data = pd.read_csv(file, sep = "\t", header = None, 
                    names = ["id", "categories", "authors", "title", "year", "abstract"])
    years_data.append(data)
df = pd.concat(years_data)        

In [39]:
year_range = [1994,2019]
abstracts = []
for i in range(year_range[0],year_range[1]):
    year = df[df["year"] == i]
    for i in year.iterrows():
        abstracts.append(i[1].abstract)

In [40]:
fd.save_pickle(abstracts, "arxiv-abstracts")

In [41]:
from _storage.storage import FileDir
fd = FileDir()
known_f = fd.load_pickle("known_f_arxiv_n")
known_m = fd.load_pickle("known_m_arxiv_n")
known_m.remove("Lascarides, Alex")
known_f.add("Lascarides, Alex")

In [42]:
import re
def is_initials(name, sp=", "):
    
    fn = name.split(sp)
    if len(fn) < 2:
        return True
    fn = fn[1].strip()
    regex = re.compile(r"\w+\.", re.IGNORECASE)
    no_initials = regex.sub("", fn).strip()
    if len(no_initials) < 2:
        return True
    return False
        
def init_match(inits, name):
    inits = inits.replace("Dr.","")
    inits = inits.strip()
    if(len(inits) == 0):
        return 0
    name = name.strip()
    inits_split = inits.split(" ")
    name_split = name.split(" ")
    if len(inits_split) == len(name_split):
        match = True
        for idx, i in enumerate(inits_split):
            if i.strip()[0] != name_split[idx].strip()[0]:
                match = False
        if match == True:
            return 1 #confident match
    if inits[0] == name_split[0]:
        return 2 # maybe match
    inits_set = set()
    name_set = set()
    for letter in inits:
        if letter.isupper():
            inits_set.add(letter)
    for letter in name_set:
        if letter.isupper():
            name_set.add(letter)
    if inits_set == name_set:
        return 1 # confident match
    for letter in inits_set:
        if letter in name_set:
            return 3 # maybee
    return 0 #no match

from collections import defaultdict
male_last_names = defaultdict(list)
for name in known_m:
    if len(name.split(", ")) < 2:
        continue
    last, first = name.split(", ")
    male_last_names[last].append(first)
    
female_last_names = defaultdict(list)
for name in known_f:
    if len(name.split(", ")) < 2:
        continue
    last, first = name.split(", ")
    female_last_names[last].append(first)

def best_match(name):
    if len(name.split(", ")) < 2:
        return (None, None)
    last, first = name.split(", ")
    m = last in male_last_names
    f = last in female_last_names
    if m and not f:
        names = male_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if mtch != 0:
                return (last + ", " + n, Gender.male)
       
    if f and not m:
        names = female_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if mtch != 0:
                return (last + ", " + n, Gender.male)
    if f and m:
        
        match_f = 0
        match_m = 0
        names = male_last_names[last]
        best_match = (None, None)
        for n in names:
            mtch = init_match(first, n)
            if(mtch > match_m):
                match_m = mtch
                best_match = (last + ", " + n, Gender.male)
        names = female_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if(mtch > match_f):
                match_f = mtch
                if mtch > match_m:
                    best_match = (last + ", " + n, Gender.female)
        if match_f > match_m:
            return best_match
        if match_m > match_f:
            return best_match
    return (None, None)
    

In [14]:
print(best_match("Reyle, U."))

('Reyle, Uwe', <Gender.male: 0>)


In [15]:
df.set_index("id")

Unnamed: 0_level_0,categories,authors,title,year,abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cmp-lg/9404001,cmp-lg cs.CL,"Schabes, Yves; Shieber, Stuart M.",An Alternative Conception of Tree-Adjoining De...,1994,The precise formulation of derivation for tree...
cmp-lg/9404002,cmp-lg cs.CL,"Shieber, Stuart M.",Lessons from a Restricted Turing Test,1994,We report on the recent Loebner prize competit...
cmp-lg/9404003,cmp-lg cs.CL,"Shieber, Stuart M.",Restricting the Weak-Generative Capacity of Sy...,1994,The formalism of synchronous tree-adjoining gr...
cmp-lg/9404004,cmp-lg cs.CL,"Covington, Michael A.",An Empirically Motivated Reinterpretation of D...,1994,Dependency grammar is usually interpreted as e...
cmp-lg/9404005,cmp-lg cs.CL,"Johnson, Mark",Memoization in Constraint Logic Programming,1994,This paper shows how to apply memoization ( ca...
cmp-lg/9404006,cmp-lg cs.CL,"CHANDLER-BURNS, R. M.",SPANISH 1992 ( S92 ) : corpus-based analysis o...,1994,S92 research was begun in 1987 to analyze word...
cmp-lg/9404007,cmp-lg cs.CL,"Bouma, Gosse; van Noord, Gertjan",Constraint-Based Categorial Grammar,1994,We propose a generalization of Categorial Gram...
cmp-lg/9404008,cmp-lg cs.CL,"Shieber, Stuart M.; Schabes, Yves; Pereira, Fe...",Principles and Implementation of Deductive Par...,1994,We present a system for generating parsers bas...
cmp-lg/9404009,cmp-lg cs.CL,"Dalrymple, Mary; Lamping, John; Pereira, Ferna...",A Deductive Account of Quantification in LFG,1994,The relationship between Lexical-Functional Gr...
cmp-lg/9404010,cmp-lg cs.CL,"Dalrymple, Mary; Lamping, John; Pereira, Ferna...",Intensional Verbs Without Type-Raising or Lexi...,1994,We present an analysis of the semantic interpr...


In [250]:
#gender_df.set_index("id")
gender_df

Unnamed: 0,id,authors,genders,title,year


In [286]:
all_authors = df["id"].values
all_authors

array(['cmp-lg/9404001', 'cmp-lg/9404002', 'cmp-lg/9404003', ...,
       1801.04726, 1801.0481300000001, 1801.0487100000003], dtype=object)

In [43]:
c = ["id", "authors", "genders", "title", "year"]
gender_df = pd.DataFrame(columns=c)
k = 0
for row in df.iterrows():
    new_entry = dict()
    for col in ["id", "authors", "title", "year", "abstract"]:
        new_entry[col] = row[1][col]
    authors = row[1]["authors"].split("; ")
    genders = []
    correct_authors = []
    for a in authors:
        if is_initials(a):
            #print(a)
            #print(best_match(a))
            best_m, g = best_match(a)
            if best_m != None:
                correct_authors.append(best_m)
                genders.append(g)
                k += 1
                continue
                #print(a, best_m)
        correct_authors.append(a)
        if(a in known_m):
            genders.append(Gender.male)
        elif a in known_f:
            genders.append(Gender.female)
        else:
            genders.append(Gender.unknown)
    if len(correct_authors) != len(genders):
        print("error")
    new_entry["authors"] = correct_authors
    new_entry["genders"] = genders
    gender_df = gender_df.append(new_entry, ignore_index=True)
gender_df.set_index("id")
print(k)   

226


In [356]:
gender_df

Unnamed: 0,id,authors,genders,title,year,abstract
0,cmp-lg/9404001,"[Schabes, Yves, Shieber, Stuart M.]","[Gender.male, Gender.male]",An Alternative Conception of Tree-Adjoining De...,1994,The precise formulation of derivation for tree...
1,cmp-lg/9404002,"[Shieber, Stuart M.]",[Gender.male],Lessons from a Restricted Turing Test,1994,We report on the recent Loebner prize competit...
2,cmp-lg/9404003,"[Shieber, Stuart M.]",[Gender.male],Restricting the Weak-Generative Capacity of Sy...,1994,The formalism of synchronous tree-adjoining gr...
3,cmp-lg/9404004,"[Covington, Michael A.]",[Gender.male],An Empirically Motivated Reinterpretation of D...,1994,Dependency grammar is usually interpreted as e...
4,cmp-lg/9404005,"[Johnson, Mark]",[Gender.male],Memoization in Constraint Logic Programming,1994,This paper shows how to apply memoization ( ca...
5,cmp-lg/9404006,"[CHANDLER-BURNS, R. M.]",[Gender.unknown],SPANISH 1992 ( S92 ) : corpus-based analysis o...,1994,S92 research was begun in 1987 to analyze word...
6,cmp-lg/9404007,"[Bouma, Gosse, van Noord, Gertjan]","[Gender.male, Gender.male]",Constraint-Based Categorial Grammar,1994,We propose a generalization of Categorial Gram...
7,cmp-lg/9404008,"[Shieber, Stuart M., Schabes, Yves, Pereira, F...","[Gender.male, Gender.male, Gender.male]",Principles and Implementation of Deductive Par...,1994,We present a system for generating parsers bas...
8,cmp-lg/9404009,"[Dalrymple, Mary, Lamping, John, Pereira, Fern...","[Gender.female, Gender.male, Gender.male, Gend...",A Deductive Account of Quantification in LFG,1994,The relationship between Lexical-Functional Gr...
9,cmp-lg/9404010,"[Dalrymple, Mary, Lamping, John, Pereira, Fern...","[Gender.female, Gender.male, Gender.male, Gend...",Intensional Verbs Without Type-Raising or Lexi...,1994,We present an analysis of the semantic interpr...


In [44]:
fd.save_pickle(gender_df, "arxivdf")

In [45]:
my_new_unk_n_papers = defaultdict(int)
n_papers = defaultdict(int)
pp = dict()
all_auths = set()
for row in gender_df.iterrows():
    genders = row[1]["genders"]
    authors = row[1]["authors"]
    paper = row[1]["title"]
    for i,g in enumerate(genders):
        n_papers[authors[i]] += 1
        if g == Gender.unknown:
            my_new_unk_n_papers[authors[i]] += 1
            pp[authors[i]] = paper
    for a in authors:
        all_auths.add(a)
            

In [46]:
my_new_unk = dict()
for u, n in my_new_unk_n_papers.items():
    if n >= 1:
        my_new_unk[u] = pp[u]

In [47]:
my_new_unk

{'CHANDLER-BURNS, R. M.': 'SPANISH 1992 ( S92 ) : corpus-based analysis of present-day Spanish for medical purposes',
 'Smith, F. J.': 'Improving Statistical Language Model Performance with Automatically Generated Word Hierarchies',
 'Chen, Kuang-hua': 'Automatic Identification of Subjects for Textual Documents in Digital Libraries',
 'Srinivas, B.': 'Heuristics and Parse Ranking',
 'Kim, Geunbae Lee Jong-Hyeok Lee Kyunghee': 'Phoneme-level speech and natural language intergration for agglutinative languages',
 'Schubert, Lenhart': 'Knowledge Representation for Lexical Semantics : Is Standard First Order Logic Enough ?',
 'Vagelatos, A.': 'An electronic dictionary as a basis for NLP tools : The Greek case',
 'Triantopoulou, T.': 'Utilization of a Lexicon for Spelling Correction in Modern Greek',
 'Tsalidis, C.': 'Utilization of a Lexicon for Spelling Correction in Modern Greek',
 'Christodoulakis, D.': 'Utilization of a Lexicon for Spelling Correction in Modern Greek',
 'Lee, Geunbae':

In [35]:
with open("names.csv","w") as f:
    for auth, title in my_new_unk_n_papers.items():
        if is_initials(auth):
            continue
        f.write(auth.replace(",",";")+"\t"+pp[auth]+"\n")

In [48]:
len(my_new_unk) / len(all_auths)

0.1104212116126867

In [49]:
unk = set()
for auth in authors:
    auth = auth.strip()
    if auth in known_f or auth in known_m:
        continue
    else:
        unk.add(auth)

In [50]:
len(authors)

7

In [53]:
len(unk)

0

In [52]:
len(my_new_unk)

1316

In [30]:
from collections import defaultdict
male_last_names = defaultdict(list)
for name in known_m:
    if len(name.split(", ")) < 2:
        print(name)
        continue
    last, first = name.split(", ")
    male_last_names[last].append(first)
    
female_last_names = defaultdict(list)
for name in known_f:
    if len(name.split(", ")) < 2:
        print(name)
        continue
    last, first = name.split(", ")
    female_last_names[last].append(first)

Robert


['Kirchmeier-Andersen,']
['Barbara']
['Eklund,']
['Di Eugenio,']
['Sabine']


In [48]:
a = set()
a.add("B")
a.add("C")
b = set()
b.add("C")
b.add("B")
a==b

True

In [56]:
def init_match(inits, name):
    inits.replace("Dr.","")
    inits_split = inits.split(" ")
    name_split = name.split(" ")
    if len(inits_split) == len(name_split):
        match = True
        for idx, i in enumerate(inits_split):
            if i.strip()[0] != name_split[idx].strip()[0]:
                match = False
        if match == True:
            return 1 #confident match
    if inits[0] == name_split[0]:
        return 2 # maybe match
    inits_set = set()
    name_set = set()
    for letter in inits:
        if letter.isupper():
            inits_set.add(letter)
    for letter in name_set:
        if letter.isupper():
            name_set.add(letter)
    if inits_set == name_set:
        return 1 # confident match
    for letter in inits_set:
        if letter in name_set:
            return 3 # maybee
    return 0 #no match
    
    

In [52]:
#tests
print(match("M. R.", "Morgan R."))

1


In [71]:
i = 0
j = 0
k = 0
for name in just_initials:
    if len(name.split(", ")) < 2:
        #print(name)
        continue
    last, first = name.split(", ")
    m = last in male_last_names
    f = last in female_last_names
    if m and not f:
        names = male_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if mtch != 0:
                print(name, "===", n)
                known_m.add(name)
                i += 1
                break
       
    if f and not m:
        names = female_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if mtch != 0:
                #print(name, "===", n)
                known_f.add(name)
                j += 1
                break
    if f and m:
        
        match_f = 0
        match_m = 0
        names = male_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if(mtch > match_m):
                match_m = mtch
        names = female_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if(mtch > match_f):
                match_f = mtch
        if match_f > match_m:
            k+=1
            known_m.add(name)
        if match_m > match_f:
            k+=1
            known_f.add(f)
            
print(i, "sure males")    
print(j, "sure females")
print(k, "probably")

Backofen, R. === Rolf
Lopez-Gazpio, I. === Iñigo
Tur, G. === Gokhan
Besacier, L. === Laurent
Chicoisne, G. === Guillaume
El-Shishtawy, T. === Tarek
Gerow, A. === Aaron
Neumann, G. === Guenter
Marquez, L. === Lluis
Bentz, C. === Chris
Stamatopoulos, P. === Panagiotis
Androutsopoulos, I. === Ion
Spyropoulos, C. D. === Constantine D.
Dodds, P. S. === Peter Sheridan
Lopes, A. === António
Danforth, C. M. === Christopher M.
Thompson, C. === Christopher
Agirre, E. === Eneko
McMahon, M. T. === Matthew T.
Daelemans, W. === Walter
Paliouras, G. === George
Adhikari, R. === Ronojoy
Stone, M. === Matthew
Yvon, F. === Francois
Utiyama, M. === Masao
Petroni, F. === Filippo
Chenfour, N. === Noureddine
Padro, L. === Lluis
Keane, M. T. === Mark T.
Mueller, M. === Martin
Bagrow, J. P. === James P.
Sangal, R. === Rajeev
Kryssanov, V. V. === Victor V.
Benjumea, J. === Juan
Goodman, B. === Benjamin
Kwak, K. S. === Kyung Sup
Allahverdyan, A. E. === Armen E.
Lande, D. === Dmitry
Agnes, F. === Frederic
Semple,

In [None]:
known_m.

In [81]:
import os
from enum import Enum
import pandas as pd
import re
import numpy as np
from collections import Counter
import html
import matplotlib.pyplot as plt
import seaborn as sns
from metadata.metadata import ACL_metadata
from metadata import Gender
from collections import defaultdict
from itertools import chain

%matplotlib inline

In [63]:
unk

{'Filho, Demival Vasques',
 'R, Kanagavalli V',
 'Lee, Sang-goo',
 'Li, Xiao-li',
 'Beltagy, I.',
 'Walker, M. A.',
 'Yang, Zhenglu',
 'Zhang, Huimin',
 'Bastan, Mohadeseh',
 'Karray, H.',
 'Zhai, Chengxiang',
 'Annervaz, K. M.',
 'Martinez-Barco, P.',
 'Stamatopoulos, P.',
 'Sunitha, Dr K. V. N.',
 'Ferrero, J.',
 'Yao, Y.',
 'Chen, Kehai',
 'Mitra, Rajarshee',
 'Pabico, Jaderick P.',
 'Xiao, Xiong',
 'Ferrer-i-Cancho, R.',
 'Hsu, M. J.',
 'Eklund,',
 'Pustejovsky, J.',
 'Liu, Hairong',
 'Zhang, Xi',
 'Petroni, F.',
 'Hu, Wenbo',
 'Hong, Liangjie',
 'Semple, S.',
 'Chen, Danlu',
 'Trivedi, Priyansh',
 'Wei, Si',
 'Sun, Chonglin',
 'Ehsan, Upol',
 'Geng, Shiqiang',
 'Su, Hang',
 'Lu, Hsiang-Hung',
 'Ouyang, Sixun',
 'Wu, L.',
 'Cho, Sungzoon',
 'Nair, Surag',
 'Wang, Hongsu',
 'Luo, Juan',
 'Yan, Shuicheng',
 'Yang, Baosong',
 'Tivnan, B. F.',
 'Seo, Minjoon',
 'Utiyama, M.',
 'Marazzato, R.',
 'Bar-Yam, Yaneer',
 'Berri, J.',
 'Achary, K. K.',
 'Wang, Chunqi',
 'Wadhawan, Kahini',
 'G

In [54]:
from metadata.metadata import ACL_metadata
acl = ACL_metadata()
df = acl.meta_df
known_m = acl.known_m
known_f = acl.known_f

Remember to use acl.modeling_files and modeling_df for topic modeling


In [55]:
df.columns

Index(['authors', 'genders', 'title', 'venue', 'year'], dtype='object')

In [56]:
import re
def is_initials(name, sp=","):
    
    fn = name.split(sp)
    if len(fn) < 2:
        return True
    fn = fn[1].strip()
    regex = re.compile(r"\w+\.", re.IGNORECASE)
    no_initials = regex.sub("", fn).strip()
    if len(no_initials) < 2:
        return True
    return False
        
def init_match(inits, name):
    inits = inits.replace("Dr.","")
    inits = inits.strip()
    if(len(inits) == 0):
        return 0
    name = name.strip()
    inits_split = inits.split(" ")
    name_split = name.split(" ")
    if len(inits_split) == len(name_split):
        match = True
        for idx, i in enumerate(inits_split):
            if i.strip()[0] != name_split[idx].strip()[0]:
                match = False
        if match == True:
            return 1 #confident match
    if inits[0] == name_split[0]:
        return 2 # maybe match
    inits_set = set()
    name_set = set()
    for letter in inits:
        if letter.isupper():
            inits_set.add(letter)
    for letter in name_set:
        if letter.isupper():
            name_set.add(letter)
    if inits_set == name_set:
        return 1 # confident match
    for letter in inits_set:
        if letter in name_set:
            return 3 # maybee
    return 0 #no match

from collections import defaultdict
male_last_names = defaultdict(list)
for name in known_m:
    if len(name.split(",")) != 2:
        continue
    last, first = name.split(",")
    male_last_names[last].append(first)
    
female_last_names = defaultdict(list)
for name in known_f:
    if len(name.split(",")) != 2:
        continue
    last, first = name.split(",")
    female_last_names[last].append(first)

def best_match(name):
    if len(name.split(",")) != 2:
        return (None, None)
    last, first = name.split(",")
    m = last in male_last_names
    f = last in female_last_names
    if m and not f:
        names = male_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if mtch != 0:
                return (last + "," + n, Gender.male)
       
    if f and not m:
        names = female_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if mtch != 0:
                return (last + "," + n, Gender.male)
    if f and m:
        
        match_f = 0
        match_m = 0
        names = male_last_names[last]
        best_match = (None, None)
        for n in names:
            mtch = init_match(first, n)
            if(mtch > match_m):
                match_m = mtch
                best_match = (last + ", " + n, Gender.male)
        names = female_last_names[last]
        for n in names:
            mtch = init_match(first, n)
            if(mtch > match_f):
                match_f = mtch
                if mtch > match_m:
                    best_match = (last + ", " + n, Gender.female)
        if match_f > match_m:
            return best_match
        if match_m > match_f:
            return best_match
    return (None, None)


c=['authors', 'genders', 'title', 'venue', 'year']
gender_df = pd.DataFrame(columns=c)
k = 0
for row in df.iterrows():
    new_entry = dict()
    for col in ["venue", "title", "year"]:
        new_entry[col] = row[1][col]
    new_entry["id"] = row[1].name
    authors = row[1]["authors"]
    genders = []
    correct_authors = []
    for a in authors:
        if is_initials(a,","):
            #print(a)
            #print(best_match(a))
            best_m, g = best_match(a)
            if best_m != None:
                correct_authors.append(best_m)
                genders.append(g)
                k += 1
                continue
                #print(a, best_m)
           
        correct_authors.append(a)
        if(a in known_m):
            genders.append(Gender.male)
        elif a in known_f:
            genders.append(Gender.female)
        else:
            genders.append(Gender.unknown)
    new_entry["authors"] = correct_authors
    new_entry["genders"] = genders
    gender_df = gender_df.append(new_entry, ignore_index=True)
gender_df.set_index("id")
print(k)   


88


In [57]:
my_new_unk = dict()
all_auths = set()
for row in gender_df.iterrows():
    genders = row[1]["genders"]
    authors = row[1]["authors"]
    paper = row[1]["title"]
    for i,g in enumerate(genders):
        if g == Gender.unknown:
            my_new_unk[authors[i]] = paper
    for a in authors:
        all_auths.add(a)

In [58]:
len(my_new_unk)

973

In [59]:
len(acl.unk)

1003

In [60]:
my_new_unk = dict()
all_auths = set()
for row in acl.meta_df.iterrows():
    genders = row[1]["genders"]
    authors = row[1]["authors"]
    paper = row[1]["title"]
    for i,g in enumerate(genders):
        if g == Gender.unknown:
            my_new_unk[authors[i]] = paper
    for a in authors:
        all_auths.add(a)

In [62]:
fd.save_pickle(gender_df,"acldf")

In [335]:
c=['authors', 'genders', 'title', 'venue', 'year']
gender_df = pd.DataFrame(columns=c)
k = 0
for row in df.iterrows():
    new_entry = dict()
    for col in ["venue", "title", "year"]:
        new_entry[col] = row[1][col]
    new_entry["id"] = row[1].name
    authors = row[1]["authors"]
    genders = []
    correct_authors = []
    for a in authors:
        correct_authors.append(a)
        if(a in known_m):
            genders.append(Gender.male)
        elif a in known_f:
            genders.append(Gender.female)
        else:
            genders.append(Gender.unknown)
    new_entry["authors"] = correct_authors
    new_entry["genders"] = genders
    gender_df = gender_df.append(new_entry, ignore_index=True)
gender_df.set_index("id")

Unnamed: 0_level_0,authors,genders,title,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E03-1001,"[Oard,Douglasw]",[Gender.male],Multilingual Access To Large Spoken Archives (...,EACL,2003
E03-1002,"[Henderson,Jamesb]",[Gender.male],Neural Network Probability Estimation For Broa...,EACL,2003
E03-1003,"[Burstein,Jill, Wolska,Magdalena]","[Gender.female, Gender.female]",Toward Evaluation Of Writing Style: Overly Rep...,EACL,2003
E03-1004,"[Cmejrek,Martin, Curin,Jan, Havelka,Jiri]","[Gender.male, Gender.male, Gender.male]",Czech-English Dependency Tree-Based Machine Tr...,EACL,2003
E03-1005,"[Bod,Rens]",[Gender.male],An Efficient Implementation Of A New DOP Model,EACL,2003
E03-1006,"[Smets,Martine, Gamon,Michael, Corstonoliver,S...","[Gender.female, Gender.male, Gender.male, Gend...",French Amalgam: A Quick Adaptation Of A Senten...,EACL,2003
E03-1007,"[Ueffing,Nicola, Ney,Hermann]","[Gender.female, Gender.male]",Using POS Information For SMT Into Morphologic...,EACL,2003
E03-1008,"[Steedman,Mark, Osborne,Miles, Sarkar,Anoop, C...","[Gender.male, Gender.male, Gender.male, Gender...",Bootstrapping Statistical Parsers From Small D...,EACL,2003
E03-1009,"[Clark,Alexander]",[Gender.male],Combining Distributional And Morphological Inf...,EACL,2003
E03-1010,"[Yasuda,Keiji, Sugaya,Fumiaki, Takezawa,Toshiy...","[Gender.male, Gender.male, Gender.male, Gender...",Automatic Evaluation For A Palpable Measure Of...,EACL,2003


In [5]:
my_new_unk = dict()
all_auths = set()
for row in acl.meta_df.iterrows():
    genders = row[1]["genders"]
    authors = row[1]["authors"]
    paper = row[1]["title"]
    for i,g in enumerate(genders):
        if g == Gender.unknown:
            my_new_unk[authors[i]] = paper
    for a in authors:
        all_auths.add(a)

NameError: name 'acl' is not defined

In [61]:
gender_df

Unnamed: 0,authors,genders,title,venue,year,id
0,"[Oard,Douglasw]",[Gender.male],Multilingual Access To Large Spoken Archives (...,EACL,2003,E03-1001
1,"[Henderson,Jamesb]",[Gender.male],Neural Network Probability Estimation For Broa...,EACL,2003,E03-1002
2,"[Burstein,Jill, Wolska,Magdalena]","[Gender.female, Gender.female]",Toward Evaluation Of Writing Style: Overly Rep...,EACL,2003,E03-1003
3,"[Cmejrek,Martin, Curin,Jan, Havelka,Jiri]","[Gender.male, Gender.male, Gender.male]",Czech-English Dependency Tree-Based Machine Tr...,EACL,2003,E03-1004
4,"[Bod,Rens]",[Gender.male],An Efficient Implementation Of A New DOP Model,EACL,2003,E03-1005
5,"[Smets,Martine, Gamon,Michael, Corstonoliver,S...","[Gender.female, Gender.male, Gender.male, Gend...",French Amalgam: A Quick Adaptation Of A Senten...,EACL,2003,E03-1006
6,"[Ueffing,Nicola, Ney,Hermann]","[Gender.female, Gender.male]",Using POS Information For SMT Into Morphologic...,EACL,2003,E03-1007
7,"[Steedman,Mark, Osborne,Miles, Sarkar,Anoop, C...","[Gender.male, Gender.male, Gender.male, Gender...",Bootstrapping Statistical Parsers From Small D...,EACL,2003,E03-1008
8,"[Clark,Alexander]",[Gender.male],Combining Distributional And Morphological Inf...,EACL,2003,E03-1009
9,"[Yasuda,Keiji, Sugaya,Fumiaki, Takezawa,Toshiy...","[Gender.male, Gender.male, Gender.male, Gender...",Automatic Evaluation For A Palpable Measure Of...,EACL,2003,E03-1010


In [342]:
my_new_unk2 = my_new_unk