# IETF Affiliations from Attendance Records

In [1]:
import bigbang.datasets.domains as domains
import bigbang.analysis.utils as utils
import bigbang.analysis.attendance as attendance

from ietfdata.datatracker     import *
from ietfdata.datatracker_ext import *
import pandas as pd
import matplotlib.pyplot as plt
import dataclasses

In [2]:
# Load the organization categories table from the sibling `organizations`
# directory; it is linked to the IETF affiliations later in this notebook.
org_cats = pd.read_csv("../organizations/organization_categories.csv")

## Getting attendance records from datatracker

When attendees register for a meeting, they report their name, email address, and affiliation.

While this is noisy data (any human-entered data is!), we will use this information to associate domains with affiliations. E.g. the email domain `apple.com` is associated with the company Apple.

We will also use this data to enrich our understanding of individual affiliations over time.

In [3]:
# Connect to the IETF Datatracker and request every meeting whose type is
# the "ietf" meeting type.
datatracker = DataTracker()

ietf_type_uri = MeetingTypeURI('/api/v1/name/meetingtypename/ietf/')
ietf_meeting_type = datatracker.meeting_type(ietf_type_uri)
meetings = datatracker.meetings(meeting_type=ietf_meeting_type)

# Materialize the results into a list for the cells below.
full_ietf_meetings = list(meetings)

In [4]:
# One plain dict per meeting: the dataclass fields, plus the original meeting
# object ('meeting_obj') and the meeting number parsed as an int ('num'),
# which later cells use to select meetings.
ietf_meetings = [
    {
        **dataclasses.asdict(meeting),
        'meeting_obj': meeting,
        'num': int(meeting.number),
    }
    for meeting in full_ietf_meetings
]

In [5]:
# Tabular view of the meeting records built above (one row per meeting dict).
meetings_df = pd.DataFrame.from_records(ietf_meetings)

## Individual Affiliations

In [6]:
# Extended datatracker client; used below for its `meeting_registrations` call.
dt = DataTrackerExt() # initialize, for all meeting registration downloads

This will construct a dataframe of every attendee's registration at every specified meeting. (Downloading this data takes a while!)

In [7]:
# Spot check of one meeting's date.
# NOTE(review): 110 is a position in the `ietf_meetings` list, not the IETF
# meeting number, so the result depends on the order in which the meetings
# were returned -- confirm this is the meeting you intend to inspect.
ietf_meetings[110]['date']

datetime.datetime(2021, 7, 24, 0, 0)

In [8]:
# Meetings to analyze, by IETF meeting number; edit this list to widen or
# narrow the analysis.
meeting_numbers = [104, 105, 106, 107, 108, 109]

# Build one frame per selected meeting and concatenate once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previously collected rows on every iteration.
attendee_frames = []
for meeting in ietf_meetings:
    if meeting['num'] not in meeting_numbers:
        continue
    registrations = dt.meeting_registrations(meeting=meeting['meeting_obj'])
    df = pd.DataFrame.from_records([dataclasses.asdict(x) for x in registrations])
    # Tag every registration with the meeting it belongs to.
    df['num'] = meeting['num']
    df['date'] = meeting['date']
    # Derived columns used by the affiliation analysis below.
    df['domain'] = df['email'].apply(utils.extract_domain)
    df['full_name'] = df['first_name'] + " " + df['last_name']
    attendee_frames.append(df)

# `pd.concat` rejects an empty list, so fall back to an empty frame when no
# meeting matched (the previous loop-and-append version also produced an
# empty frame in that case).
meeting_attendees_df = pd.concat(attendee_frames) if attendee_frames else pd.DataFrame()

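Filter by those who actually attended the meeting (checked in, didn't just register). Note that the cell below only selects the columns used for the affiliation analysis; it does not itself apply a check-in filter.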

In [9]:
# Keep only the columns that relate a person (name, email, domain) to an
# affiliation on a given meeting date.
affiliation_columns = ['full_name', 'affiliation', 'email', 'domain', 'date']
ind_affiliation = meeting_attendees_df[affiliation_columns]

This format of data -- with name, email, affiliation, and a timestamp -- can also be extracted from other IETF data, such as the RFC submission metadata. Later, we will use data of this form to infer the _duration_ of affiliation for IETF attendees.

In [10]:
# Preview the first ten registration records.
ind_affiliation.head(10)

Unnamed: 0,full_name,affiliation,email,domain,date
0,Thomas Pauly,Apple,tpauly@apple.com,apple.com,2019-03-23
1,Eric Kinnear,Apple,ekinnear@apple.com,apple.com,2019-03-23
2,Jordi Palet Martinez,Moremar,jordi.palet@consulintel.es,consulintel.es,2019-03-23
3,Heather Flanagan,RFC Editor,rse@rfc-editor.org,rfc-editor.org,2019-03-23
4,Kyle Rose,Akamai Technologies,krose@krose.org,krose.org,2019-03-23
5,Aaron Falk,Akamai,aaron.falk@gmail.com,gmail.com,2019-03-23
6,Russ Housley,"Vigil Security, LLC",housley@vigilsec.com,vigilsec.com,2019-03-23
7,Jason Livingood,Comcast // IASA 2.0 WG,Jason_Livingood@comcast.com,comcast.com,2019-03-23
8,Jeff Osborn,Internet Systems Consortium,jeff@isc.org,isc.org,2019-03-23
9,Mahesh Jethanandani,VMware,mjethanandani@gmail.com,gmail.com,2019-03-23


In [11]:
# Frequency of each reported affiliation string. `value_counts` already
# excludes missing values by default, so no separate `dropna()` is needed.
ind_affiliation['affiliation'].value_counts()

                                                          1071
Cisco                                                      166
Google                                                     151
Huawei                                                     149
Cisco Systems                                              140
                                                          ... 
ISAE-SUPAERO / TESA                                          1
Good API                                                     1
Towson University                                            1
NUS                                                          1
<a href='https://ghanarecruitment.com'>Recruitment</a>       1
Name: affiliation, Length: 1860, dtype: int64

## Matching affiliations with domains

In [12]:
# Count, for every (affiliation, domain) pair, how many registrations used an
# e-mail address at that domain. Rows are affiliations, columns are domains.
affil_domain = (
    ind_affiliation
    .loc[:, ['affiliation', 'domain', 'email']]
    .pivot_table(index='affiliation', columns='domain', values='email', aggfunc='count')
)

Drop both known generic and known personal email domains.

In [13]:
# Load the domain categorization and collect the index labels of the rows
# categorized as 'generic' and as 'personal'.
ddf = domains.load_data()

is_generic = ddf['category'] == 'generic'
is_personal = ddf['category'] == 'personal'

generics = ddf.loc[is_generic].index
personals = ddf.loc[is_personal].index

In [14]:
# Remove every domain column that the domains dataset categorizes as 'generic'.
generic_email_domains = set(affil_domain.columns) & set(generics)
affil_domain.drop(columns=list(generic_email_domains), inplace=True)

In [15]:
# Remove every domain column that the domains dataset categorizes as 'personal'.
personal_email_domains = set(affil_domain.columns) & set(personals)
affil_domain.drop(columns=list(personal_email_domains), inplace=True)

In [16]:
# Per-affiliation summary of how its registrations are spread over e-mail
# domains.
#
# The summary columns are written back into `affil_domain` at the end of this
# cell, so the statistics are computed on the domain columns only. Without
# this, re-running the cell would fold the previous run's statistics into the
# new ones.
stat_columns = ['max', 'mean', 'count', 'sum', 'max_domain']
domain_counts = affil_domain.drop(columns=stat_columns, errors='ignore')

# Vectorized row reductions; missing (NaN) counts are skipped by default.
ad_max = domain_counts.max(axis=1)      # registrations at the most-used domain
ad_mean = domain_counts.mean(axis=1)    # mean registrations per domain in use
ad_count = domain_counts.count(axis=1)  # number of distinct domains in use
ad_sum = domain_counts.sum(axis=1)      # total registrations (0 if none left)

# Some affiliations have no domain counts left after the generic/personal
# domains were dropped. `idxmax` on an all-NA row is deprecated in
# pandas >= 2.1, so report a missing value for those rows explicitly.
ad_max_domain = domain_counts.apply(
    lambda row: row.idxmax() if row.notna().any() else float('nan'),
    axis=1,
)

## Add the columns *after* computing the statistics!
affil_domain['max'] = ad_max
affil_domain['mean'] = ad_mean
affil_domain['count'] = ad_count
affil_domain['sum'] = ad_sum
affil_domain['max_domain'] = ad_max_domain

In [17]:
# Summary columns only, ordered so the affiliations with the largest
# single-domain count come first.
summary_columns = ['max_domain', 'max', 'count', 'mean', 'sum']
ad_stats = affil_domain[summary_columns].sort_values(by='max', ascending=False)

In [18]:
# Show the 100 affiliations with the highest 'max' value.
ad_stats.head(100)

domain,max_domain,max,count,mean,sum
affiliation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Huawei,huawei.com,132.0,4,33.750000,135.0
Cisco,cisco.com,126.0,6,23.166667,139.0
Cisco Systems,cisco.com,124.0,3,42.666667,128.0
Google,google.com,112.0,9,14.444444,130.0
Ericsson,ericsson.com,103.0,8,15.625000,125.0
...,...,...,...,...,...
Telefonica,telefonica.com,7.0,1,7.000000,7.0
CENTR,centr.org,7.0,1,7.000000,7.0
IANA Services,iana.org,7.0,1,7.000000,7.0
ZTE Corporation,zte.com.cn,7.0,1,7.000000,7.0


In [19]:
# Export the same top-100 slice to a CSV file in the working directory.
ad_stats.head(100).to_csv("affiliation_domain_stats.csv")

In [20]:
# Total per affiliation of the registration counts over the remaining domain
# columns, in `ad_stats` order (sorted by 'max', not by 'sum').
ad_stats['sum']

affiliation
Huawei                  135.0
Cisco                   139.0
Cisco Systems           128.0
Google                  130.0
Ericsson                125.0
                        ...  
windows                   0.0
www Safegnamain. com      0.0
xierqi                    0.0
ابراهيم                   0.0
恩典之前  讚美之裏                0.0
Name: sum, Length: 1860, dtype: float64

## Duration of affiliation

The current data we have for individual affiliations is "point" data, reflecting the affiliation of an individual on a particular date.

For many kinds of analysis, we may want to understand the full duration for which an individual has been associated with an organization. This requires an inference from the available data points to dates that are not explicitly represented in the data.

For now, we will use a rather simple form of inference: filling in any missing data from the last (temporally) known data point. And then if there's still missing data, infer backwards.

In [21]:
# One row per meeting date, one column per attendee; each cell holds the first
# affiliation that attendee reported on that date.
#
# Gaps are then filled forward from the last known value and, for anything
# still missing at the start, backward from the next known value.
# `fillna(method=...)` is deprecated since pandas 2.1; `.ffill()` / `.bfill()`
# are the equivalent dedicated methods.
#
# NOTE(review): only true missing values are filled. Blank affiliation strings
# appear to exist in the data and would be kept as-is -- confirm whether they
# should be treated as missing before the pivot.
affil_dates = (
    ind_affiliation
    .pivot_table(
        index="date",
        columns="full_name",
        values="affiliation",
        aggfunc="first",
    )
    .ffill()
    .bfill()
)

In [22]:
# Names of the 40 people with the most registration records (rows with a
# non-missing date) across the selected meetings.
registrations_per_person = ind_affiliation.groupby('full_name')['date'].count()
top_attendees = registrations_per_person.sort_values(ascending=False).iloc[:40].index

In [23]:
# Display the selected names (a pandas Index named 'full_name').
top_attendees

Index(['Ignas Bagdonas', 'Martin Duke', 'Gert Grammel', 'Roni Even',
       'Linda Dunbar', 'Toerless Eckert', 'Richard Barnes', 'Kohei Isobe',
       'Yutaka OIWA', 'Jonathan Lennox', 'Jim Reid', 'Ronald in 't Velt',
       'Gonzalo Camarillo', 'Paul Ebersman', 'Martin Thomson',
       'Marten Seemann', 'Glenn Deen', 'Martin Vigoureux', 'Paul Congdon',
       'Tianran Zhou', 'Ramesh Sivakolundu', 'Tero Kivinen', 'Markus Amend',
       'Ted Hardie', 'Chris Bowers', 'Tal Mizrahi', 'Takuya Miyasaka',
       'Chonggang Wang', 'Dominique Lazanski', 'Gorry Fairhurst',
       'Dino Farinacci', 'Tadahiko Ito', 'Suzanne Woolf', 'Susan Hares',
       'Suresh Krishnan', 'Matthew Ford', 'Dieter Sibold', 'Mark Nottingham',
       'Paul Hoffman', 'Marcus Ihlar'],
      dtype='object', name='full_name')

In [24]:
# Affiliation-by-date table restricted to the top attendees' columns.
affil_dates[top_attendees]

full_name,Ignas Bagdonas,Martin Duke,Gert Grammel,Roni Even,Linda Dunbar,Toerless Eckert,Richard Barnes,Kohei Isobe,Yutaka OIWA,Jonathan Lennox,...,Dino Farinacci,Tadahiko Ito,Suzanne Woolf,Susan Hares,Suresh Krishnan,Matthew Ford,Dieter Sibold,Mark Nottingham,Paul Hoffman,Marcus Ihlar
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-03-23,Equinix,"F5 Networks, Inc.",Juniper,Huawei Technologies,Futurewei,Huawei USA,Cisco,SECOM,,Vidyo,...,,,,,Kaloom,Internet Society,PTB,,ICANN,Ericsson
2019-07-20,Equinix,"F5 Networks, Inc.",Juniper Networks,Toga Networks,Futurewei,Futurewei Technologies USA,Cisco,SECOM,AIST Japan / 産業技術総合研究所,8x8,...,lispers.net,SECOM,Public Interest Registry (.org),,Kaloom,Internet Society (ISOC),PTB,Fastly,ICANN,Ericsson
2019-11-16,Equinix,"F5 Networks, Inc.",Juniper,Toga Networks,Futurewei,Futurewei USA,Cisco,SECOM,AIST Japan,8x8 / Jitsi,...,,SECOM,Public Interest Registry (.org),,Kaloom,Internet Society,PTB,Fastly,ICANN,Ericsson
2020-03-21,Equinix,F5 Networks,Juniper,Toga Networks,,Futurewei USA,Cisco,SECOM,AIST Japan,8x8 / Jitsi,...,lispers.net,,Public Interest Registry (.org),,Kaloom,Internet Society,,Fastly,ICANN,Ericsson
2020-07-25,Equinix,"F5 Networks, Inc.",Juniper,,Futurewei,Futurewei USA,Cisco,SECOM,AIST Japan,,...,lispers.net,"SECOM CO., LTD.",Public Interest Registry (.ORG),Huawei,Kaloom,Internet Society,PTB,Fastly,ICANN,Ericsson
2020-11-14,Equinix,"F5 Networks, Inc.",Juniper,,Futurewei,Futurewei USA,Cisco,SECOM,AIST Japan,8x8 / Jitsi,...,lispers.net,SECOM,Public Interest Registry (PIR),Hickory Hill Consulting,Kaloom,Internet Society (ISOC),PTB,Fastly,ICANN,Ericsson


In [25]:
# Export the top attendees' affiliation-by-date table to a CSV file in the
# working directory.
affil_dates[top_attendees].to_csv("inferred_affiliation_dates.csv")

### Linking to Organization lists

In [26]:
import bigbang.analysis.process as process

In [27]:
# drop subsidiary organizations
org_cats = org_cats[org_cats['subsidiary of / alias of'].isna()]

org_cats

Unnamed: 0,name,category,subsidiary of / alias of,stakeholdergroup,nationality,email domain names,Membership Organization,Wiki Page
0,3GPP,Standards Body,,Technical Community,,3gpp.org,,
1,450connect GmbH,,,,Germany,,ETSI,
2,4G Americas,,,,United States,,OTHER,
3,5G Americas,,,,United States,,OTHER,
4,5G Automotive Association,Association,,Business,Germany,,OTHER,
...,...,...,...,...,...,...,...,...
938,Zhejiang University,Research Institution,,Academia,China,,CCSA,
939,Zollkriminalamt (ZKA),,,Government,Germany,,ETSI,
940,ZONSON SAMRT AUTO CORPORATION,Consumer hardware and software vendor,,Business,China,,CCSA,
941,ZTE,Networking equipment vendor,,Business,China,"zte.com.cn, zte.com",CCSA,


Normalize/resolve the names from the IETF attendance records.

In [28]:
# Candidate organization names with a weight each: IETF affiliations weighted
# by their 'sum' statistic, plus every name from the organization list with a
# weight of 1.
# `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
org_names = pd.concat([
    ad_stats['sum'],
    pd.Series(index=org_cats['name'], data=1),
])

# Sort by weight first so that, when a name occurs in both sources, the entry
# with the larger weight is the one kept by `duplicated(keep="first")`.
org_names = org_names.sort_values(ascending=False)
org_names = org_names[~org_names.index.duplicated(keep="first")]

In [29]:
# Resolve the organization names into entities using containment distance.
ents = process.resolve_entities(org_names, process.containment_distance, threshold=.15)

# Invert the result: as used here, `ents` maps each entity to the names
# grouped under it, and `replacements` maps every such name back to its entity.
# If a name occurs under several entities, the last one iterated wins.
replacements = {
    variant: canonical
    for canonical in ents
    for variant in ents[canonical]
}

In [30]:
# Attach the resolved entity name to both tables. `ad_stats` is keyed by its
# index (the affiliation string); `org_cats` is keyed by its 'name' column.
# A name missing from `replacements` raises KeyError.
ad_stats['norm_org'] = [replacements[affiliation] for affiliation in ad_stats.index]
org_cats['norm_org'] = [replacements[org_name] for org_name in org_cats['name']]

In [31]:
# Left-join the IETF statistics onto the organization list. `on='norm_org'`
# matches the `norm_org` column of `org_cats` against the *index* of
# `ad_stats` (the affiliation strings); the clashing `norm_org` column from
# `ad_stats` is suffixed with "_ietf".
org_cats_plus = org_cats.join(
    ad_stats[['max_domain', 'norm_org']],
    on='norm_org',
    rsuffix="_ietf",
)

In [32]:
# Keep only organizations for which the join found an IETF e-mail domain,
# drop the redundant suffixed column, and give the domain column a clearer name.
has_ietf_domain = org_cats_plus['max_domain'].notna()
org_cats_plus_match = (
    org_cats_plus[has_ietf_domain]
    .drop(columns='norm_org_ietf')
    .rename(columns={'max_domain': 'max_domain_ietf'})
)

In [33]:
# Export the matched organizations to a CSV file in the working directory.
org_cats_plus_match.to_csv("org_categories_matched_with_ietf_attendence_domains.csv")

In [34]:
# Preview the first twenty matched organizations.
org_cats_plus_match.head(20)

Unnamed: 0,name,category,subsidiary of / alias of,stakeholdergroup,nationality,email domain names,Membership Organization,Wiki Page,norm_org,max_domain_ietf
24,Afilias,Internet Registry,,Business,United States,afilias.info,,,Afilias,afilias.info
27,Akamai Technologies,Content Distribution Network,,Business,United States,akamai.com,,,Akamai Technologies,akamai.com
30,Alibaba (China) Group. Ltd.,,,Business,China,alibaba-inc.com,CCSA,,Alibaba,alibaba-inc.com
37,Amazon Web Services Inc.,Cloud Provider,,Business,United States,amazon.com,ETSI,,Amazon Web Services,amazon.com
39,AMS,IETF secretariat,,Technical Community,United States,amsl.com,,,AMS,amsl.com
45,APNIC,Internet Registry,,Technical Community,,apnic.net,,,APNIC,apnic.net
46,Apple,Consumer hardware and software vendor,,Business,United States,apple.com,,,Apple,apple.com
63,Arista Networks,Cloud Provider,,Business,United States,arista.com,,,Arista Networks,arista.com
64,Arrcus,Cloud Provider,,Business,United States,arrcus.com,,,Arrcus,arrcus.com
65,ARM,Chipmaker,,Business,United States,arm.com,,,ARM,arm.com


## Export the graph of relations

Getting the affiliation data relations extracted from the attendance tables.

Final form: Three tables:
 - Name - Email, earliest and latest date
 - Name - Affiliation, earliest and latest date
 - Email - Affiliation, earliest and latest date

These can be combined into a tripartite graph, which should have a component for each affiliation entity.


In [35]:
# IETF meeting numbers whose attendance records feed the relation tables below.
meeting_range = [106,107,108]

In [36]:
# Build the three relation tables from the attendance records. Judging by the
# outputs displayed below:
#   a: full_name / affiliation, with min_date and max_date
#   b: email     / affiliation, with min_date and max_date
#   c: full_name / email,       with min_date and max_date
# NOTE(review): `threshold` here is 0.17 while the entity resolution earlier in
# this notebook uses .15 -- presumably both are entity-resolution thresholds;
# confirm the difference is intentional.
a, b, c = attendance.name_email_affil_relations_from_IETF_attendance(meeting_range, threshold = 0.17)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [37]:
# Name - affiliation relation table.
a

Unnamed: 0,full_name,affiliation,min_date,max_date
0,,,2019-11-16,2020-07-25
1,ABDEL RAHMAN NJI PANSO MOUNCHILI,,2019-11-16,2020-07-25
2,AKIRA MURAKAMI,Toshiba,2019-11-16,2020-07-25
3,ALBERT ESPINAL,ESPOL,2019-11-16,2020-07-25
4,ANDREW GROVER,Mozilla,2019-11-16,2020-07-25
...,...,...,...,...
2674,yusuke kagiwada,,2019-11-16,2020-07-25
2675,zhenbin Li,,2019-11-16,2020-07-25
2676,Éric Vyncke,Cisco,2019-11-16,2020-07-25
2677,Øyvind Rønningstad,Nordic Semiconductor,2019-11-16,2020-07-25


In [38]:
# Email - affiliation relation table.
b

Unnamed: 0,email,affiliation,min_date,max_date
0,,,2019-11-16,2020-07-25
1,0bz3884c07731v.ietf@gmail.com,SECOM,2019-11-16,2020-07-25
2,115049098@qq.com,,2019-11-16,2020-07-25
3,1252363@tongji.edu.cn,Tongji University,2019-11-16,2020-07-25
4,13910628561@139.com,,2019-11-16,2020-07-25
...,...,...,...,...
2723,zuni.wang@huawei.com,,2019-11-16,2020-07-25
2724,zzhang1@futurewei.com,Futurewei,2019-11-16,2020-07-25
2725,zzhang@juniper.net,Juniper Networks,2019-11-16,2020-07-25
2726,zzhang_ietf@hotmail.com,,2019-11-16,2020-07-25


In [39]:
# How many rows of the email - affiliation table carry the exact, lower-case
# affiliation 'cisco'. Counting with a boolean mask yields 0 when the label is
# absent, whereas indexing `value_counts()` by label would raise KeyError.
b['affiliation'].eq('cisco').sum()

5

In [40]:
# Name - email relation table.
c

Unnamed: 0,full_name,email,min_date,max_date
0,,I,2019-11-16,2020-03-21
1,,haqoleho@mailinator.com,2020-07-25,2020-07-25
2,ABDEL RAHMAN NJI PANSO MOUNCHILI,morellaorny@gmail.com,2019-11-16,2020-07-25
3,AKIRA MURAKAMI,akira4.murakami@toshiba.co.jp,2019-11-16,2020-07-25
4,ALBERT ESPINAL,aespinal@espol.edu.ec,2019-11-16,2020-07-25
...,...,...,...,...
2587,yusuke kagiwada,block.rxckin.beats@gmail.com,2019-11-16,2020-07-25
2588,zhenbin Li,robinli314@163.com,2019-11-16,2020-07-25
2589,Éric Vyncke,evyncke@cisco.com,2019-11-16,2020-07-25
2590,Øyvind Rønningstad,oyvind.ronningstad@nordicsemi.no,2019-11-16,2020-07-25


### Match to a mailing list

In [41]:
from bigbang.archive import Archive
# Open the mailing-list archive identified by "httpbisa".
# NOTE(review): presumably this reads a previously collected local archive --
# confirm where `Archive` looks for the data.
arx = Archive("httpbisa")

From the archive data: From -> email address, Date

Match with table B (email, min_date, max_date) to get the Affiliation.

Add Affiliation to the archive data.

In [42]:
# Annotate the archive's messages with affiliations taken from table `b`
# (email, affiliation, min_date, max_date).
# NOTE(review): the matching on sender address and message date is done inside
# `add_affiliation`; see that method for the exact rule.
arx.add_affiliation(b)

In [43]:
# Show the messages with no missing value in From, Date or affiliation.
arx.data[['From','Date','affiliation']].dropna()

Unnamed: 0_level_0,From,Date,affiliation
Message-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<DM6PR22MB20103E911E1FE1007270CEA9DA4D0@DM6PR22MB2010.namprd22.prod.outlook.com>,Mike Bishop <mbishop@evequefou.be>,2019-11-18 08:02:15+00:00,Akamai Technologies
<CALGR9obJLrTufZe+UGtzNmucyZa4oQCcOeuY+Aq9SwM4w0kaJg@mail.gmail.com>,Lucas Pardue <lucaspardue.24.7@gmail.com>,2019-11-18 18:55:17+00:00,Cloudflare
<C573DE86-39B5-48AB-9C21-0717F4EBEB59@mnot.net>,Mark Nottingham <mnot@mnot.net>,2019-11-19 07:43:15+00:00,Fastly
<CAKC-DJiuE0Gmt5kMz8UJbZ625hdeMZLih3CCg1FQKx92D79oLw@mail.gmail.com>,Erik Nygren <erik+ietf@nygren.org>,2019-11-20 02:55:26+00:00,Akamai Technologies
<371380E9-7204-41EC-8F32-653E9B5272D8@iii.ca>,Cullen Jennings <fluffy@iii.ca>,2019-11-20 09:39:43+00:00,cisco
...,...,...,...
<CAAZdMae-jZVfv98jC3ru51uk6H6TZuaQ+P=PJW-BZ9uEKVbQCA@mail.gmail.com>,Victor Vasiliev <vasilvv@google.com>,2020-07-21 20:22:30+00:00,Google
<CALGR9oa1aW90VmrGO8wAsBWAq2un5bHY67m-HpaR_8LpNrD_3g@mail.gmail.com>,Lucas Pardue <lucaspardue.24.7@gmail.com>,2020-07-21 23:43:46+00:00,Cloudflare
<010001737c434b23-44eb8c59-f98a-4c3d-8cf3-b991994f2e23-000000@email.amazonses.com>,Kent Watsen <kent+ietf@watsen.net>,2020-07-23 15:20:51+00:00,Watsen Networks
<CAHbrMsDNwZN64Y7Tfp0e0JQOSfArk5LeUTC8JqBeatiBVFJN0g@mail.gmail.com>,Ben Schwartz <bemasc@google.com>,2020-07-23 18:06:21+00:00,Google / Jigsaw
