In [1]:
import numpy as np
import pandas as pd
from google.colab import files

#LINK TO DATASET - https://github.com/dev7saxena/LIS875/blob/main/Data/875_week2_acm_articles.csv

uploaded = files.upload()

Saving 875_week2_acm_articles.csv to 875_week2_acm_articles.csv


In [2]:
# Read the CSV (first row is the header; keep_default_na=False keeps empty fields
# as '' instead of NaN) and keep the first five columns of every row as a tuple.
data = [
    (row[0], row[1], row[2], row[3], row[4])
    for row in pd.read_csv('875_week2_acm_articles.csv', header=0, keep_default_na=False).values.tolist()
]

In [3]:
# Index the articles by id. Each value is a dict of the article's fields, with the
# ';'-separated author string split into a list of names.
articles = {
    art_id: {
        'id': art_id,
        'conf': conf,
        'year': year,
        'title': title,
        'authors': authors.split(';'),
    }
    for art_id, conf, year, title, authors in data
}

## List vs. Set

If you need to repeatedly check whether an item is in a collection, use a `set` instead of a `list`. A list is much slower for the `in` operation.

Let's look at an example. Here we divide the articles into those published in the SIGIR conference and those published in other conferences. Then, we extract the SIGIR articles whose authors never published in any other conference.

TL;DR
* item in list: O(n)
* item in set: O(1)

In [4]:
# We can check if an item is within a collection of items using "in". This works for both list and set (and other collections).

1 in [4,2,3,1,5]

True

In [5]:
1 in {4,2,3,1,5}

True

In [None]:
# Split the articles into two groups: those published in SIGIR and those published elsewhere.

articles_sigir = {
    art_id: art
    for art_id, art in articles.items()
    if art['conf'].startswith('SIGIR')
}
articles_other = {
    art_id: art
    for art_id, art in articles.items()
    if not art['conf'].startswith('SIGIR')
}

articles_sigir

In [7]:
print(len(articles_sigir), len(articles_other))

3379 274554


In [8]:
# Collect the names of all authors who published in a conference other than SIGIR.
# We store them as a set first.

names_exclude = {
    name
    for art in articles_other.values()
    for name in art['authors']
}
len(names_exclude)

314585

In [None]:
# Extract the SIGIR articles for which none of the authors ever published in another conference.

[
    art
    for art in articles_sigir.values()
    if np.sum([1 for name in art['authors'] if name in names_exclude]) == 0
]

In [10]:
# Let's take a look at how fast it is.

import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring short
# intervals; time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
[ x for x in articles_sigir.values() if np.sum( [ 1 for au in x['authors'] if au in names_exclude ] ) == 0 ]
end = time.perf_counter()

print(end - start)

0.03374123573303223


In [None]:
import time

# Now let's store the names to exclude in a list instead of a set.
# The same filter takes much longer to finish.
names_excludelist = list(names_exclude)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
[ x for x in articles_sigir.values() if np.sum( [ 1 for au in x['authors'] if au in names_excludelist ] ) == 0 ]
end = time.perf_counter()

print(end - start)

The cell above takes a long time to run because `au in names_excludelist` on a list has to compare `au` against the elements one by one (O(n) per check), whereas a set is backed by a hash table, so `au in names_exclude` can locate the item directly from its hash (O(1) on average).

## Function and lambda

In [12]:
# generate an author name abbreviation: first + middle name initial + last name

# a function can have 0 to many inputs and 0 to many outputs
def abbr(name):
    """Abbreviate an author name to initials plus last name.

    Every whitespace-separated token except the last is reduced to its
    upper-cased first character; the last token is title-cased and appended
    after a single space, e.g. 'W. Bruce Croft' -> 'WB Croft'.

    A name with no tokens (empty or whitespace-only) is returned stripped.
    """
    tokens = name.split()
    if not tokens:
        return name.strip()
    *given, family = tokens
    initials = ''.join(token[0].upper() for token in given)
    # strip() removes the leading space left over when there are no given names
    return (initials + ' ' + family.title()).strip()

In [13]:
abbr('W. Bruce Croft')

'WB Croft'

In [14]:
# A lambda is handy for a simple function that fits on one line and needs little control logic.

# The following lambda returns an abbreviated, APA-like citation for an article:
# the last word of the first author's name, followed by 'et al.' and the year in parentheses.
refabbr = lambda x: '{} et al. ({})'.format(x['authors'][0].split()[-1], x['year'])

In [None]:
articles[1533074]

In [None]:
refabbr(articles[1533074])

## In-class Exercise (5 min)

Write a function called get_conf_articles:
* input: the raw data (a list of tuples, where each tuple is an article)
* output: a dict where the key is conference and the value is a list of articles published in that conference


In [None]:

def get_conf_articles(data):
    """Group the raw articles by conference.

    Args:
        data: iterable of article tuples; index 1 of each tuple is the conference.

    Returns:
        dict mapping each conference to the list of article tuples published in
        it. Conferences appear in the order they are first seen in `data`, and
        the articles of a conference keep their order from `data`.
    """
    results = {}
    for x in data:
        # setdefault creates the conference's list the first time it is seen,
        # so a single pass over the data is enough
        results.setdefault(x[1], []).append(x)
    return results

get_conf_articles(data)["SIGIR '10"]

## Sorting a list

In [17]:
alist = [3,-2,-4,5]
alist.sort()
alist

[-4, -2, 3, 5]

In [18]:
alist.sort(reverse=True)
alist

[5, 3, -2, -4]

In [19]:
# you can provide a personalized function for sorting too
import math

alist.sort(key=lambda x:math.pow(x,2))  # the lambda function tells the sort function to sort by x squared
alist

[-2, 3, -4, 5]

In [None]:
data.sort(key=lambda x:x[0]) # sort the raw data (a list of tuples) by id
data[0:10]

In [None]:
data.sort(key=lambda x:x[4]) # sort the raw data (a list of tuples) by authors
data[0:10]

In [None]:
data.sort(reverse=True, key=lambda x:x[4]) # sort the raw data (a list of tuples) by authors
data[0:10]

## In-class Exercise (5 min)

Sort the raw data by the number of total authors in an article in a reverse order (from the highest number to the lowest one). Then, get the top 10 articles with the greatest number of authors.

In [None]:
# your solution here

# Order the raw data from the most authors to the fewest, then list the top 10
# articles together with their author counts.
# (A ';'-separated string has one more name than it has ';' separators.)
data.sort(key=lambda art: art[4].count(';') + 1, reverse=True)
[ (art, art[4].count(';') + 1) for art in data[:10] ]

## Problem 1 (Week 2 Homework)

Create a function get_year_conf_arts:
* input: the raw data
* output: a dict where the key is year and the value is another dict (key is a conference, value is a list of articles published in that conference)



In [23]:
def get_year_conf_arts(data):
    """Group the raw articles by year, then by conference.

    Args:
        data: iterable of article tuples; index 1 of each tuple is the
            conference and index 2 is the year.

    Returns:
        dict mapping each year to another dict, which maps each conference to
        the list of article tuples published in that conference in that year.
        Years and conferences appear in the order they are first seen in
        `data`; articles keep their order from `data`.
    """
    results = {}
    for x in data:  # for each article in the dataset
        conf = x[1]  # the article's conference
        year = x[2]  # the article's year
        # create the year's dict and the conference's list the first time they
        # are seen, then append the article
        results.setdefault(year, {}).setdefault(conf, []).append(x)
    return results

In [24]:
year_conf_arts = get_year_conf_arts(data)

# let's take a look the list of articles published in year 2010 in the conference called "SIGIR '10"
year_conf_arts[2010]["SIGIR '10"]

[(1835510,
  "SIGIR '10",
  2010,
  'Incorporating post-click behaviors into a click model',
  'Feimin Zhong;Dong Wang;Gang Wang;Weizhu Chen;Yuchen Zhang;Zheng Chen;Haixun Wang'),
 (1835498,
  "SIGIR '10",
  2010,
  'Generalized syntactic and semantic models of query reformulation',
  'Amac Herdagdelen;Massimiliano Ciaramita;Daniel Mahler;Maria Holmqvist;Keith Hall;Stefan Riezler;Enrique Alfonseca'),
 (1835566,
  "SIGIR '10",
  2010,
  'iCollaborate: harvesting value from enterprise web usage',
  'Ajinkya Kale;Thomas Burris;Bhavesh Shah;T L Prasanna Venkatesan;Lakshmanan Velusamy;Manish Gupta;Melania Degerattu'),
 (1835502,
  "SIGIR '10",
  2010,
  'Temporally-aware algorithms for document classification',
  'Thiago Salles;Leonardo Rocha;Gisele L. Pappa;Fernando Mourão;Wagner Meira, Jr.;Marcos Gonçalves'),
 (1835466,
  "SIGIR '10",
  2010,
  'Caching search engine results over incremental indices',
  'Roi Blanco;Edward Bortnikov;Flavio Junqueira;Ronny Lempel;Luca Telloli;Hugo Zaragoza'

## Problem 2

Create a function count_num_coauthors:
* input: the raw data
* output: a dict where the key is an author (name), and the value is the total number of unique co-authors for that author across all articles.

For example, for the following article:

'The use of phrases and structured queries in information retrieval' by 'W. Bruce Croft;Howard R. Turtle;David D. Lewis'

'W. Bruce Croft' has two co-authors: 'Howard R. Turtle' and 'David D. Lewis'

In [25]:
def count_num_coauthors(data):
    """Count the unique co-authors of every author in the raw data.

    Args:
        data: iterable of article tuples; index 4 of each tuple is the
            ';'-separated string of author names.

    Returns:
        dict mapping each author name to the number of distinct other names
        the author shares at least one article with. Authors who only appear
        on single-author articles are included with a count of 0.
    """
    au_coaus = {}  # author -> set of that author's co-authors
    for x in data:  # for each article
        # empty tokens (from an empty author field or a stray ';') are not names
        authors = [name for name in x[4].split(';') if name]
        unique = set(authors)
        for au in authors:
            # register the author even when there is nobody else on the article,
            # so that solo authors end up with a count of 0
            au_coaus.setdefault(au, set()).update(unique - {au})
    return {au: len(coaus) for au, coaus in au_coaus.items()}

In [26]:
num_coauthors = count_num_coauthors(data)

In [None]:
num_coauthors