## The raw data is a list of tuples:
* each tuple represents an article's id, conference, year, title, authors

In [None]:
import numpy as np
import pandas as pd
from google.colab import files

#LINK TO DATASET - https://github.com/dev7saxena/LIS875/blob/main/Data/875_week2_acm_articles.csv

uploaded = files.upload()

Saving 875_week2_acm_articles.csv to 875_week2_acm_articles.csv


In [None]:
data = pd.read_csv('875_week2_acm_articles.csv', header=0, keep_default_na=False).values.tolist()

In [None]:
data = [ (x[0], x[1], x[2], x[3], x[4]) for x in data ]

In [None]:
# The raw data is stored as a list of tuples.

type(data)

list

In [None]:
# it includes 277,933 articles' metadata in total

len(data)

277933

In [None]:
# an example article's information (the one at index 3): id, conference, year, title, authors (separated using ;)

data[3]

(1526756,
 "WWW '09",
 2009,
 'Visual diversification of image search results',
 'Reinier H. van Leuken;Lluis Garcia;Ximena Olivares;Roelof van Zwol')

## Create a dict:
* key: each article's id
* value: an article (stored also as a dict)

In [None]:
# it's more informative if we represent an article's information as a dict (instead of a list), e.g.:

{ 'id': data[0][0], 'conf': data[0][1], 'year': data[0][2], 'title': data[0][3], 'authors': data[0][4] }

In [None]:
# let's do it better by splitting the author names into a list

{ 'id': data[0][0], 'conf': data[0][1], 'year': data[0][2], 'title': data[0][3], 'authors': data[0][4].split(';') }

In [None]:
# okay, now let's transform the whole data list into a list of dicts

[ { 'id': x[0], 'conf': x[1], 'year': x[2], 'title': x[3], 'authors': x[4].split(';') } for x in data ]

In [None]:
# now let's make it better by organizing the articles from a data list into a dict, where the key is each article's id, and the value is a dict representing the article

articles = {
    row[0]: {
        'id': row[0],
        'conf': row[1],
        'year': row[2],
        'title': row[3],
        'authors': row[4].split(';'),  # "A;B;C" -> ['A', 'B', 'C']
    }
    for row in data
}
articles

In [None]:
# now we can access each article from the dict by the article's id (instead of using list index)

articles[1531840]

In [None]:
# now we can access an article's information using informative key names (instead of using list index)

articles[1531840]['title']

In [None]:
# now we can access an article's information using informative key names (instead of using list index)

articles[1531840]['authors']

## Create a dict:
* key: a conference
* value: a list of articles published in that conference

In [None]:
# get all the conferences

[x['conf'] for x in articles.values()]

In [None]:
# let's remove duplicate ones by resolving them into a set

set([x['conf'] for x in articles.values()])

In [None]:
# 5977 unique conferences in total

len(set([x['conf'] for x in articles.values()]))

In [None]:
# now let's write a quick dict comprehension (with a list comprehension nested inside) ...

{ conf:[ x for x in articles.values() if x['conf']==conf ] for conf in set([x['conf'] for x in articles.values()]) }

# wait ... why does it take so long to finish ????? omg!!!!
# the inner list comprehension rescans every article once per unique conference
# let's stop running the loop and think about what's wrong

In [None]:
# now let's look into the loops to figure out the reason ...
# the above dict comprehension is equivalent to the following nested loops ...

tmp = {}

for conf in set([x['conf'] for x in articles.values()]): # outer loop: 5977 unique conferences
    tmp[conf] = []
    for x in articles.values(): # inner loop: all 277933 articles, repeated for every conference
        if x['conf']==conf:
            tmp[conf].append(x)

# 5977 * 277933 = 1,661,205,541 comparisons in total
# that's why it takes so long to finish ...

In [None]:
# a better solution: create one empty list per conference, then scan the articles only once

conf_articles = { conf: [] for conf in { art['conf'] for art in articles.values() } }  # 5977 items

for article in articles.values():  # 277933 items
    conf_articles[article['conf']].append(article)

In [None]:
# now let's count how many articles were published in each conference

[ (conf, len(papers)) for conf, papers in conf_articles.items() ]

## In-class Exercise (10 min)

Create a dict:
* key: an author (let's not consider the case that two different authors have the same name)
* value: a list of articles the author has published

Then, count the number of articles each author has published.

Get the top 50 most productive authors (by the number of articles they have published).

In [None]:
# store the results into author_articles

author_articles = {}

In [None]:
# step 1: get the set of unique author names
allauthors = { au for article in articles.values() for au in article['authors'] }
len(allauthors)

In [None]:
# step 2: scan the data once and create the dict
# (every author starts with an empty list; each article is appended to the list of each of its authors)
author_articles = dict( (au, []) for au in allauthors )
for article in articles.values():
    for au in article['authors']:
        author_articles[au].append(article)


In [None]:
# counting the number of articles each author has published

# (author, number of articles) pairs, most productive authors first
numarts = sorted(
    ( (au, len(arts)) for au, arts in author_articles.items() ),
    key=lambda pair: pair[1],
    reverse=True,
)
numarts[:50]

## In-class Exercise (10 min)

Now let's create an even more complex dict:
* key: a year
* value: a dict whose key is an author and whose value is the set of conferences in which the author published at least one article that year

Then, find the 50 authors who published in the greatest number of different conferences in 2010.

In [None]:
# your solution here

allyears = { article['year'] for article in articles.values() }

# one (initially empty) author -> set-of-conferences dict per year
year_author_confs = { year: {} for year in allyears }
for article in articles.values():
    confs_by_author = year_author_confs[article['year']]
    for au in article['authors']:
        # setdefault creates the author's set the first time the author is seen in that year
        confs_by_author.setdefault(au, set()).add(article['conf'])

# rank the 2010 authors by the number of distinct conferences they published in
stats2010 = sorted(
    ( (au, len(confs)) for au, confs in year_author_confs[2010].items() ),
    key=lambda pair: pair[1],
    reverse=True,
)
stats2010[:50]

## List vs. Set

If you need to repeatedly check whether an item is in a collection of items, use a set instead of a list. A list is much slower for the `in` operation.

Let's look at the following example. Here we divide the articles into those published in the SIGIR conference and those published in other conferences. Then, we further extract the SIGIR articles whose authors never published in any other conference.

TL;DR
* item in list: O(n)
* item in set: O(1)

In [None]:
# We can check if an item is within a collection of items using "in". This works for both list and set (and other collections).

1 in [4,2,3,1,5]

In [None]:
1 in {4,2,3,1,5}

In [None]:
# divide the articles into two parts: those published in SIGIR, and those in other conferences
# (an article counts as SIGIR when its conference name starts with 'SIGIR')

articles_sigir = { art_id: art for art_id, art in articles.items() if art['conf'].startswith('SIGIR') }
articles_other = { art_id: art for art_id, art in articles.items() if not art['conf'].startswith('SIGIR') }

articles_sigir

In [None]:
print(len(articles_sigir), len(articles_other))

In [None]:
# Let's get the list of authors who published in conferences other than SIGIR.
# Let's store it as a set first.

names_exclude = set( [ au for x in articles_other.values() for au in x['authors'] ] )
len(names_exclude)

In [None]:
# The following extracts the list of SIGIR articles none of whose authors ever published in other conferences.
# For each article we count its authors that appear in names_exclude and keep the article only when that count is 0.

[ x for x in articles_sigir.values() if np.sum( [ 1 for au in x['authors'] if au in names_exclude ] ) == 0 ]

In [None]:
# Let's take a look at how fast it is.

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring short durations;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
[ x for x in articles_sigir.values() if np.sum( [ 1 for au in x['authors'] if au in names_exclude ] ) == 0 ]
end = time.perf_counter()

# elapsed time in seconds
print(end - start)

In [None]:
import time

# Now let's use a list to store the names to exclude. It takes much longer to finish,
# because every "au in names_excludelist" check has to scan the list.
names_excludelist = list(names_exclude)

# perf_counter() is a monotonic, high-resolution clock meant for measuring short durations;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
[ x for x in articles_sigir.values() if np.sum( [ 1 for au in x['authors'] if au in names_excludelist ] ) == 0 ]
end = time.perf_counter()

# elapsed time in seconds
print(end - start)

## Function and lambda

In [None]:
# generate an author name abbreviation: first + middle name initial + last name

# a function can have 0 to many inputs and 0 to many outputs
def abbr(name):
    """Abbreviate an author name to initials plus last name.

    Every whitespace-separated part except the last is reduced to its
    upper-cased first character; the last part is title-cased.
    Example: 'W. Bruce Croft' -> 'WB Croft'.

    name: the full author name (a string).
    returns: the abbreviated name with surrounding whitespace removed;
        a name with no parts (empty or whitespace only) comes back as ''.
    """
    parts = name.split()
    if not parts:
        # nothing to abbreviate: hand back the (stripped) input
        return name.strip()
    initials = ''.join(part[0].upper() for part in parts[:-1])
    # strip() removes the leading space left over when there are no initials
    return (initials + ' ' + parts[-1].title()).strip()

In [None]:
abbr('W. Bruce Croft')

In [None]:
# we can use lambda for simple functions that can be finished within one line without too much logic control

# the following lambda defines a function that returns the APA-style abbreviated form of a reference:
# the last word of the first author's name, followed by ' et al. (<year>)'
refabbr = lambda x : ''.join([ x['authors'][0].split()[-1], ' et al. (', str(x['year']), ')' ])

In [None]:
articles[1533074]

In [None]:
refabbr(articles[1533074])

## In-class Exercise (5 min)

Write a function called get_conf_articles:
* input: the raw data (a list of tuples, where each tuple is an article)
* output: a dict where the key is conference and the value is a list of articles published in that conference


In [None]:

def get_conf_articles(data):
    """Group raw article tuples by conference.

    data: an iterable of article tuples; only position 1 (the conference)
        is read. Any iterable works, including one-shot iterators and
        generators, because the input is traversed exactly once.
    returns: a dict mapping each conference to the list of article tuples
        published in it. Conferences appear in order of first occurrence
        and each list keeps the input order.
    """
    results = {}
    for article in data:
        # setdefault creates the conference's list the first time it is seen,
        # so no separate pass is needed to collect the unique conferences
        results.setdefault(article[1], []).append(article)
    return results

get_conf_articles(data)["SIGIR '10"]

## Sorting a list

In [None]:
alist = [3,-2,-4,5]
alist.sort()
alist

In [None]:
alist.sort(reverse=True)
alist

In [None]:
# you can provide a personalized function for sorting too
import math

alist.sort(key=lambda x:math.pow(x,2))  # the lambda function tells the sort function to sort by x squared
alist

In [None]:
data.sort(key=lambda x:x[0]) # sort the raw data (a list of tuples) by id
data[0:10]

In [None]:
data.sort(key=lambda x:x[4]) # sort the raw data (a list of tuples) by authors
data[0:10]

In [None]:
data.sort(reverse=True, key=lambda x:x[4]) # sort the raw data (a list of tuples) by authors
data[0:10]

## In-class Exercise (5 min)

Sort the raw data by the total number of authors of each article in descending order (from the highest number to the lowest). Then, get the top 10 articles with the greatest number of authors.

In [None]:
# your solution here

# authors are joined by ';', so an article has (number of ';') + 1 authors
data.sort(key=lambda row: row[4].count(';') + 1, reverse=True)
[ (row, row[4].count(';') + 1) for row in data[:10] ]