# Data Visualization

Load institute information from [us_institutes.tsv](https://github.com/elitcloud/nlp-ranking/blob/master/dat/us_institutes.tsv):

In [12]:
from types import SimpleNamespace

def load_institutes(institute_file):
    """
    Load institute records from a tab-separated file.

    Every non-blank line must provide at least four tab-separated columns, in this order:
    name, url, city, state. Surrounding whitespace is stripped from each column.

    :param institute_file: path to us_institutes.tsv.
    :return: a dictionary whose key is the institute URL (2nd column) and the value is a
             SimpleNamespace with the fields name, url, city, state, and score (initialized to 0.0).
             If the same URL appears more than once, the last line wins.
    """
    d = {}

    # The context manager closes the file even when a malformed line raises.
    with open(institute_file, encoding='utf-8') as fin:
        for line in fin:
            l = list(map(str.strip, line.split('\t')))

            # Skip blank lines (e.g., an empty line at the end of the file) instead of
            # failing with an IndexError on the column lookups below.
            if not any(l):
                continue

            d[l[1]] = SimpleNamespace(name=l[0], url=l[1], city=l[2], state=l[3], score=0.0)

    return d

In [13]:
# NOTE(review): absolute path specific to the author's machine; point it at your local
# copy of us_institutes.tsv before running this cell.
INSTITUTE_FILE = '/Users/jdchoi/Git/nlp-ranking/dat/us_institutes.tsv'
# Institute URL -> institute namespace, as returned by load_institutes().
inst_map = load_institutes(INSTITUTE_FILE)
# Number of distinct institute URLs that were loaded.
print(len(inst_map))

1925


Load (institute, score) pairs from [email_map.tsv](https://github.com/elitcloud/nlp-ranking/blob/master/dat/email_map.tsv):

### Exercise

```python
def load_scores(email_file):
    """
    :param email_file: email_map.tsv. 
    :return: a dictionary whose key is the publication ID and the value is the list of (domain, score) pairs.
    """
```

In [14]:
def load_scores(email_file):
    """
    Load the (domain, score) pairs of each publication from a tab-separated file.

    The first column of a line is used as the publication ID and the last column holds the
    scores, formatted as `domain:score` entries separated by `;`. Lines whose last column is
    `_` (or empty, including blank lines) carry no scores and are skipped.

    :param email_file: email_map.tsv.
    :return: a dictionary whose key is the publication ID and the value is the list of (domain, score) pairs.
    """
    d = {}

    # The context manager closes the file even when a malformed line raises.
    with open(email_file, encoding='utf-8') as fin:
        for line in fin:
            l = list(map(str.strip, line.split('\t')))

            # '_' marks a publication without scores; '' covers blank lines and an empty
            # last column, which would otherwise fail with an IndexError below.
            if l[-1] in ('', '_'):
                continue

            scores = []
            d[l[0]] = scores

            for s in l[-1].split(';'):
                # Tolerate a trailing or doubled ';' that yields an empty entry.
                if not s.strip():
                    continue
                t = s.split(':')
                scores.append((t[0], float(t[1])))

    return d

In [15]:
# NOTE(review): absolute path specific to the author's machine; point it at your local
# copy of email_map.tsv before running this cell.
EMAIL_FILE = '/Users/jdchoi/Git/nlp-ranking/dat/email_map.tsv'
# Publication ID -> list of (domain, score) pairs, as returned by load_scores().
score_map = load_scores(EMAIL_FILE)
# Number of publications that have at least a score entry.
print(len(score_map))

15412


Given `inst_map` and `score_map`, measure the total score of each institute:

### Exercise

```python
def measure_scores(inst_map, score_map):
    """
    :param inst_map: the output of load_institutes().
    :param score_map: the output of load_scores().
    :return: a list of institute namespaces where the score field contains the total score of that institute from their publications. 
    """
```

In [16]:
def measure_scores(inst_map, score_map):
    """
    Measure the total score of each institute from the publication scores.

    Each email domain is matched against the keys of `inst_map` by trying its suffixes from the
    shortest (the last two labels, e.g., `emory.edu`) to the longest (the full domain); the first
    suffix found in `inst_map` receives the score. Domains that match no institute are ignored.

    The `score` field of every namespace in `inst_map` is overwritten with the newly computed
    total, so calling this function repeatedly (e.g., re-running the cell) gives the same result
    instead of accumulating on top of the previous run.

    :param inst_map: the output of load_institutes().
    :param score_map: the output of load_scores().
    :return: a list of institute namespaces where the score field contains the total score of that institute from their publications.
             Only institutes with a total score greater than 0 are included.
    """
    totals = {}

    for pairs in score_map.values():
        for domain, score in pairs:
            d = domain.split('.')
            # Start at the last two labels and grow the suffix towards the full domain.
            for i in range(len(d)-2, -1, -1):
                uid = '.'.join(d[i:])
                if uid in inst_map:
                    totals[uid] = totals.get(uid, 0.0) + score
                    break

    # Assign (rather than add to) the score so that stale values from earlier calls are discarded.
    for url, inst in inst_map.items():
        inst.score = totals.get(url, 0.0)

    return [inst for inst in inst_map.values() if inst.score > 0.0]

In [17]:
inst_scores = measure_scores(inst_map, score_map)
print(len(inst_scores))

194


Measure the total score of each state:

### Exercise

```python
def measure_state_scores(inst_scores):
    """
    :param inst_scores: the output of measure_scores().
    :return: a dictionary where the key is the State ID and the value is the total score of that state w.r.t. their publications.
    """
```

In [18]:
def measure_state_scores(inst_scores):
    """
    Sum the institute scores per state.

    :param inst_scores: the output of measure_scores().
    :return: a dictionary where the key is the State ID and the value is the total score of that state w.r.t. their publications.
    """
    totals = {}

    for institute in inst_scores:
        state_id = institute.state
        # Register a state at 0 on first sight, then accumulate onto it.
        totals.setdefault(state_id, 0)
        totals[state_id] = totals[state_id] + institute.score

    return totals

In [20]:
state_scores = measure_state_scores(inst_scores)
# Emit one "['STATE', score]," row per state, ordered by State ID (keys are unique,
# so sorting the keys gives the same order as sorting the items).
for state in sorted(state_scores):
    print("['%s', %f]," % (state, state_scores[state]))

['AL', 14.658440],
['AZ', 32.661902],
['CA', 432.101051],
['CO', 89.017857],
['CT', 13.933334],
['DC', 54.705303],
['DE', 25.225000],
['FL', 17.666664],
['GA', 40.733333],
['HI', 2.333333],
['IA', 3.650000],
['ID', 1.333333],
['IL', 186.750252],
['IN', 64.154763],
['KS', 0.500000],
['KY', 1.000000],
['LA', 2.533334],
['MA', 258.033364],
['MD', 268.735340],
['MI', 70.363095],
['MN', 24.151189],
['MO', 2.999999],
['NC', 43.504762],
['ND', 0.500000],
['NE', 2.833333],
['NH', 5.083332],
['NJ', 33.358330],
['NM', 2.000000],
['NY', 471.839777],
['OH', 52.652378],
['OR', 21.564718],
['PA', 610.323793],
['RI', 24.033333],
['SC', 0.500000],
['TN', 13.638889],
['TX', 219.466294],
['UT', 45.733333],
['VA', 17.959920],
['VT', 2.500000],
['WA', 151.955949],
['WI', 23.100000],


In [None]:
Paste the rows printed above (the per-state totals computed by `measure_state_scores`) into the data table of the visualization.