# Data Visualization

Load institute information from [us_institutes.tsv](https://github.com/elitcloud/nlp-ranking/blob/master/dat/us_institutes.tsv):

In [12]:
from types import SimpleNamespace

def load_institutes(institute_file):
    """
    Load institute records from a tab-separated file.

    Each non-blank line must hold at least four tab-separated columns, read
    in this order: name, URL, city, state. Surrounding whitespace is stripped
    from every column.

    :param institute_file: path to us_institutes.tsv.
    :return: a dictionary whose key is the institute URL and the value is a
             SimpleNamespace with the fields name, url, city, state and
             score (initialized to 0.0).
    :raises IndexError: if a non-blank line has fewer than four columns.
    """
    institutes = {}

    # The context manager closes the file even when a malformed row raises.
    with open(institute_file) as fin:
        for line in fin:
            # Blank lines (e.g., a trailing newline at the end of the file)
            # carry no record; skip them instead of failing on l[1].
            if not line.strip():
                continue

            l = [column.strip() for column in line.split('\t')]
            institutes[l[1]] = SimpleNamespace(name=l[0], url=l[1], city=l[2], state=l[3], score=0.0)

    return institutes

In [13]:
# NOTE: absolute path on the author's machine; point it at your local copy of us_institutes.tsv.
INSTITUTE_FILE = '/Users/jdchoi/Git/nlp-ranking/dat/us_institutes.tsv'
# inst_map is keyed by institute URL; later cells update the score field of its values.
inst_map = load_institutes(INSTITUTE_FILE)
# Number of distinct institute URLs loaded.
print(len(inst_map))

1925


Load (institute, score) pairs from [email_map.tsv](https://github.com/elitcloud/nlp-ranking/blob/master/dat/email_map.tsv):

### Exercise

```python
def load_scores(email_file):
    """
    :param email_file: email_map.tsv. 
    :return: a dictionary whose key is the publication ID and the value is the list of (domain, score) pairs.
    """
```

In [14]:
def load_scores(email_file):
    """
    Load the (domain, score) pairs of each publication from a tab-separated file.

    The first column of a line is the publication ID and the last column is
    either '_' (no scores for that publication) or a ';'-separated list of
    'domain:score' entries.

    :param email_file: path to email_map.tsv.
    :return: a dictionary whose key is the publication ID and the value is the
             list of (domain, score) pairs; publications whose last column is
             '_' are omitted.
    :raises IndexError: if a non-empty entry has no ':' separator.
    :raises ValueError: if a score cannot be converted to float.
    """
    pub_scores = {}

    # The context manager closes the file even when a malformed entry raises.
    with open(email_file) as fin:
        for line in fin:
            # Blank lines (e.g., at the end of the file) carry no record.
            if not line.strip():
                continue

            l = [column.strip() for column in line.split('\t')]

            # '_' in the last column marks a publication without scores.
            if l[-1] == '_':
                continue

            scores = []
            for s in l[-1].split(';'):
                # A stray ';' (e.g., a trailing one) yields an empty entry; ignore it.
                if not s.strip():
                    continue
                t = s.split(':')
                scores.append((t[0], float(t[1])))

            pub_scores[l[0]] = scores

    return pub_scores

In [15]:
# NOTE: absolute path on the author's machine; point it at your local copy of email_map.tsv.
EMAIL_FILE = '/Users/jdchoi/Git/nlp-ranking/dat/email_map.tsv'
# score_map maps each publication ID to its list of (domain, score) pairs.
score_map = load_scores(EMAIL_FILE)
# Number of publications that have at least a score entry.
print(len(score_map))

15412


Given `inst_map` and `score_map`, measure the total score of each institute:

### Exercise

```python
def measure_scores(inst_map, score_map):
    """
    :param inst_map: the output of load_institutes().
    :param score_map: the output of load_scores().
    :return: a list of institute namespaces where the score field contains the total score of that institute from their publications. 
    """
```

In [None]:
def measure_scores(inst_map, score_map):
    """
    First attempt: credit a score only when the domain is exactly a key of inst_map.

    Sub-domains (e.g., a department host under an institute's domain) are not
    matched by this version.

    :param inst_map: the output of load_institutes(); the score fields are updated in place.
    :param score_map: the output of load_scores().
    :return: a list of all institute namespaces in inst_map, including those whose score stayed unchanged.
    """
    for pairs in score_map.values():
        for domain, score in pairs:
            if domain in inst_map:
                inst_map[domain].score += score

    # The return statement must be indented into the function body; at column 0
    # it is a SyntaxError ('return' outside function).
    return list(inst_map.values())
            
        

In [16]:
def measure_scores(inst_map, score_map):
    """
    Measure the total score of each institute from the publication scores.

    A domain is credited to the institute whose URL equals one of its
    dot-separated suffixes of at least two labels; suffixes are tried from the
    shortest (the last two labels) to the full domain, and the first match wins.

    :param inst_map: the output of load_institutes(); the score fields are reset and then updated in place.
    :param score_map: the output of load_scores().
    :return: a list of institute namespaces where the score field contains the total score of that institute
             from their publications; institutes whose total is not positive are left out.
    """
    # Start every institute from zero so that calling this function again
    # (e.g., re-running the notebook cell) does not double count.
    for inst in inst_map.values():
        inst.score = 0.0

    for pairs in score_map.values():
        for domain, score in pairs:
            labels = domain.split('.')
            # i walks from the second-to-last label down to 0, so uid grows
            # from the last two labels up to the whole domain.
            for i in range(len(labels) - 2, -1, -1):
                uid = '.'.join(labels[i:])
                if uid in inst_map:
                    inst_map[uid].score += score
                    break

    return [inst for inst in inst_map.values() if inst.score > 0.0]

In [17]:
# measure_scores() updates the score fields of inst_map in place and returns the scored institutes.
inst_scores = measure_scores(inst_map, score_map)
# Number of institutes with a positive total score.
print(len(inst_scores))

194


Measure the total score of each state:

### Exercise

```python
def measure_state_scores(inst_scores):
    """
    :param inst_scores: the output of measure_scores().
    :return: a dictionary where the key is the State ID and the value is the total score of that state w.r.t. their publications.
    """
```

In [18]:
def measure_state_scores(inst_scores):
    """
    Sum the institute scores state by state.

    :param inst_scores: the output of measure_scores().
    :return: a dictionary where the key is the State ID and the value is the total score of that state w.r.t. their publications.
    """
    totals = {}

    for institute in inst_scores:
        key = institute.state
        # A state seen for the first time starts from 0.
        if key in totals:
            subtotal = totals[key]
        else:
            subtotal = 0
        totals[key] = subtotal + institute.score

    return totals

In [20]:
state_scores = measure_state_scores(inst_scores)
# Print one "['STATE', score]," row per state, sorted by state ID, in the
# array-literal form that can be pasted into the chart's data table.
for k, v in sorted(state_scores.items()):
    print("['%s', %f]," % (k, v))

['AL', 14.658440],
['AZ', 32.661902],
['CA', 432.101051],
['CO', 89.017857],
['CT', 13.933334],
['DC', 54.705303],
['DE', 25.225000],
['FL', 17.666664],
['GA', 40.733333],
['HI', 2.333333],
['IA', 3.650000],
['ID', 1.333333],
['IL', 186.750252],
['IN', 64.154763],
['KS', 0.500000],
['KY', 1.000000],
['LA', 2.533334],
['MA', 258.033364],
['MD', 268.735340],
['MI', 70.363095],
['MN', 24.151189],
['MO', 2.999999],
['NC', 43.504762],
['ND', 0.500000],
['NE', 2.833333],
['NH', 5.083332],
['NJ', 33.358330],
['NM', 2.000000],
['NY', 471.839777],
['OH', 52.652378],
['OR', 21.564718],
['PA', 610.323793],
['RI', 24.033333],
['SC', 0.500000],
['TN', 13.638889],
['TX', 219.466294],
['UT', 45.733333],
['VA', 17.959920],
['VT', 2.500000],
['WA', 151.955949],
['WI', 23.100000],


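Paste the output of `measure_state_scores` into [geochart.html](https://github.com/emory-courses/data-science/blob/master/src/geochart.html), as rows after the `['State', 'Publication Scores']` header row (add a comma after the header row):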
```html
<html>
  <head>
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">
      google.charts.load('current', {
        'packages':['geochart'],
        // Note: you will need to get a mapsApiKey for your project.
        // See: https://developers.google.com/chart/interactive/docs/basic_load_libs#load-settings
        'mapsApiKey': 'AIzaSyD-9tSrke72PouQMnMX-a7eZSW0jkFMBWY'
      });
      google.charts.setOnLoadCallback(drawRegionsMap);

      // Callback registered with google.charts.setOnLoadCallback: builds the
      // data table and draws the GeoChart into the 'regions_div' element.
      function drawRegionsMap() {
        var data = google.visualization.arrayToDataTable([
        // Header row. Paste the "['STATE', score]," rows printed by
        // measure_state_scores after it; the header row then needs a trailing comma.
        ['State', 'Publication Scores']
        ]);

        // Chart options: 900x500 chart with region "US" and resolution "provinces".
        var options = {width: 900, height: 500, region: "US", resolution: "provinces"};
        var chart = new google.visualization.GeoChart(document.getElementById('regions_div'));
        chart.draw(data, options);
      }
    </script>
  </head>
  <body>
    <div id="regions_div"></div>
  </body>
</html>
```

### Exercise

```python
def measure_scores(inst_map, score_map, start_year, end_year):
    """
    :param inst_map: the output of load_institutes().
    :param score_map: the output of load_scores().
    :param start_year: starting year of the publications.
    :param end_year: ending year of the publications.
    :return: a list of institute namespaces where the score field contains the total score of that institute from their publications.
    """
```

Measure the score of each institute within a certain state:

### Exercise

```python
def measure_inst_scores_by_state(inst_scores, state):
    """
    :param inst_scores: the output of measure_scores(). 
    :param state: the state ID (e.g., GA).
    :return: a dictionary where the key is the institute name and the value is the total score of that institute. 
    """
```

In [23]:
def measure_inst_scores_by_state(inst_scores, state):
    """
    Collect the total score of every institute located in one state.

    :param inst_scores: the output of measure_scores().
    :param state: the state ID (e.g., GA).
    :return: a dictionary where the key is the institute name and the value is the total score of that institute.
    """
    by_name = {}

    for institute in inst_scores:
        # Institutes from other states do not contribute.
        if state != institute.state:
            continue

        # Entries sharing a name are summed; a new name starts from 0.
        if institute.name in by_name:
            subtotal = by_name[institute.name]
        else:
            subtotal = 0
        by_name[institute.name] = subtotal + institute.score

    return by_name

In [25]:
scores = measure_inst_scores_by_state(inst_scores, 'CA')
# Print one "['NAME', score]," row per institute, highest score first, in the
# array-literal form that can be pasted into the chart's data table.
for k, v in sorted(scores.items(), key=lambda t: float(t[1]), reverse=True):
    print("['%s', %f]," % (k, v))

['Stanford University', 177.527373],
['Berkeley University of California', 102.651187],
['University of Southern California', 55.055555],
['University of California Santa Cruz', 32.149997],
['University of California San Diego', 19.594047],
['University of California Los Angeles', 17.249084],
['University of California Santa Barbara', 7.571429],
['University of California Irvine', 7.150000],
['Pomona College', 4.500000],
['California Institute of Technology', 1.499999],
['San Diego State University', 1.333333],
['California State University Stanislaus', 1.000000],
['Harvey Mudd College', 1.000000],
['San Francisco State University', 1.000000],
['University of San Francisco', 1.000000],
['University of California Davis', 0.783333],
['Naval Postgraduate School', 0.500000],
['University of California San Francisco', 0.285714],
['California State University Fullerton', 0.250000],


Paste the output of `measure_inst_scores_by_state` into [piechart.html](https://github.com/emory-courses/data-science/blob/master/src/piechart.html), as rows after the `['Institute', 'Score']` header row (add a comma after the header row):
```html
<html>
  <head>
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">
      google.charts.load('current', {'packages':['corechart']});
      google.charts.setOnLoadCallback(drawChart);

      // Callback registered with google.charts.setOnLoadCallback: builds the
      // data table and draws the PieChart into the 'piechart' element.
      function drawChart() {

        var data = google.visualization.arrayToDataTable([
          // Header row. Paste the "['NAME', score]," rows printed by
          // measure_inst_scores_by_state after it; the header row then needs a trailing comma.
          ['Institute', 'Score']
          ]);

        var options = {
          title: 'Publication Scores'
        };

        var chart = new google.visualization.PieChart(document.getElementById('piechart'));

        chart.draw(data, options);
      }
    </script>
  </head>
  <body>
    <div id="piechart" style="width: 900px; height: 500px;"></div>
  </body>
</html>
```