In [2]:
import pandas as pd
import numpy as np
import json
import pprint # Pretty Printer - Imprime datos de estructuras de datos complejos de forma agradable.
from collections import Counter

In [1]:
# %pip install pandas  # run once if pandas is not installed in the kernel's environment



In [3]:
# Load the sample of candidate news records into a Python list.
# The encoding is given explicitly: JSON text is UTF-8, while open() without
# an encoding falls back to the platform default, which is not UTF-8 everywhere
# and would break on the non-ASCII characters present in some titles.
with open('data/allcandidatenewssample.json', encoding='utf-8') as f:
    candidatos = json.load(f)

In [4]:
# Number of top-level items in the loaded JSON data.
len(candidatos)

60000

In [16]:
# Pretty-print two consecutive records (indices 2 and 3) to compare their keys.
pprint.pprint(candidatos[2:4])

[{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-08-08 06:00:00',
  'domain': 'thehill.com',
  'panel_position': 1,
  'query': 'Bernie Sanders',
  'source': 'TheHill',
  'story_position': 7,
  'time': '15 hours ago',
  'title': "Sanders responds to de Blasio's invitation to play for the Nets | "
           'TheHill',
  'url': 'https://thehill.com/homenews/campaign/456579-sanders-responds-to-de-blasio-invitation-to-play-for-nets-ill-be-occupied'}]


Si nos fijamos en la salida del código anterior, tenemos 2 objetos candidato en `candidatos[2:4]`, 
el primero es {'date': '2019-09-11 18:00:00', 'reason': 'Not collected'}, este objeto tiene dos atributos.

1. _date_
2. _reason_

El atributo _date_ y el atributo _reason_.   
Mientras que el segundo objeto tiene nueve atributos. 
1. _date_
2. _domain_
3. _panel\_position_
4. _query_
5. _source_
6. _story\_position_
7. _time_
8. _title_
9. _url_

```python
{'date': '2019-08-08 06:00:00',
 'domain': 'thehill.com',
 'panel_position': 1,
 'query': 'Bernie Sanders',
 'source': 'TheHill',
 'story_position': 7,
 'time': '15 hours ago',
 'title': "Sanders responds to de Blasio's invitation to play for the Nets | "
          'TheHill',
 'url': 'https://thehill.com/homenews/campaign/456579-sanders-responds-to-de-blasio-invitation-to-play-for-nets-ill-be-occupied'}
```


### XML

Un ejemplo de un candidato con dos atributos en formato XML se vería así

```xml
<candidato>
    <date>
        2019-09-11 18:00:00
    </date>
    <reason>
        Not collected
    </reason>
</candidato>
```

In [6]:
# Title of the first record, printed through pprint.
pprint.pprint(candidatos[0]['title'])

'Bloomberg cuts ties with company using prison inmates to make campaign calls'


In [8]:
# A bare last expression is displayed by the notebook, so a single string needs no pprint.
candidatos[0]['title']

'Bloomberg cuts ties with company using prison inmates to make campaign calls'

In [13]:
# Tally the records by how many keys each one has, to see which record shapes exist.
Counter(map(len, candidatos))

Counter({9: 57202, 2: 2382, 10: 416})

In [19]:
# Show the first record that has exactly two keys.
pprint.pprint(
    next(registro for registro in candidatos if len(registro) == 2)
)

{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'}


In [12]:
# Show the first record that has more than nine keys.
pprint.pprint(
    next(registro for registro in candidatos if len(registro) > 9)
)

{'category': 'Satire',
 'date': '2019-08-21 04:00:00',
 'domain': 'politics.theonion.com',
 'panel_position': 1,
 'query': 'John Hickenlooper',
 'source': 'Politics | The Onion',
 'story_position': 8,
 'time': '4 days ago',
 'title': '‘And Then There Were 23,’ Says Wayne Messam Crossing Out '
          'Hickenlooper Photo \n'
          'In Elaborate Grid Of Rivals',
 'url': 'https://politics.theonion.com/and-then-there-were-23-says-wayne-messam-crossing-ou-1837311060'}


Los candidatos que tienen 10 atributos, al parecer, tienen un atributo adicional llamado _category_.
1. _category_
2. _date_
3. _domain_
4. _panel\_position_
5. _query_
6. _source_
7. _story\_position_
8. _time_
9. _title_
10. _url_

```python
{'category': 'Satire',
 'date': '2019-08-21 04:00:00',
 'domain': 'politics.theonion.com',
 'panel_position': 1,
 'query': 'John Hickenlooper',
 'source': 'Politics | The Onion',
 'story_position': 8,
 'time': '4 days ago',
 'title': '‘And Then There Were 23,’ Says Wayne Messam Crossing Out '
          'Hickenlooper Photo \n'
          'In Elaborate Grid Of Rivals',
 'url': 'https://politics.theonion.com/and-then-there-were-23-says-wayne-messam-crossing-ou-1837311060'}
```


In [20]:
# NOTE: the "KeyError: 'source'" shown as this cell's output is expected at this
# point in the notebook: atributos["source"] fails on a record that lacks that key.
politicos = [atributos for atributos in candidatos if atributos["source"]=="Politico"]
len(politicos)

KeyError: 'source'

El problema ocurre porque no hemos filtrado nuestros datos y tenemos objetos candidatos que no tienen el atributo _source_. Entonces tenemos que eliminarlos de nuestra lista.

In [21]:
# Keep only the records that have more than two keys; `candidatos` is rebound
# to the filtered list, so the unfiltered list is no longer reachable by name.
candidatos = [
    registro
    for registro in candidatos
    if len(registro) > 2
]

In [22]:
# Current number of records in `candidatos`.
len(candidatos)

57618

In [24]:
# Sanity check: 60000 original candidate records minus the 2382 records with two attributes.
60000 - 2382

57618

In [25]:
# Select the records whose source is exactly "Politico", then count them.
politicos = [
    registro
    for registro in candidatos
    if registro["source"] == "Politico"
]
len(politicos)

2732

In [26]:
# Pretty-print the first five Politico records.
pprint.pprint(politicos[:5])

[{'date': '2019-05-18 18:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Marianne Williamson',
  'source': 'Politico',
  'story_position': 7,
  'time': '1 week ago',
  'title': 'Marianne Williamson reaches donor threshold for Dem debates',
  'url': 'https://www.politico.com/story/2019/05/09/marianne-williamson-2020-election-1315133'},
 {'date': '2018-12-27 06:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Julian Castro',
  'source': 'Politico',
  'story_position': 1,
  'time': '1 hour ago',
  'title': "O'Rourke and Castro on collision course in Texas",
  'url': 'https://www.politico.com/story/2018/12/27/orourke-julian-castro-collision-texas-election-1073720'},
 {'date': '2019-06-26 18:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Kamala Harris',
  'source': 'Politico',
  'story_position': 8,
  'time': '23 hours ago',
  'title': 'Democrats try out their Spanish on TV as they court Latino voters',
  'url'

Veamos qué valores tiene el atributo _source_ en el total de registros.

In [28]:
# Gather the 'source' value of every record; dict.get returns None if the key is absent.
sources = [registro.get('source') for registro in candidatos]
type(sources)


list

In [29]:
# One entry per record, so this equals len(candidatos).
len(sources)

57618

In [30]:
sources[0:10] # show the first 10 elements

['NBC News',
 'Town & Country Magazine',
 'TheHill',
 'CNBC.com',
 'Fox News',
 'Winona Daily News',
 'Axios',
 'Hollywood Reporter',
 'Breitbart',
 'New York Post']

In [31]:
# The ten most frequent source names together with their counts.
pprint.pprint(Counter(sources).most_common(10))

[('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('TheHill', 2383),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('The Hill', 1342),
 ('New York Post', 1275),
 ('Vox', 941)]


Podemos ver que existen dos valores para el mismo medio: _The Hill_ aparece como __TheHill__ y como __The Hill__. A futuro, esto nos puede llevar a conclusiones erróneas, así que tenemos que unificar el nombre de la fuente.

In [32]:
# Merge the "TheHill" spelling into "The Hill" so both count as a single outlet.
# A direct key lookup replaces the original dict.update(...) fed by a generator
# over the dict's own items(), which scanned every key of every record and
# wrote to the dict while it was being iterated.
for diccionario_candidatos in candidatos:
    if diccionario_candidatos.get("source") == "TheHill":
        diccionario_candidatos["source"] = "The Hill"

In [33]:
# Rebuild the list of sources and show the ten most common values again.
sources = [registro.get('source') for registro in candidatos]
pprint.pprint(Counter(sources).most_common(10))

[('The Hill', 3725),
 ('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('New York Post', 1275),
 ('Vox', 941),
 ('Breitbart', 799)]


## Ahora cargaremos nuestros datos en un DataFrame de pandas

In [34]:
# Build a DataFrame from the list of record dicts: one row per record, one column per key.
df = pd.DataFrame(candidatos)

In [35]:
# Preview the first five rows.
df.head()

Unnamed: 0,title,url,source,time,date,query,story_position,panel_position,domain,category
0,Bloomberg cuts ties with company using prison ...,https://www.nbcnews.com/politics/2020-election...,NBC News,18 hours ago,2019-12-25 10:00:00,Michael Bloomberg,6,1,www.nbcnews.com,
1,Democratic Candidates React to Michael Bloombe...,https://www.townandcountrymag.com/society/poli...,Town & Country Magazine,18 hours ago,2019-11-09 08:00:00,Amy Klobuchar,3,1,www.townandcountrymag.com,
2,Sanders responds to de Blasio's invitation to ...,https://thehill.com/homenews/campaign/456579-s...,The Hill,15 hours ago,2019-08-08 06:00:00,Bernie Sanders,7,1,thehill.com,
3,From Andrew Yang's 'No Tie' look to Bernie Buc...,https://www.cnbc.com/2019/08/02/election-2020-...,CNBC.com,1 day ago,2019-08-04 10:00:00,Andrew Yang,2,1,www.cnbc.com,
4,Liberal billionaire Tom Steyer scolds Dems for...,https://www.foxnews.com/politics/liberal-billi...,Fox News,1 week ago,2019-05-28 00:00:00,Tom Steyer,5,2,www.foxnews.com,


In [36]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

title             object
url               object
source            object
time              object
date              object
query             object
story_position     int64
panel_position    object
domain            object
category          object
dtype: object

In [37]:
# Rename the 'date' column to 'storydate'. The result is reassigned instead of
# using inplace=True, which pandas guidance discourages and which prevents chaining.
df = df.rename(columns={'date': 'storydate'})

In [38]:
# Convert the storydate column to datetime64[ns].
df["storydate"] = df["storydate"].astype("datetime64[ns]")

In [39]:
# Preview the first five rows again to check the storydate column.
df.head()

Unnamed: 0,title,url,source,time,storydate,query,story_position,panel_position,domain,category
0,Bloomberg cuts ties with company using prison ...,https://www.nbcnews.com/politics/2020-election...,NBC News,18 hours ago,2019-12-25 10:00:00,Michael Bloomberg,6,1,www.nbcnews.com,
1,Democratic Candidates React to Michael Bloombe...,https://www.townandcountrymag.com/society/poli...,Town & Country Magazine,18 hours ago,2019-11-09 08:00:00,Amy Klobuchar,3,1,www.townandcountrymag.com,
2,Sanders responds to de Blasio's invitation to ...,https://thehill.com/homenews/campaign/456579-s...,The Hill,15 hours ago,2019-08-08 06:00:00,Bernie Sanders,7,1,thehill.com,
3,From Andrew Yang's 'No Tie' look to Bernie Buc...,https://www.cnbc.com/2019/08/02/election-2020-...,CNBC.com,1 day ago,2019-08-04 10:00:00,Andrew Yang,2,1,www.cnbc.com,
4,Liberal billionaire Tom Steyer scolds Dems for...,https://www.foxnews.com/politics/liberal-billi...,Fox News,1 week ago,2019-05-28 00:00:00,Tom Steyer,5,2,www.foxnews.com,


In [40]:
# Check the column dtypes again; storydate should now be datetime64[ns].
df.dtypes

title                     object
url                       object
source                    object
time                      object
storydate         datetime64[ns]
query                     object
story_position             int64
panel_position            object
domain                    object
category                  object
dtype: object

In [41]:
# (rows, columns) of the DataFrame.
df.shape

(57618, 10)

In [42]:
# Ten most frequent sources; value_counts sorts by count, descending, by default.
df["source"].value_counts().head(10)

The Hill               3725
Fox News               3530
CNN.com                2750
Politico               2732
The New York Times     1804
Washington Post        1770
Washington Examiner    1655
New York Post          1275
Vox                     941
Breitbart               799
Name: source, dtype: int64