# Pandas Input/output

Can read: `.csv`, `.json`, `.xlsx`, `.parquet`, `.db`, `.hdf`, ...


https://pandas.pydata.org/pandas-docs/stable/reference/io.html

In [1]:
import pandas as pd

### Our Data

- `attacks.csv` from Kaggle
- `github_pulls.json` from GitHub API
- `2023_Accidentalidad.xlsx` from Datos Abiertos Ayuntamiento de Madrid
- `test.parquet` from Kaggle

__.CSV Files__

In [8]:
# el with abre y cierra el archivo

with open('datasets/barrios.csv') as f:  # f es un alias
    
    lines = f.readlines()

In [10]:
lines[:5]

['cod_distrito\tcod_barrio\tbarrio\n',
 '1\t1\tPALACIO\n',
 '1\t2\tEMBAJADORES\n',
 '1\t3\tCORTES\n',
 '1\t4\tJUSTICIA\n']

In [65]:
df_csv = pd.read_csv('datasets/barrios.csv')

df_csv.head()

Unnamed: 0,cod_distrito\tcod_barrio\tbarrio
0,1\t1\tPALACIO
1,1\t2\tEMBAJADORES
2,1\t3\tCORTES
3,1\t4\tJUSTICIA
4,1\t5\tUNIVERSIDAD


In [67]:
df_csv.iloc[0, 0]

'1\t1\tPALACIO'

In [68]:
print(df_csv.iloc[0, 0])

1	1	PALACIO


In [62]:
print('ab')

ab


In [14]:
print('a\tb')

a	b


In [63]:
print('a\nb')

a
b


In [64]:
print(r'a\tb')  # r de raw, asi es literal lo que escribo

a\tb


In [15]:
df_csv = pd.read_csv('datasets/barrios.csv', sep='\t')

df_csv.head()

Unnamed: 0,cod_distrito,cod_barrio,barrio
0,1,1,PALACIO
1,1,2,EMBAJADORES
2,1,3,CORTES
3,1,4,JUSTICIA
4,1,5,UNIVERSIDAD


In [16]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cod_distrito  55 non-null     int64 
 1   cod_barrio    55 non-null     int64 
 2   barrio        55 non-null     object
dtypes: int64(2), object(1)
memory usage: 1.4+ KB


In [17]:
df_csv = pd.read_csv('datasets/attacks.csv')

df_csv.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 696: invalid start byte

In [19]:
codecs = pd.read_csv('datasets/pandas_codecs.csv')

codecs.head()

Unnamed: 0,Codec,Aliases,Languages
0,ascii,"646, us-ascii",English
1,big5,"big5-tw, csbig5",Traditional Chinese
2,big5hkscs,"big5-hkscs, hkscs",Traditional Chinese
3,cp037,"IBM037, IBM039",English
4,cp273,"273, IBM273, csIBM273",German


In [22]:
codecs['Codec'].size

109

In [25]:
new_codecs = []

for c in codecs['Codec']:
    
    try:
        
        df_csv = pd.read_csv('datasets/attacks.csv', encoding=c)
        
        new_codecs.append(c)
        
    except:
        
        continue   # pasa al siguiente paso del bucle

In [26]:
new_codecs

['cp037',
 'cp273',
 'cp437',
 'cp500',
 'cp720',
 'cp737',
 'cp775',
 'cp850',
 'cp852',
 'cp855',
 'cp858',
 'cp860',
 'cp861',
 'cp862',
 'cp863',
 'cp864',
 'cp865',
 'cp866',
 'cp875',
 'cp1006',
 'cp1026',
 'cp1125',
 'cp1140',
 'cp1250',
 'cp1251',
 'cp1252',
 'cp1254',
 'cp1256',
 'latin_1',
 'iso8859_2',
 'iso8859_4',
 'iso8859_5',
 'iso8859_7',
 'iso8859_9',
 'iso8859_10',
 'iso8859_13',
 'iso8859_14',
 'iso8859_15',
 'iso8859_16',
 'koi8_r',
 'koi8_u',
 'kz1048',
 'mac_cyrillic',
 'mac_greek',
 'mac_iceland',
 'mac_latin2',
 'mac_roman',
 'mac_turkish',
 'ptcp154']

In [27]:
df_csv = pd.read_csv('datasets/attacks.csv', encoding='iso8859_9')

df_csv.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [28]:
df_csv.describe()

Unnamed: 0,Year,original order
count,6300.0,6309.0
mean,1927.272381,3155.999683
std,281.116308,1821.396206
min,0.0,2.0
25%,1942.0,1579.0
50%,1977.0,3156.0
75%,2005.0,4733.0
max,2018.0,6310.0


In [29]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [40]:
df_csv.to_csv('datasets/tiburones.csv', index=False)

---

__.JSON Files__

In [30]:
import json

with open('datasets/github_pulls.json') as archivo:
    
    json = json.load(archivo)

In [33]:
json[0].keys()

dict_keys(['url', 'id', 'node_id', 'html_url', 'diff_url', 'patch_url', 'issue_url', 'number', 'state', 'locked', 'title', 'user', 'body', 'created_at', 'updated_at', 'closed_at', 'merged_at', 'merge_commit_sha', 'assignee', 'assignees', 'requested_reviewers', 'requested_teams', 'labels', 'milestone', 'draft', 'commits_url', 'review_comments_url', 'review_comment_url', 'comments_url', 'statuses_url', 'head', 'base', '_links', 'author_association', 'auto_merge', 'active_lock_reason'])

In [34]:
df_json = pd.read_json('datasets/github_pulls.json')

df_json.head()

Unnamed: 0,url,id,node_id,html_url,diff_url,patch_url,issue_url,number,state,locked,...,review_comments_url,review_comment_url,comments_url,statuses_url,head,base,_links,author_association,auto_merge,active_lock_reason
0,https://api.github.com/repos/ih-datapt-mad/dat...,1558177807,PR_kwDOKTHW1M5c3-gP,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://api.github.com/repos/ih-datapt-mad/dat...,106,open,False,...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,{'label': 'Guillemorato:error-handling-guille'...,"{'label': 'ih-datapt-mad:main', 'ref': 'main',...",{'self': {'href': 'https://api.github.com/repo...,NONE,,
1,https://api.github.com/repos/ih-datapt-mad/dat...,1557503243,PR_kwDOKTHW1M5c1Z0L,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://api.github.com/repos/ih-datapt-mad/dat...,105,open,False,...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,"{'label': 'tomasmorin97:error-handling', 'ref'...","{'label': 'ih-datapt-mad:main', 'ref': 'main',...",{'self': {'href': 'https://api.github.com/repo...,NONE,,
2,https://api.github.com/repos/ih-datapt-mad/dat...,1557498093,PR_kwDOKTHW1M5c1Yjt,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://api.github.com/repos/ih-datapt-mad/dat...,104,open,False,...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,{'label': 'Guillemorato:list-comprension-guill...,"{'label': 'ih-datapt-mad:main', 'ref': 'main',...",{'self': {'href': 'https://api.github.com/repo...,NONE,,
3,https://api.github.com/repos/ih-datapt-mad/dat...,1556968958,PR_kwDOKTHW1M5czXX-,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://api.github.com/repos/ih-datapt-mad/dat...,103,open,False,...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,{'label': 'SilviaAlconGarrido:numpy-Silvia-Alc...,"{'label': 'ih-datapt-mad:main', 'ref': 'main',...",{'self': {'href': 'https://api.github.com/repo...,NONE,,
4,https://api.github.com/repos/ih-datapt-mad/dat...,1556926203,PR_kwDOKTHW1M5czM77,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://github.com/ih-datapt-mad/dataptmad0923...,https://api.github.com/repos/ih-datapt-mad/dat...,102,open,False,...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,https://api.github.com/repos/ih-datapt-mad/dat...,"{'label': 'AndrewBavuels:numpy', 'ref': 'numpy...","{'label': 'ih-datapt-mad:main', 'ref': 'main',...",{'self': {'href': 'https://api.github.com/repo...,NONE,,


In [39]:
urls = df_json['diff_url']

urls[0]

'https://github.com/ih-datapt-mad/dataptmad0923_labs/pull/106.diff'

In [41]:
urls.to_json('datasets/urls.json')

---

__.XLSX Files__

In [None]:
# Additional libraries for Excel files

#!conda install -y xlrd
#!conda install -y openpyxl

In [None]:
%pip install xlrd
%pip install openpyxl

In [42]:
df_xlsx = pd.read_excel('datasets/2023_Accidentalidad.xlsx')

df_xlsx.head()

Unnamed: 0,num_expediente,fecha,hora,localizacion,numero,cod_distrito,distrito,tipo_accidente,estado_meteorológico,tipo_vehiculo,tipo_persona,rango_edad,sexo,cod_lesividad,lesividad,coordenada_x_utm,coordenada_y_utm,positiva_alcohol,positiva_droga
0,2023S000001,2023-01-01,01:15:00,"AVDA. ALFONSO XIII, 33",33,5,CHAMARTÍN,Alcance,Despejado,Todo terreno,Conductor,De 55 a 59 años,Mujer,14.0,Sin asistencia sanitaria,443397.166,4478129.388,N,
1,2023S000001,2023-01-01,01:15:00,"AVDA. ALFONSO XIII, 33",33,5,CHAMARTÍN,Alcance,Despejado,Todo terreno,Pasajero,De 21 a 24 años,Hombre,14.0,Sin asistencia sanitaria,443397.166,4478129.388,N,
2,2023S000001,2023-01-01,01:15:00,"AVDA. ALFONSO XIII, 33",33,5,CHAMARTÍN,Alcance,Despejado,Todo terreno,Pasajero,De 21 a 24 años,Hombre,14.0,Sin asistencia sanitaria,443397.166,4478129.388,N,
3,2023S000001,2023-01-01,01:15:00,"AVDA. ALFONSO XIII, 33",33,5,CHAMARTÍN,Alcance,Despejado,Todo terreno,Pasajero,De 21 a 24 años,Mujer,7.0,Asistencia sanitaria sólo en el lugar del acci...,443397.166,4478129.388,N,
4,2023S000001,2023-01-01,01:15:00,"AVDA. ALFONSO XIII, 33",33,5,CHAMARTÍN,Alcance,Despejado,Todo terreno,Pasajero,De 55 a 59 años,Hombre,7.0,Asistencia sanitaria sólo en el lugar del acci...,443397.166,4478129.388,N,


---

__.PARQUET Files__

In [None]:
# Additional library for Parquet files

#!conda install -c conda-forge pyarrow

In [None]:
%pip install pyarrow

In [44]:
df_parquet = pd.read_parquet('datasets/part.parquet')

df_parquet.head()

Unnamed: 0_level_0,id,names,amount
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23,Ursula,1380
1,92,Tim,-178
2,239,Alice,-784
3,260,Michael,-249
4,314,Jerry,1299


In [45]:
df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   id      1000000 non-null  int64 
 1   names   1000000 non-null  object
 2   amount  1000000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 30.5+ MB


In [50]:
tocho = pd.concat([df_parquet, df_parquet]).reset_index()

tocho.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 4 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   __null_dask_index__  int64 
 1   id                   int64 
 2   names                object
 3   amount               int64 
dtypes: int64(3), object(1)
memory usage: 61.0+ MB


In [51]:
tocho.to_parquet('datasets/tocho.parquet')

In [52]:
df_tocho = pd.read_parquet('datasets/tocho.parquet')

df_tocho.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 4 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   __null_dask_index__  int64 
 1   id                   int64 
 2   names                object
 3   amount               int64 
dtypes: int64(3), object(1)
memory usage: 61.0+ MB


In [54]:
df_tocho.head()

Unnamed: 0,__null_dask_index__,id,names,amount
0,0,23,Ursula,1380
1,1,92,Tim,-178
2,2,239,Alice,-784
3,3,260,Michael,-249
4,4,314,Jerry,1299


In [55]:
df_tocho.tail()

Unnamed: 0,__null_dask_index__,id,names,amount
1999995,999995,60,Oliver,176
1999996,999996,440,Sarah,1292
1999997,999997,24,Victor,559
1999998,999998,66,George,2093
1999999,999999,39,Frank,204


In [60]:
df_xlsx.select_dtypes('number').to_parquet('datasets/desde_excel.parquet')

In [61]:
df_xlsx = pd.read_parquet('datasets/desde_excel.parquet')

df_xlsx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007 entries, 0 to 4006
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cod_distrito      4007 non-null   int64  
 1   cod_lesividad     2030 non-null   float64
 2   coordenada_x_utm  4004 non-null   float64
 3   coordenada_y_utm  4004 non-null   float64
 4   positiva_droga    12 non-null     float64
dtypes: float64(4), int64(1)
memory usage: 156.6 KB


In [70]:
pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst')

Unnamed: 0,55 0 81 0 -6 11 25 88 64 4
0,56 0 96 0 52 -4 40 44 4 4
1,50 -1 89 -7 50 0 39 40 2 1
2,53 9 79 0 42 -2 25 37 12 4
3,55 2 82 0 54 -6 26 28 2 1
4,41 0 84 3 38 -4 43 45 2 1
...,...
14494,80 0 84 0 -36 -29 4 120 116 5
14495,55 0 81 0 -20 25 26 102 76 4
14496,55 0 77 0 12 -22 22 65 42 4
14497,37 0 103 0 18 -16 66 85 20 1


---

__SQL Files...in the next episode...__