## Reading and Writing Data in Text Format

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load a comma-separated file into a DataFrame
df = pd.read_csv("examples/ex1.csv")

In [3]:
# ex1.csv has a header row, so its first line is used for the column names
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# A file will not always have a header row. Read with the defaults, the
# first data row of ex2.csv is (wrongly) taken as the column names:
pd.read_csv("examples/ex2.csv")

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [5]:
# There are two ways to read a headerless file: let pandas assign default
# column names, or supply the names ourselves.
# First, header=None tells pandas the file has no header row of its own,
# so default column names are assigned
pd.read_csv("examples/ex2.csv", header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
# Supply our own column names for the headerless file via `names`
pd.read_csv(
    "examples/ex2.csv",
    names=["Monday", "Tuesday", "Wednesday", "Thursday", "Message"],
)

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# To use the "Message" column as the row index of the returned DataFrame,
# pass its name as index_col
pd.read_csv(
    "examples/ex2.csv",
    names=["Monday", "Tuesday", "Wednesday", "Thursday", "Message"],
    index_col="Message",
)

Unnamed: 0_level_0,Monday,Tuesday,Wednesday,Thursday
Message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [8]:
# Read with the defaults: key1 and key2 come back as ordinary columns
pd.read_csv("examples/csv_mindex.csv")

Unnamed: 0,key1,key2,value1,value2
0,one,a,1,2
1,one,b,3,4
2,one,c,5,6
3,one,d,7,8
4,two,a,9,10
5,two,b,11,12
6,two,c,13,14
7,two,d,15,16


In [9]:
# To form a hierarchical index from multiple columns, pass a list of column
# numbers or names as index_col.
# Only `parsed` is bound here, so that `df` (the ex1.csv frame) is not
# silently replaced by an unrelated frame as a side effect of this cell.
parsed = pd.read_csv("examples/csv_mindex.csv", index_col=["key1", "key2"])

In [10]:
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [11]:
# The default separator for values in a CSV file is the comma, but files may
# use other separators. For this whitespace-separated text file we can pass
# the regular expression r"\s+" as the sep argument of read_csv
pd.read_csv("examples/ex3.txt", sep = r"\s+")

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [12]:
# We can skip rows of a file with skiprows.
# Here rows 0, 2, and 3 of the file are skipped
pd.read_csv("examples/ex4.csv", skiprows = [0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [13]:
# Let's open a CSV file with missing data (either an empty string or marked
# by a placeholder such as NA or NULL)
df = pd.read_csv("examples/ex5.csv")

In [14]:
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [15]:
# Build a boolean mask that is True wherever a value is missing
df.isna()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [16]:
# na_values takes a sequence of strings to treat as missing, in addition to
# the default list of strings pandas already recognizes as missing.
# Here "NULL" is added to that list
df = pd.read_csv("examples/ex5.csv", na_values=["NULL"])

In [17]:
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [18]:
# read_csv recognizes many default NA value representations; passing
# keep_default_na=False turns those defaults off
df = pd.read_csv("examples/ex5.csv", keep_default_na=False)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [19]:
# Check for missing values again: because the pandas default NA values were
# disabled, isna() now reports that nothing is missing
df.isna()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False


In [20]:
# To treat only specific placeholders as missing, list them in na_values.
# For example, recognize just the "NA" placeholder
pd.read_csv("examples/ex5.csv", keep_default_na=False, na_values=["NA"])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [21]:
# Now recognize only the empty string as missing
pd.read_csv("examples/ex5.csv", keep_default_na=False, na_values=[""])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [22]:
# For comparison: default NA values disabled and no na_values given
pd.read_csv('examples/ex5.csv', keep_default_na = False)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [23]:
# Different NA sentinels can be given per column with a dict:
# here "foo" and "NA" count as missing in the "message" column,
# and "two" counts as missing in the "something" column
sentinels = {
    "message": ["foo", "NA"],
    "something": ["two"],
}
df1 = pd.read_csv("examples/ex5.csv", keep_default_na=False, na_values=sentinels)
df1

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


In [24]:
# Boolean mask of the values that the per-column sentinels marked as missing
df1.isna()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,True,False,False,False,False,False
2,False,False,False,False,False,True


###  Reading Text Files in Pieces

In [25]:
# Before looking at a large file, make the pandas display more compact by
# limiting how many rows are shown
pd.set_option("display.max_rows", 10)

In [26]:
# Read the whole file; with max_rows set to 10 the display of this long
# frame is truncated to its first and last rows
f = pd.read_csv("examples/ex6.csv")
f

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [27]:
# Restore the default row limit for the cells that follow
pd.reset_option('display.max_rows')

In [28]:
# To read only a small number of rows instead of the whole file, pass nrows.
# Here the first 12 rows are read by passing "nrows = 12"
pd.read_csv("examples/ex6.csv", nrows = 12)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [29]:
# To read a file in pieces, specify a chunksize as a number of rows.
# The result is not a DataFrame but a reader object that is iterated over
# to obtain the pieces
file = pd.read_csv("examples/ex6.csv", chunksize = 1000)
type(file)

pandas.io.parsers.readers.TextFileReader

In [30]:
# Count the frequency of each "key" value across the whole file, chunk by chunk.
# The chunked reader is opened inside this cell because it is a one-shot
# iterator: looping over an already-consumed reader a second time yields
# nothing, so re-running the cell would silently produce an empty result.
# The with-block also closes the file handle as soon as the loop finishes.
tot = pd.Series([], dtype="int64")
with pd.read_csv("examples/ex6.csv", chunksize=1000) as reader:
    for piece in reader:
        # fill_value=0 treats a key that is absent on one side as a count of 0
        tot = tot.add(piece["key"].value_counts(), fill_value=0)

# Most frequent keys first; show the top ten
tot = tot.sort_values(ascending=False)
tot[:10]

key
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

###  Writing Data to Text Format

In [31]:
# Let's write some data to the file "1.csv"
data = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [25, 30]})
data.to_csv('1.csv', index = False) # "index = False" keeps the default row index from being written to the file

In [32]:
# Data can also be exported to a delimited format. Let's start from one of
# the CSV files read before:
data = pd.read_csv("examples/ex5.csv")
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [33]:
# Write the data above to a new file, leaving out the row index
data.to_csv("data.csv", index = False)

In [34]:
# Writing to "sys.stdout" prints the CSV text to the console instead of a file
import sys
data.to_csv(sys.stdout, index = False)

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [35]:
# Any other delimiter can be used in place of the comma
data.to_csv(sys.stdout, sep="|")

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [36]:
# Missing values are written as empty strings by default; na_rep replaces
# them with a sentinel value of our choosing
data.to_csv(sys.stdout, sep="|", na_rep="NULL")

|something|a|b|c|d|message
0|one|1|2|3.0|4|NULL
1|two|5|6|NULL|8|world
2|three|9|10|11.0|12|foo


In [37]:
# Unless told otherwise, to_csv writes both the row labels and the column
# labels. Either or both can be switched off:
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [38]:
# Only a subset of the columns can be written, in whatever order we list them
data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"])

a,b,c
1,2,3.0
5,6,
9,10,11.0


### Working with Other Delimited Formats

In [39]:
# First look at the file as pandas parses it
pd.read_csv("examples/ex7.csv")

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3


In [40]:
# The same file can be read with Python's built-in csv module: pass an open
# file object to csv.reader and iterate over it to get each line as a list
# of strings.
import csv
# newline="" is what the csv module documentation asks for when opening the
# file, so that newlines embedded in quoted fields are handled correctly
with open("examples/ex7.csv", newline="") as f:
    file = csv.reader(f)
    for line in file:
        print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [41]:
# From there, it's up to us to do the wrangling needed to put the data in the
# form we need. Step by step: first read the file into a list of lines
# (opened with newline="" as the csv module documentation recommends).
with open("examples/ex7.csv", newline="") as f:
    file = csv.reader(f)
    lines = list(file)

# The file is fully read at this point, so the rest runs outside the with-block.
# Split the lines into the header line and the data lines
header, values = lines[0], lines[1:]

# Build a dict of data columns: zip(*values) transposes the rows into columns,
# which are then paired with their header names
d = dict(zip(header, zip(*values)))
print(d)

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}


### JSON Data

### Handling JSON in Python (json module)

In [42]:
import json
# A JSON document held in a Python string
f = '''{
    "name": "Alice",
    "age": 25,
    "city": "New York",
    "skills": ["Python", "Machine Learning"]
}
'''

# json.loads parses a JSON string into the corresponding Python objects
data = json.loads(f)

In [43]:
data

{'name': 'Alice',
 'age': 25,
 'city': 'New York',
 'skills': ['Python', 'Machine Learning']}

In [44]:
data['name']

'Alice'

In [45]:
data['skills']

['Python', 'Machine Learning']

In [46]:
# Convert a Python dictionary to a JSON string
python_dict = {
    "name": "Bob",
    "age": 30,
    "city": "London",
}

# indent=4 pretty-prints the result with four-space indentation
json_str = json.dumps(python_dict, indent=4)

In [47]:
print(json_str)

{
    "name": "Bob",
    "age": 30,
    "city": "London"
}


In [48]:
# A larger JSON document with nested arrays, nested objects, and a null value
obj = """
 {"name": "Wes",
 "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
              {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
 }
 """

In [49]:
# Parse the JSON string into Python objects
result = json.loads(obj)

In [50]:
result

{'name': 'Wes',
 'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']},
  {'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}

In [51]:
# access values
result['name']

'Wes'

In [52]:
result['cities_lived']

['Akron', 'Nashville', 'New York', 'San Francisco']

In [54]:
# "siblings" holds a list of dicts, so we can loop over it and pick out fields
for sibling in result['siblings']:
    print(f"Name: {sibling['name']}, Age: {sibling['age']}")

Name: Scott, Age: 34
Name: Katie, Age: 42


In [None]:
# Read the whole JSON file into a string, decoding it explicitly as UTF-8
with open("datasets/CA_category_id.json", "r", encoding = "utf-8") as f:
    json_data = f.read()

In [None]:
# Parse the text read from the file
data = json.loads(json_data)

In [None]:
data['kind']

In [None]:
# Print the title stored in the snippet of every item
print('....Title....')
for item in data['items']:
    snippet = item['snippet']
    print(snippet['title'])

### Writing JSON Files

In [None]:
# A small dictionary to serialize
data = {
    "name": "Alice",
    "age": 25,
    "city": "New York"
}

# json.dump writes the object to an open file; indent = 4 pretty-prints it.
# The encoding is given explicitly, as is done when the notebook reads a JSON
# file, so the written file does not depend on the platform's default encoding.
with open('1.json', 'w', encoding = "utf-8") as f:
    json.dump(data, f, indent = 4)

### Read JSON in pandas

In [None]:
# pandas can load a JSON file directly with read_json
data = pd.read_json("examples/example.json")

In [None]:
data

In [None]:
# to_json writes the data back out as JSON; here it goes to the console,
# pretty-printed, in the "records" orientation
import sys
data.to_json(sys.stdout, indent = 4, orient = "records")

## XML and HTML: Web Scraping

In [None]:
# read_html parses the tables found in an HTML document and returns them
# as a list
tables = pd.read_html("examples/fdic_failed_bank_list.html")

In [None]:
len(tables)

In [None]:
failures = tables[0]

In [None]:
failures.head()

In [None]:
failures.nunique()