#### Reading Data from Various Data Sources using Pandas

In [22]:
import pandas as pd
from io import StringIO

## Sample JSON text: flat key/value pairs plus one nested list of objects.
## Adjacent string literals are joined by Python, so the value is one single-line string.
data = (
    '{"Name": "Ramesh", "Age": 26, "Gender": "Male", '
    '"Designation": [{"title1":"Scientist", "title2": "Researcher"}]}'
)
## StringIO wraps the text in an in-memory, file-like object; show its type
type(StringIO(data))

_io.StringIO

In [24]:
## To convert the JSON text into a DataFrame
## StringIO(data) hands read_json a file-like buffer instead of the raw string

df = pd.read_json(StringIO(data))
df

Unnamed: 0,Name,Age,Gender,Designation
0,Ramesh,26,Male,"{'title1': 'Scientist', 'title2': 'Researcher'}"


In [34]:
## To convert the DataFrame to JSON

df.to_json()
## For a DataFrame, the default orient is 'columns': {column -> {index -> value}}
## That's why the converted text is nested by column name and then by row index 0,
## unlike the flat key/value layout of the original sample data in the first cell

'{"Name":{"0":"Ramesh"},"Age":{"0":26},"Gender":{"0":"Male"},"Designation":{"0":{"title1":"Scientist","title2":"Researcher"}}}'

In [29]:
## Use orient='index' to nest the other way round: {index -> {column -> value}}

df.to_json(orient='index')
## Both this call and the default one key the values by the row index label (0),
## which is why neither result matches the layout of the original sample data

'{"0":{"Name":"Ramesh","Age":26,"Gender":"Male","Designation":{"title1":"Scientist","title2":"Researcher"}}}'

In [37]:
## To get a layout close to the original data, set orient to 'records'

df.to_json(orient='records')
## 'records' emits a list of row objects, so the index label 0 is left out

'[{"Name":"Ramesh","Age":26,"Gender":"Male","Designation":{"title1":"Scientist","title2":"Researcher"}}]'

In [47]:
## Read CSV data straight from a URL
## (the file lives on the remote server, not on the local machine)

df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    header=None,  ## the source data has no header row
)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [50]:
## Save the wine DataFrame to a CSV file on the local machine

df.to_csv('wine.csv', index=False)
## index=False leaves out the auto-generated row index (0, 1, 2, ...); with the
## default index=True it is written as an extra unnamed first column, which
## comes back as 'Unnamed: 0' when the file is read again
## This single call writes wine.csv to the given relative path

In [71]:
## To read the data from an HTML page
## (the address of the page is kept in `url` for the read_html call in the next cell)

url = "https://fdic.gov/bank-failures/failed-bank-list"

In [73]:
## read_html returns a list of DataFrames, one per table it parses from the page,
## so `df` is a list here, not a single DataFrame
df = pd.read_html(url)
df


[                                            Bank Name                City  \
 0                        The Santa Anna National Bank          Santa Anna   
 1                                Pulaski Savings Bank             Chicago   
 2                  The First National Bank of Lindsay             Lindsay   
 3               Republic First Bank dba Republic Bank        Philadelphia   
 4                                       Citizens Bank            Sac City   
 5                            Heartland Tri-State Bank             Elkhart   
 6                                 First Republic Bank       San Francisco   
 7                                      Signature Bank            New York   
 8                                 Silicon Valley Bank         Santa Clara   
 9                                   Almena State Bank              Almena   
 10                         First City Bank of Florida   Fort Walton Beach   
 11                               The First State Bank       Bar

In [64]:
## Pick the first table from the list returned by read_html
df[0]
## This is how we can read an HTML page which has a table in it

Unnamed: 0,Bank Name,City,State,Cert,Acquiring Institution,Closing Date,Fund Sort ascending
0,The Santa Anna National Bank,Santa Anna,Texas,5520,Coleman County State Bank,"June 27, 2025",10549
1,Pulaski Savings Bank,Chicago,Illinois,28611,Millennium Bank,"January 17, 2025",10548
2,The First National Bank of Lindsay,Lindsay,Oklahoma,4134,First Bank & Trust Co.,"October 18, 2024",10547
3,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
4,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
5,Heartland Tri-State Bank,Elkhart,Kansas,25851,"Dream First Bank, N.A.","July 28, 2023",10544
6,First Republic Bank,San Francisco,California,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
7,Signature Bank,New York,New York,57053,"Flagstar Bank, N.A.","March 12, 2023",10540
8,Silicon Valley Bank,Santa Clara,California,24735,First Citizens Bank & Trust Company,"March 10, 2023",10539
9,Almena State Bank,Almena,Kansas,15426,Equity Bank,"October 23, 2020",10538


In [81]:
## Reading data from another URL (W3Schools)
## Note: this rebinds `url`, replacing the address set in an earlier cell

url = "https://www.w3schools.com/html/html_tables.asp"

In [87]:
df = pd.read_html(url)
df
## This HTML page has 2 tables, so the returned list holds 2 DataFrames

[                        Company           Contact  Country
 0           Alfreds Futterkiste      Maria Anders  Germany
 1    Centro comercial Moctezuma   Francisco Chang   Mexico
 2                  Ernst Handel     Roland Mendel  Austria
 3                Island Trading     Helen Bennett       UK
 4  Laughing Bacchus Winecellars   Yoshi Tannamuri   Canada
 5  Magazzini Alimentari Riuniti  Giovanni Rovelli    Italy,
           Tag                                        Description
 0     <table>                                    Defines a table
 1        <th>                   Defines a header cell in a table
 2        <tr>                           Defines a row in a table
 3        <td>                          Defines a cell in a table
 4   <caption>                            Defines a table caption
 5  <colgroup>  Specifies a group of one or more columns in a ...
 6       <col>  Specifies column properties for each column wi...
 7     <thead>               Groups the header cont

In [88]:
## First table in the HTML page (element 0 of the list returned by read_html)
df[0]

Unnamed: 0,Company,Contact,Country
0,Alfreds Futterkiste,Maria Anders,Germany
1,Centro comercial Moctezuma,Francisco Chang,Mexico
2,Ernst Handel,Roland Mendel,Austria
3,Island Trading,Helen Bennett,UK
4,Laughing Bacchus Winecellars,Yoshi Tannamuri,Canada
5,Magazzini Alimentari Riuniti,Giovanni Rovelli,Italy


In [89]:
## Second table in the HTML page (element 1 of the list returned by read_html)
df[1]

Unnamed: 0,Tag,Description
0,<table>,Defines a table
1,<th>,Defines a header cell in a table
2,<tr>,Defines a row in a table
3,<td>,Defines a cell in a table
4,<caption>,Defines a table caption
5,<colgroup>,Specifies a group of one or more columns in a ...
6,<col>,Specifies column properties for each column wi...
7,<thead>,Groups the header content in a table
8,<tbody>,Groups the body content in a table
9,<tfoot>,Groups the footer content in a table


In [95]:
## To load only the required table from the HTML page using the match keyword
df = pd.read_html(url, match='Company')
df
## match keeps only the tables whose text matches the given string/regex, so just
## 1 table (the one containing 'Company') is loaded even though the page has 2 tables;
## the result is still a list, now holding a single DataFrame

[                        Company           Contact  Country
 0           Alfreds Futterkiste      Maria Anders  Germany
 1    Centro comercial Moctezuma   Francisco Chang   Mexico
 2                  Ernst Handel     Roland Mendel  Austria
 3                Island Trading     Helen Bennett       UK
 4  Laughing Bacchus Winecellars   Yoshi Tannamuri   Canada
 5  Magazzini Alimentari Riuniti  Giovanni Rovelli    Italy]

In [93]:
## The only table that matched 'Company'
df[0]

Unnamed: 0,Company,Contact,Country
0,Alfreds Futterkiste,Maria Anders,Germany
1,Centro comercial Moctezuma,Francisco Chang,Mexico
2,Ernst Handel,Roland Mendel,Austria
3,Island Trading,Helen Bennett,UK
4,Laughing Bacchus Winecellars,Yoshi Tannamuri,Canada
5,Magazzini Alimentari Riuniti,Giovanni Rovelli,Italy


In [94]:
## match='Company' returned a single table, so index 1 does not exist in the list.
## The IndexError is the point of this cell; it is caught and printed so that
## "Restart & Run All" can continue to the cells below instead of stopping here.
try:
    df[1]
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: list index out of range

In [100]:
## To read data from an Excel file

pd.read_excel('excelData.xlsx')
## By default the first sheet row ("Table 1") is used as the header, so the real
## header row (Name, Age) shows up as record 0. To fix that, we need to set header to 1

Unnamed: 0,Table 1,Unnamed: 1
0,Name,Age
1,Sai,23
2,Arjun,28
3,Gowtham,25


In [101]:
## header=1 takes the column names from the second row (0-based row 1),
## skipping the "Table 1" title row above it
df_excel = pd.read_excel('excelData.xlsx', header=1)
df_excel

Unnamed: 0,Name,Age
0,Sai,23
1,Arjun,28
2,Gowtham,25


#### Pickle File

1. A pickle file is a binary file that stores serialized Python objects created using the `pickle` module

2. “Pickling” converts a Python object (like a list, dict, model, or DataFrame) into a byte stream that can be written to disk, often with extensions like .pkl or .pickle

3. “Unpickling” reads that byte stream back and reconstructs the original Python object in memory, preserving its structure and values.
​

In [104]:
## Saving the DataFrame loaded from the Excel file as a pickle file

df_excel.to_pickle('pickleFile')
## A new pickle file named "pickleFile" (no extension) will be created on the local machine

In [103]:
## To read the pickle file back into a DataFrame
## NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source

df = pd.read_pickle('pickleFile')
df

Unnamed: 0,Name,Age
0,Sai,23
1,Arjun,28
2,Gowtham,25
