In [1]:
# Load library
import pandas as pd
# Location of the sample Parquet file
url = 'https://machine-learning-python-cookbook.s3.amazonaws.com/data.parquet'
# Fetch the file and parse it into a DataFrame
dataframe = pd.read_parquet(path=url)
# Preview the first two records
dataframe.head(n=2)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0


2.7 Loading an Avro File

In [5]:
! pip install fastavro pandas




In [6]:
import requests
import pandas as pd
from fastavro import reader
from io import BytesIO

# Download the Avro file; the timeout keeps the cell from hanging on a dead connection
url = 'https://machine-learning-python-cookbook.s3.amazonaws.com/data.avro'
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the Avro parser
r.raise_for_status()

# Read the file with fastavro: iterate the records from an in-memory buffer
# and build a DataFrame from them
with BytesIO(r.content) as f:
    avro_reader = reader(f)
    dataframe = pd.DataFrame(list(avro_reader))

# Show the first two rows as plain text
print(dataframe.head(2))


   integer             datetime  category
0        5  2015-01-01 00:00:00         0
1        5  2015-01-01 00:00:01         0


2.8 Querying a SQLite Database

In [7]:
! pip install sqlalchemy

Collecting sqlalchemy
  Downloading sqlalchemy-2.0.41-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.3-cp312-cp312-win_amd64.whl.metadata (4.2 kB)
Downloading sqlalchemy-2.0.41-cp312-cp312-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.1 MB ? eta -:--:--
   -------------- ------------------------- 0.8/2.1 MB 2.2 MB/s eta 0:00:01
   ------------------------ --------------- 1.3/2.1 MB 2.4 MB/s eta 0:00:01
   ---------------------------------- ----- 1.8/2.1 MB 2.7 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 2.6 MB/s eta 0:00:00
Downloading greenlet-3.2.3-cp312-cp312-win_amd64.whl (297 kB)
Installing collected packages: greenlet, sqlalchemy

   ---------------------------------------- 0/2 [greenlet]
   -----------------------------------

In [None]:
# Load libraries
import pandas as pd
from sqlalchemy import create_engine
# Build a SQLAlchemy engine pointing at the local SQLite file
database_connection = create_engine('sqlite:///sample.db')
# Run the query and collect the result set into a DataFrame
dataframe = pd.read_sql_query(sql='SELECT * FROM data', con=database_connection)
# Preview the first two records
dataframe.head(n=2)

2.9 Querying a Remote SQL Database

In [9]:
! pip install pymysql

Collecting pymysql
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
Installing collected packages: pymysql
Successfully installed pymysql-1.1.1


In [None]:
# Import libraries
import pymysql
import pandas as pd
# Create a DB connection
# Use the following example to start a DB instance
# https://github.com/kylegallatin/mysql-db-example
conn = pymysql.connect(
    host='localhost',
    user='root',
    password="",
    # `database` is the documented keyword; `db` is only a deprecated alias
    database='db',
)
# Read the SQL query into a dataframe, and always release the connection
# afterwards, even if the query raises
try:
    dataframe = pd.read_sql("select * from data", conn)
finally:
    conn.close()
# View the first two rows
dataframe.head(2)

2.10 Loading Data from a Google Sheet

In [11]:
# Import libraries
import pandas as pd
# Export endpoint of the Google Sheet, which serves the sheet as CSV
url = (
    "https://docs.google.com/spreadsheets/d/"
    "1ehC-9otcAuitqnmWksqt1mOrTRCL38dv0K9UjhwzTOA/export?format=csv"
)
# Download the CSV export and parse it into a DataFrame
dataframe = pd.read_csv(filepath_or_buffer=url)
# Preview the first two records
dataframe.head(n=2)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 0:00:00,0
1,5,2015-01-01 0:00:01,0


2.11 Loading Data from an S3 Bucket

In [12]:
! pip install s3fs

Collecting s3fs
  Downloading s3fs-2025.5.1-py3-none-any.whl.metadata (1.9 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.23.0-py3-none-any.whl.metadata (24 kB)
Collecting fsspec==2025.5.1 (from s3fs)
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.38.28,>=1.38.23 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.38.27-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Downloading s3fs-2025.5.1-py3-none-any.whl (30 kB)
Downloading fsspec-2025.5.1-py3-none-any.whl (199 kB)
Downloading aiobotocore-2.23.0-py3-none-any.whl (84 kB)
Downloading aioitertools-0.12.0-py3-none-any.whl (24 kB)
Downloading botocore-1.38.27-py3-none-any.whl (13.6 MB)
 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.6.0 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.5.1 which is incompatible.


In [13]:
# Import libraries
import os

import pandas as pd
# S3 path to CSV
s3_uri = "s3://machine-learning-python-cookbook/data.csv"
# Read the AWS credentials from the environment instead of hardcoding them in
# the notebook, so secrets never end up in saved or shared copies.
# A KeyError here means the variable is not set in the kernel's environment.
ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
# Read the CSV into a dataframe, passing the credentials via storage_options
dataframe = pd.read_csv(
    s3_uri,
    storage_options={
        "key": ACCESS_KEY_ID,
        "secret": SECRET_ACCESS_KEY,
    },
)
# View first two rows
dataframe.head(2)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0


2.12 Loading Unstructured Data

In [14]:
! pip install requests



In [15]:
import requests
# URL to download the txt file from
txt_url = "https://machine-learning-python-cookbook.s3.amazonaws.com/text.txt"
# Get the txt file; the timeout keeps the cell from hanging on a dead connection
r = requests.get(txt_url, timeout=30)
# Stop on an HTTP error so an error page is never saved as text.txt
r.raise_for_status()
# Write the raw response bytes to text.txt locally
with open('text.txt', 'wb') as f:
    f.write(r.content)
# Read the file back as text. The explicit encoding avoids depending on the
# platform default (e.g. cp1252 on Windows); assumes the file is UTF-8 — TODO confirm
with open('text.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Print the content
print(text)

Hello there!
