# In-class Demo: Importing Data

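This notebook demonstrates reading a variety of data sources into Python with pandas, including CSV, TSV, gzip-compressed CSV, Excel, JSON (flat and nested), plain text, and HTML tables.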


In [None]:
from pathlib import Path
import pandas as pd, json
# Locate the project root from the kernel's working directory: when that
# directory is named "notebooks", climb two levels (parents[1]); otherwise
# use the working directory itself.
# NOTE(review): parents[1] is the *grandparent* of the working directory --
# confirm this matches the repository layout.
here = Path().resolve()
if here.name == "notebooks":
    BASE = here.parents[1]
else:
    BASE = here

# All demo fixtures are read from <BASE>/data/demo.
DATA = BASE.joinpath("data", "demo")
print("DATA =", DATA)


In [None]:
# --- CSV
csv_path = DATA.joinpath("csv", "students.csv")

# Plain read with pandas' defaults.
df_csv = pd.read_csv(csv_path)
print(df_csv.head())

# Common args: sep, header, index_col, nrows, encoding, usecols, dtype, compression
# Second read: load only two columns and pin the dtype of "id" explicitly
# rather than relying on inference.
subset_options = {
    "usecols": ["id", "name"],
    "dtype": {"id": "int64"},
}
df_subset = pd.read_csv(csv_path, **subset_options)
print(df_subset.dtypes)

In [None]:
# --- TSV (tab-delimited)
tsv_path = DATA.joinpath("tsv", "sales.tsv")

# Same reader as for CSV; only the field separator differs.
df_tsv = pd.read_csv(
    tsv_path,
    sep="\t",
)
print(df_tsv.head())

In [None]:
# --- Compressed CSV (gzip)
gz_path = DATA.joinpath("csv_gz", "students.csv.gz")

# The file is decompressed on the fly; no manual gunzip step is needed.
df_gz = pd.read_csv(
    gz_path,
    compression="gzip",  # stated explicitly for the demo
)
print(df_gz.head())

In [None]:
# --- Excel (may require 'openpyxl')
excel_path = DATA.joinpath("excel", "grades.xlsx")

try:
    # No sheet_name given -> only the first sheet is loaded.
    df_excel = pd.read_excel(excel_path)
    print(df_excel.head())

    # sheet_name=None loads every sheet into a dict keyed by sheet name.
    all_sheets = pd.read_excel(excel_path, sheet_name=None)
    sheet_names = list(all_sheets.keys())
    print("Sheets:", sheet_names)
except Exception as err:
    # Deliberately broad: the demo should keep running (and report why) if the
    # optional Excel engine is not installed or the file cannot be read.
    print("Excel read failed:", err)

In [None]:
# --- JSON (flat)
import json
from pandas import json_normalize

people_path = DATA.joinpath("json", "people.json")
# Read the file as UTF-8 text and parse it into plain Python objects.
people = json.loads(people_path.read_text(encoding="utf-8"))
print(people[0]["name"])

# --- JSON (nested) with json_normalize
nested_path = DATA.joinpath("json", "orders_nested.json")
orders = json.loads(nested_path.read_text(encoding="utf-8"))

# Flatten: one output row per entry under each order's "items" key, carrying
# the order id and the nested customer id/name along as metadata columns.
meta_fields = ["order_id", ["customer", "id"], ["customer", "name"]]
flat = json_normalize(orders, record_path="items", meta=meta_fields)
print(flat.head())

In [None]:
# --- Plain text with context manager
poem_path = DATA.joinpath("text", "poem.txt")

# First pass: the whole file as a single string.
with poem_path.open("r", encoding="utf-8") as fh:
    text = fh.read()
first_two = text.splitlines()[:2]
print(first_two)  # first 2 lines

# Second pass: the whole file as a list of lines.
with poem_path.open("r", encoding="utf-8") as fh:
    lines = fh.readlines()
print("line count:", len(lines))

In [None]:
# --- HTML tables (requires lxml)
html_path = DATA.joinpath("html", "tables.html")

# read_html returns a list of DataFrames, not a single frame.
tables = pd.read_html(str(html_path))
print("Number of tables:", len(tables))

# Bare last expression so the notebook renders the preview as a rich table.
first_table = tables[0]
first_table.head()