# Demo: Pandas

## Two important datatypes in Pandas
* `Series` (a one-dimensional labeled array, like a vector)
* `DataFrame` (a two-dimensional labeled table, like a 2-D array or an Excel spreadsheet)

In [1]:
import pandas as pd

# Population figures for five U.S. states, keyed by state name.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
# Building a Series from a dict: the keys become the index labels
# and the values become the data.
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [2]:
# Area figures for the same five states, keyed by state name.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
# As before, the dict keys become the index labels of the Series.
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [3]:
# Build a DataFrame from a dict of Series: each key becomes a column
# name, and each Series supplies that column's values, matched up by
# their index labels (the state names).
states = pd.DataFrame(dict(population=population, area=area))
states
# Ending a cell with a bare expression renders the DataFrame through
# IPython's rich display, which looks nicer than print(states).
# The explicit equivalent is:
#   from IPython.display import display
#   display(states)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [4]:
# Every DataFrame has an index (its row labels) that we can inspect or
# replace; here the labels are the state names.
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [5]:
# View the column names (they are stored in an Index object too)
states.columns

Index(['population', 'area'], dtype='object')

In [6]:
# Select a single column by name; the result is a Series whose name
# is the column label.
states['area'] # attribute access (states.area) gives the same column here

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [8]:
# Comparing a column against a scalar produces a Boolean Series with one
# True/False entry per row, e.g.: which states have an area above 150,000?
# NOTE(review): an earlier version of this comment gave the unit as
# sq. miles, but the figures look like square kilometres
# (California ~ 424,000 km^2) -- confirm the unit.
large_area = states['area'].gt(150_000)
large_area

California     True
Texas          True
New York      False
Florida        True
Illinois      False
Name: area, dtype: bool

In [9]:
# Second Boolean mask: which states have more than 20,000,000 residents?
large_pop = states['population'].gt(20_000_000)
# Combine the two masks element-wise with & (Python's `and` does not work
# on Series) and keep only the rows where both conditions hold.
states.loc[large_area & large_pop]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662


In [10]:
# The values of a DataFrame can be pulled out as a plain 2-D NumPy array
# (a matrix): one row per index label, one column per DataFrame column.
# to_numpy() is the accessor the pandas documentation recommends over
# the older .values attribute; it returns the same array here.
states.to_numpy()

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]])

## Reading CSV files into __`pandas`__

In [12]:
# Location of the skincancer.csv data file hosted on GitHub
skincancer_url = 'https://raw.githubusercontent.com/davewadestein/Gap-Python-2025/refs/heads/main/Data/skincancer.csv'
# read_csv accepts a URL as well as a local path and returns a DataFrame.
# NOTE(review): the saved output of this cell is an SSL
# CERTIFICATE_VERIFY_FAILED error, which looks like a local
# certificate-store problem rather than a code bug -- confirm and re-run.
data = pd.read_csv(skincancer_url)

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1018)>

In [4]:
# Show the first n rows (here n = 10); head() with no argument shows 5
data.head(10)

Unnamed: 0,State,Lat,Mort,Ocean,Long
0,Alabama,33.0,219,1,87.0
1,Arizona,34.5,160,0,112.0
2,Arkansas,35.0,170,0,92.5
3,California,37.5,182,1,119.5
4,Colorado,39.0,149,0,105.5
5,Connecticut,41.8,159,1,72.8
6,Delaware,39.0,200,1,75.5
7,"Wash, D.C.",39.0,177,0,77.0
8,Florida,28.0,197,1,82.0
9,Georgia,33.0,214,1,83.5


In [None]:
# Show the "shape" of the data as a (number of rows, number of columns) tuple
data.shape

## Don't treat first line as header

In [None]:
# header=None tells read_csv that the file has no header row, so the
# first line is kept as ordinary data and the columns are labeled 0, 1, 2, ...
try:
    data = pd.read_csv('skincancer.csv', header=None)
except FileNotFoundError:
    # No local copy next to the notebook: fall back to the same hosted
    # file that the earlier read_csv cell loads, so the notebook still
    # runs top to bottom on a fresh checkout.
    data = pd.read_csv('https://raw.githubusercontent.com/davewadestein/Gap-Python-2025/refs/heads/main/Data/skincancer.csv', header=None)

In [None]:
# Because the file was read with header=None, the columns are labeled
# 0, 1, 2, ... and the file's own header line shows up as the first data row.
data.head()

In [None]:
# Compare with the earlier shape: the header line is now counted as a
# data row, so there should be one more row than before.
data.shape

## Specify our own headers/column names

In [None]:
# Column labels are writable as well as readable: assigning a list of
# names to .columns relabels every column (the list must contain exactly
# one name per column).
# NOTE(review): the file was re-read with header=None, so its original
# header line is still present as the first data row after this rename --
# confirm whether that row should be dropped.
column_names = 'State Latitude Mortality Ocean Longitude'.split()
data.columns = column_names

In [None]:
# Look at the first rows again to check the new column names
data.head()