# Urban Informatics
# Module 03: Intro to pandas, Part I

In [None]:
import numpy as np
import pandas as pd

## 1. Introduce numpy arrays

In [None]:
# a python list is a basic data type
my_list = [1, 2, 3, 4]
my_list

In [None]:
# a numpy array is like a list, but faster and more compact
my_array = np.array([1, 2, 3, 4])
my_array

In [None]:
# you can create a numpy array from an existing list too
my_array = np.array(my_list)
my_array

In [None]:
# twelve rainfall readings, first kept as a plain python list
rain_list = [
    5.3, 5.4, 4.8, 4.7, 3.3, 1.2,
    0.8, 0.0, 0.2, 3.9, 4.5, 5.9,
]

# the same readings as a numpy array, used by the math functions below
rain = np.array(rain_list)

In [None]:
type(rain)

In [None]:
rain.dtype

numpy has several mathematical functions built-in. Here are some examples.

In [None]:
np.mean(rain)

In [None]:
round(np.mean(rain), 1)

In [None]:
# standard deviation; numpy defaults to the population formula (ddof=0)
np.std(rain)

In [None]:
# variance; numpy defaults to the population formula (ddof=0)
np.var(rain)

In [None]:
np.median(rain)

In [None]:
np.min(rain)

In [None]:
np.max(rain)

In [None]:
np.sum(rain)

In [None]:
# look at the original rainfall list
rain_list

In [None]:
# multiply it by two: for a python list, * repeats the list (doubling its
# length) rather than doubling each value
rain_list * 2

In [None]:
# look at the rainfall array
rain

In [None]:
# multiply it by two: for a numpy array, * works element-wise, so each value
# is doubled and the length stays the same
rain * 2

## 2. Introduce pandas Series

In [None]:
# a pandas series is based on a numpy array - it's fast, compact, and has more functionality
# perhaps most notably, it has an index
my_list = [1, 2, 3, 4]
my_series = pd.Series(my_list)
my_series

In [None]:
# you can create a new Series by passing in a list variable or array
# a series can contain data types other than just integers
series2 = pd.Series(['a', 'b', 'c', 'd'])
series2

In [None]:
series2.index

In [None]:
series2.values

In [None]:
type(series2.values)

In [None]:
# you can change a series's index
series2.index = ['w', 'x', 'y', 'z']
series2

## 3. Introduce pandas DataFrames

In [None]:
# a pandas dataframe is like a table where each column is a series
df = pd.DataFrame([1, 2, 3, 4])
df

This DataFrame has just one column, so it looks similar to a Series (though a DataFrame is still a two-dimensional table with column labels).

In [None]:
# a dict maps labels (keys) to values - here each key labels a list
my_dict = {
    'variable_a': [1, 2, 3, 4],
    'variable_b': [5, 6, 7, 8],
}
my_dict

In [None]:
# a pandas dataframe can contain multiple columns/series
# you can create a dataframe by passing in a list, array, series, or dict
df = pd.DataFrame(my_dict)
df

In [None]:
# the row labels in the index are accessed by the .index attribute of the DataFrame object
df.index

In [None]:
df.index.tolist()

In [None]:
# the column labels are accessed by the .columns attribute of the DataFrame object
df.columns

In [None]:
# the data values are accessed by the .values attribute of the DataFrame object
df.values

In [None]:
# make sure your indices match! pandas aligns the series by index label:
# my_series is labeled 0-3 while series2 is labeled w-z, so no rows line up
# and the gaps are filled with NaN
dict2 = {
    'column1': my_series,
    'column2': series2,
}
df = pd.DataFrame(data=dict2)
df

In [None]:
# numpy provides NaN ("not a number"): it is a special float value, not a
# separate datatype, and pandas uses it to represent null/missing data
x = np.nan
print(type(x))
x

## 4. Working with CSV files

Notice what pandas's `read_csv` function does:

1. recognize the header row and get its variable names
1. read all the rows and construct a pandas DataFrame, an assembly of pandas Series
1. construct a unique index, beginning with zero
1. infer the data type of each variable (i.e., column)

In [None]:
# pandas can load CSV files as DataFrames - it pulls column labels from the first row of the data file
df = pd.read_csv('data/rain.csv') # path relative to notebook file
df

In [None]:
# dataframe shape as rows, columns
df.shape

In [None]:
# datatypes of the columns
df.dtypes

#### We can select subsets of the rows by indexing, and select specific columns by their name:

In [None]:
# a column is a pandas series
type(df['rainfall_inches'])

In [None]:
# so is a row
type(df.loc[0])

In [None]:
# view a column
df['rainfall_inches']

In [None]:
# sort the values
df.sort_values(by='rainfall_inches', ascending=False)

In [None]:
# view the "head" of the dataframe
df.head()

In [None]:
# or view its "tail"
df.tail()

In [None]:
# first 6 items in a column
df['rainfall_inches'][:6]

In [None]:
# final 6 rows in a dataframe
# a negative start counts back from the end, so this selects the last 6 rows
# no matter how many rows the dataframe has
df.iloc[-6:]

In [None]:
# summary descriptive stats
df['rainfall_inches'].describe()

pandas silently handled the missing value for September and still gave the correct statistical results. These are essentially NumPy functions, but in pandas we can now deal with multiple data types and columns.

In [None]:
df['rainfall_inches'].min()

In [None]:
df['rainfall_inches'].max()

In [None]:
# index label of the row that holds the maximum value
df['rainfall_inches'].idxmax()

In [None]:
df['rainfall_inches'].median()

In [None]:
df['rainfall_inches'].mean()

In [None]:
# standard deviation; pandas defaults to the sample formula (ddof=1),
# whereas np.std defaults to the population formula (ddof=0)
df['rainfall_inches'].std()

In [None]:
# now it's your turn
# how would you compute the total rainfall inches between march and august?


#### More DataFrame functionality

In [None]:
# load a new data file
df2 = pd.read_csv('data/cities.csv')

In [None]:
df2.shape

In [None]:
# you can view the first few or the last few rows of a DataFrame with the .head() or .tail() methods
df2.head(4)

In [None]:
# you can add a new (empty) column to a DataFrame
df2['country'] = np.nan
df2

In [None]:
# you can update the values of an entire column all at once
df2['country'] = 'USA'
df2

In [None]:
# you can set the values of a column (aka, Series) in the DataFrame to a list of values
# the list must supply exactly one value per row: here the 2-item list is
# repeated 4 times to give 8 values (assumes cities.csv has 8 rows - TODO confirm)
df2['country'] = ['USA', 'United States'] * 4
df2

In [None]:
# you can use fast vectorized methods on a pandas series (aka, a column in our dataframe)
df2['country'].str.replace('United States', 'USA')
df2

That didn't do anything to our dataframe because `.str.replace()` returns the updated version; it doesn't perform the operation in place.

In [None]:
# we need to capture the updated values when they get returned
df2['country'] = df2['country'].str.replace('United States', 'USA')
df2

In [None]:
# you can change the column names
df2.columns = ['city_name', 'state_name', 'nation']
df2

In [None]:
# or just rename a single column, passing a dict into the rename() method
df2 = df2.rename(columns={'city_name':'city'})
df2

In [None]:
# you can drop a column: drop() returns a new DataFrame without it, and df2
# itself stays unchanged because the result is not re-assigned here
df2.drop(columns=['nation'])

In [None]:
# you can save your DataFrame as a csv file
# index=False leaves out the auto-generated row index, so it isn't written to
# the file as an extra unnamed column
df2.to_csv('data/my_new_dataset.csv', index=False)

In [None]:
# now it's your turn
# rename all the columns to new names and save to disk as a new file


### Filtering on values

You can easily filter a dataframe for one or more conditions based on the values in a column. Below we filter df to select only months with less than 3 inches of rainfall.  

In [None]:
df[df['rainfall_inches'] < 3]

In [None]:
# what exactly did that do?
df['rainfall_inches'] < 3

In [None]:
# essentially a true/false mask that filters by value
mask = df['rainfall_inches'] < 3
df[mask]

In [None]:
pd.isnull(df)

In [None]:
pd.notnull(df['rainfall_inches'])

You can also select rows based on the values of more than one column. Just remember to wrap each individual condition in parentheses.

In [None]:
df[(df['month'] != 'jul') & (pd.notnull(df['rainfall_inches']))]

What's that funny ampersand doing there? We'll learn more next week!

In [None]:
# now it's your turn
# what is the average rainfall in months with at least 2 inches of rain?


### Editing strings and changing data types

In [None]:
df['rainfall_inches']

In [None]:
# count non-nan cells
df['rainfall_inches'].count()

In [None]:
# count all cells, including nans
len(df['rainfall_inches'])

In [None]:
# try converting to int: this is expected to raise an error while the column
# still contains a missing value, because NaN cannot be represented as an integer
df['rainfall_inches'].astype(int)

In [None]:
# fill missing values with a default value (or you could drop them instead!)
df['rainfall_inches'] = df['rainfall_inches'].fillna(0)

In [None]:
# now convert to int
df['rainfall_inches'].astype(int)

In [None]:
# remember to re-assign to capture the output!
df['rainfall_inches'] = df['rainfall_inches'].astype(int)

In [None]:
df[df['month'].str.contains('j')]

In [None]:
# you can do stats on a filtered subset
df[df['month'].str.contains('j')]['rainfall_inches'].mean()

In [None]:
# a small dataframe of messy price strings to practice cleaning
df = pd.DataFrame(
    {
        'price': ['$10.00', '$5.53', '7 dollars'],
        'store': ['CVS', 'Walgreens', 'CVS'],
    }
)
df

In [None]:
df['store'].unique()

In [None]:
# try converting to float: this raises a ValueError because strings such as
# '$10.00' and '7 dollars' cannot be parsed as numbers
df['price'].astype(float)

In [None]:
# strip the currency symbol and the unit text so only the number remains
# regex=False makes '$' a literal character: read as a regular expression it
# is the end-of-string anchor, and pandas versions before 2.0 defaulted to
# regex=True
df['price'] = (
    df['price']
    .str.replace('$', '', regex=False)
    .str.replace('dollars', '', regex=False)
    .str.strip()
)
df

In [None]:
# with the symbols and text stripped out, the strings now parse as numbers
df['price'].astype(float)