In [None]:
# import necessary modules
import pandas as pd, numpy as np

# Part 1: <br /> Reviewing the Basics

In [None]:
# a list is one of python's basic data types; this one holds the integers 1 through 4
li = list(range(1, 5))
li

In [None]:
# a numpy array holds a sequence of values like a list does, but it is faster and more compact
ar = np.asarray([1, 2, 3, 4])
ar

In [None]:
# an existing list can be converted into a numpy array as well
ar = np.asarray(li)
ar

In [None]:
# a pandas series is built on top of a numpy array - it's fast and compact, and adds more functionality
se1 = pd.Series(data=ar)
se1

In [None]:
# a new Series can be built from a list (or an array), passed in directly or as a variable
# the values don't have to be integers - here they are strings
se2 = pd.Series(data=['a', 'b', 'c', 'd'])
se2

The first "column" is the index (the row labels); the second is the Series' values.

In [None]:
# a series's index can be replaced with a new set of labels, one per value
se2.index = pd.Index(['w', 'x', 'y', 'z'])
se2

In [None]:
# a pandas dataframe is a table whose columns are each a series
# passing a flat list of values produces a dataframe with a single column
df = pd.DataFrame(data=[1, 2, 3, 4])
df

This DataFrame has only a single column, so it holds the same data as a Series (although a DataFrame is always a two-dimensional, table-like object).

In [None]:
# a dict maps labels (its keys) to values - here each label points to its own list
di1 = {
    'column1': [1, 2, 3, 4],
    'column2': [5, 6, 7, 8],
}
di1

In [None]:
# the labelled values in a dict don't all have to be the same type: here one is a list and the other a series
di2 = {
    'column1': li,
    'column2': se1,
}
di2

In [None]:
# a pandas dataframe can hold several columns, each of which is a series
# it can be built from a list, array, series, or dict - with a dict, the keys become the column labels
df = pd.DataFrame(data=di1)
df

this is a two-dimensional DataFrame. Its index contains row labels (0, 1, 2, 3) and its columns are indexed by column labels

In [None]:
# a DataFrame object keeps its row labels in the .index attribute
# and its column labels in the .columns attribute
# print both of them, one per line
print(df.index, df.columns, sep='\n')

In [None]:
# make sure your indices match!
# pandas lines the two series up by index label; a label that exists in only one
# of them gets NaN in the other column
di = {
    'column1': se1,
    'column2': se2,
}
df = pd.DataFrame(data=di)
df

In [None]:
# numpy provides NaN ("not a number") as a placeholder for null values
# it is a floating-point value, not an int, as its printed type shows
x = np.nan
print(x, type(x), sep='\n')

# Part 2: <br /> Working with CSV files

In [None]:
# pandas can read a CSV file straight into a DataFrame
# the column labels are taken from the first row of the data file
df2 = pd.read_csv(filepath_or_buffer='data/pandas-test.csv')

In [None]:
# you can view the first few or the last few rows of a DataFrame with the .head() or .tail() methods
# called without an argument, .head() returns the first 5 rows
df2.head()

In [None]:
# .head() and .tail() return 5 rows by default, or you can ask for a specific number of rows
df2.tail(n=3)

In [None]:
# you can add a new column to a DataFrame by assigning to a new column label
# a single value (here an empty string) is repeated for every row
df2['country'] = ''
df2

In [None]:
# you can update the values of an entire column
# assigning a single value to an existing column label overwrites every row with that value
df2['country'] = 'USA'
df2

In [None]:
# you can set the values of a column (aka, Series) in the DataFrame to a list of values
# the two-item list is repeated 4 times, giving 8 values that alternate 'USA' and 'United States'
# NOTE: this assumes df2 has exactly 8 rows - the list length has to match the row count
df2['country'] = ['USA', 'United States'] * 4
df2

In [None]:
# a pandas series (aka, a column in our dataframe) offers fast vectorized string methods through .str
# note that the result of this call is not assigned to anything
df2['country'].str.replace(pat='United States', repl='USA')
df2

That didn't do anything to our dataframe, because .str.replace() returns the updated version - it doesn't perform the operation in place.

In [None]:
# .str.replace() returns the updated values, so assign them back to the column to keep them
df2['country'] = df2['country'].str.replace(pat='United States', repl='USA')
df2

In [None]:
# you can rename all of the columns at once by assigning a new list of labels
# the list needs one label per existing column, in the same order as the columns
df2.columns = [
    'city_name',
    'state_name',
    'nation',
]
df2

In [None]:
# you can save your DataFrame as a csv file
# index=False leaves the row labels out of the file: they are just the automatic 0, 1, 2, ...
# counter here, and writing them would add an extra unnamed first column that shows up as
# data when the file is loaded again with pd.read_csv()
df2.to_csv('data/my_data.csv', index=False)

# Part 3:<br />Dropping and editing values

In [None]:
# there are lots of ways to create dataframes - this one starts from a list of (city, year) tuples
# np.nan stands in for a missing year
list_of_tuples = [
    ('sf', 2012),
    ('phx', np.nan),
    ('phx', 2005),
    ('chi', 2009),
]
df = pd.DataFrame(data=list_of_tuples, columns=['city', 'year'])

# assigning a single value to a new column label fills every row with that value
df['country'] = 'USA'
df['continent'] = 'North America'
df

In [None]:
# the .drop() method removes rows or columns by label; the axis says which of the two the label refers to
#   axis 0 = rows
#   axis 1 = columns
# inplace is False unless you say otherwise, so df is left alone and a new DataFrame is returned
df2 = df.drop(labels='country', axis=1)
df2

In [None]:
# you can also remove a column from a dataframe with the del statement
# del works in place, so take an explicit copy first: pd.DataFrame(df2) does not copy the
# data of an existing DataFrame by default, and on some pandas versions deleting a column
# from such a wrapper removes it from df2 as well
df3 = df2.copy()
del df3['continent']
df3

del is a Python statement (the parentheses around its target are optional, so `del df3['continent']` works too), whereas .drop() is a method. A method is performed on an object, in this case a DataFrame. A function, such as len(), is independent of an object - it's just a chunk of code that is called by name and allows data to be passed to it by arguments.

In [None]:
# you can use the built-in len() function to check the row count of a DataFrame
# every row is counted, whether or not it contains NaNs
len(df3)

In [None]:
# you can also use the .count() method to check the row count, but it works column by column
# and only counts non-null values, so a column that contains NaNs reports fewer rows than len() does
df3.count()

In [None]:
# or you can use the .shape attribute to get the shape of the DataFrame
# it is a tuple of (number of rows, number of columns)
df3.shape

An attribute is different from a method or a function: it is accessed without parentheses, so you write .shape rather than .shape().

In [None]:
# you can get a count of how many times each distinct value appears in some column
# the result is a Series whose index holds the distinct values
df3['city'].value_counts()

In [None]:
# you can drop rows that repeat a value already seen in some specified column
# the first row for each distinct city is the one that is kept
df4 = df3.drop_duplicates(subset='city')
df4

Look at the index above: it now reads 0, 1, 3. Remember that it's not a row counter; it's an index of row labels, so the label of the dropped row is simply missing.

In [None]:
# back to our earlier dataframe
# df itself was not modified above: .drop() was called with inplace=False and returned a new dataframe
df

In [None]:
# arithmetic on a Series (aka column in our DataFrame) is applied to every value at once
# a missing (NaN) year stays NaN in the new column
df['year5'] = df['year'].add(5)
df