# Pandas Introduction

In [1]:
import numpy as np
import pandas as pd

## Series object

In [2]:
series_obj = pd.Series([31,22,43,44,55])
series_obj

0    31
1    22
2    43
3    44
4    55
dtype: int64

In [3]:
series_obj[0]

31

In [4]:
series_obj = pd.Series([31,22,43,44,55], index = ['a', 'b', 'c', 'd', 'e'], name = "Column_1")
# OR set the labels and the name after construction:
#    series_obj.index = ['a', 'b', 'c', 'd', 'e']
#    series_obj.name = "Column_1"
series_obj

a    31
b    22
c    43
d    44
e    55
Name: Column_1, dtype: int64

In [5]:
series_obj['a']   # like a dictionary

31

In [6]:
series_obj*2

a     62
b     44
c     86
d     88
e    110
Name: Column_1, dtype: int64

In [7]:
over_35 = series_obj>35
over_35

a    False
b    False
c     True
d     True
e     True
Name: Column_1, dtype: bool

In [8]:
series_obj[over_35]

c    43
d    44
e    55
Name: Column_1, dtype: int64

## DataFrame object
### **Create a DataFrame using a dictionary**

In [9]:
# create a DataFrame from a dictionary of equal-length lists (each key becomes a column)
data = {"Name": ["Tim Miller", "Ann Carter", "Ellen Lee"], 
        "Gender": ["Male", "Female", "Female"],
        "Age": [32, 44, 21]}
df = pd.DataFrame(data)
# print(df)  # prints plain text; it does not display as an HTML table
df

Unnamed: 0,Name,Gender,Age
0,Tim Miller,Male,32
1,Ann Carter,Female,44
2,Ellen Lee,Female,21


In [10]:
df.head() # == df.head(5)

Unnamed: 0,Name,Gender,Age
0,Tim Miller,Male,32
1,Ann Carter,Female,44
2,Ellen Lee,Female,21


In [11]:
df.tail()  # == df.tail(5)

Unnamed: 0,Name,Gender,Age
0,Tim Miller,Male,32
1,Ann Carter,Female,44
2,Ellen Lee,Female,21


In [12]:
# Series object
df['Name']     # dictionary notation

0    Tim Miller
1    Ann Carter
2     Ellen Lee
Name: Name, dtype: object

In [13]:
df.Name     # attribute notation; Tab completion

0    Tim Miller
1    Ann Carter
2     Ellen Lee
Name: Name, dtype: object

In [14]:
# assignment by column
df["Birth Year"] = 1999
df["Birth Year"]

0    1999
1    1999
2    1999
Name: Birth Year, dtype: int64

In [15]:
# add a column
df["Married"] = ['Yes', 'Yes', 'No']     # must match the length/index of the DataFrame
df

Unnamed: 0,Name,Gender,Age,Birth Year,Married
0,Tim Miller,Male,32,1999,Yes
1,Ann Carter,Female,44,1999,Yes
2,Ellen Lee,Female,21,1999,No


In [16]:
df["Married"] = 'Yes'
df

Unnamed: 0,Name,Gender,Age,Birth Year,Married
0,Tim Miller,Male,32,1999,Yes
1,Ann Carter,Female,44,1999,Yes
2,Ellen Lee,Female,21,1999,Yes


### **Create a DataFrame using a numpy array**

In [17]:
# wrap a 4x4 numpy array (the integers 0..15, filled row by row) in a DataFrame
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    columns="one two three four".split(),
)
data

Unnamed: 0,one,two,three,four
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [None]:
data.drop("three", axis = 1)   # OR  axis = 'columns'
                            # returns a new object; the original `data` is not changed

In [None]:
data     # unchanged

In [None]:
data.drop('three', axis = 1, inplace = True) # pass inplace = True if you want the original object itself to be changed
                                         # this removes the dropped column from `data`; any data that is dropped is gone

In [None]:
data     # changed

## Selection and Filtering
### Select Columns

In [None]:
# build a 10x10 DataFrame holding 0..99 (filled row by row) with columns labelled 'a' through 'j'
data = pd.DataFrame(
    np.arange(100).reshape(10, 10),
    columns=list("abcdefghij"),
)
data

In [None]:
data['a']

In [None]:
data[["a", "e", "j"]]    # providing a single value or a list selects columns

In [None]:
# boolean selection
data[data["j"] > 40]

### Select rows

In [None]:
data[:1]     # use slice syntax to select rows

In [None]:
data[5:9]

### Selection with loc and iloc
Allows you to select a subset of the rows and columns

In [None]:
data

In [None]:
# loc implies the name/label of the row and column
data.loc[1:5,"b"]

In [None]:

data.loc[:5, 'a':'e']     # consecutive (loc selection is inclusive)

In [None]:
data.loc[:5, ['c', 'f', 'i']]     # not consecutive

In [None]:
# iloc selects by integer position (the end of an iloc slice is exclusive, unlike loc)
data.iloc[:5, 2:5]

In [None]:
data.iloc[4]  # gives you a row, assumes all of the columns

In [None]:
data.iloc[[5, 0, 3], [9, 5, 0]]    # returns them in the order listed

In [None]:
df.columns   # returns the names of the columns within the DataFrame

### Descriptive and summary statistics

In [None]:
# import the iris data
iris_data = pd.read_csv("iris.csv", names = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"])

In [None]:
iris_data.head()  # head() returns the first 5 rows by default; tail() returns the bottom 5

In [None]:
iris_data.describe()

In [None]:
iris_data.columns

In [None]:
iris_data["class"]

In [None]:
set(iris_data["class"])

In [None]:
iris_data["class"].describe()      # non-numerical data

In [None]:
# some Descriptive and Summary statistics  (min, max, idxmin, idxmax, mean, median, std, count, corr)
iris_data.min()   

In [None]:
iris_data["sepal_l"].head(20)

In [None]:
# .count() returns the number of non-null items in an object
iris_data.loc[iris_data['sepal_l'] > 4.9, "sepal_l"].count()

In [None]:
# you can also use the built-in python function len() to return the length of an object
len(iris_data[iris_data['sepal_l'] > 4.9])

In [None]:
# use unique() to get the distinct values within a Series/column (nunique() returns how many there are).
iris_data["class"].unique()

In [None]:
# use value_counts() to get the quantity of each unique item within a Series/column.
iris_data["class"].value_counts()

### Transforming and Cleaning Data

In [None]:
# create a new DataFrame
data = pd.DataFrame({'age': [0, 26, 41, 0], 'gender': ["Male", "Female", "Female", "Female"]})
data

In [None]:
# Transform categorical variables into binary (discrete) variables
# map enables convenient element-wise transformations

data['gender'] = data['gender'].map({'Male': 0, 'Female': 1})
data

In [None]:
data['gender'].sum()

In [None]:
data.mean(axis=0)

In [None]:
# replace values 
# nan ("not a number") values are not used in calculating the mean, etc.
 
data['age'] = data['age'].replace(0, np.nan)
data

# replace provides flexibility
# can also pass a list of multiple values to replace, e.g. replace([0, -1], np.nan)
# can provide a different replacement for each value, e.g. replace([0, -1], [np.nan, 1])

In [None]:
data.mean()  # axis = 0 is the default

In [None]:
# print formatting
# the number preceding the colon is the index position (0 is the default) of the format() argument
# that you want printed; here 0 selects the mean, so the second argument (99) is never printed.
# following the colon are the formatting instructions:
# .1f prints the float with one place following the decimal point.
    
print("The mean age is: {0:.1f}".format(data["age"].mean(), 99))

### Null (NaN) values

In [None]:
# nan can be used as a sentinel to drop or impute/replace a value

data['age'].isnull()    # notnull() gives the opposite boolean mask

In [None]:
mask = data['age'].notnull() 
mask

In [None]:
data.loc[mask, 'age']

### Impute missing values

In [None]:
data.loc[data['age'].isnull(),'age'] = data['age'].mean()
data

### Boolean Selection 

In [None]:
iris_data.shape

In [None]:
iris_data[iris_data['sepal_w'] < 3]

In [None]:
# isin() is a boolean check of whether each item within a column is included in a list
iris_data["class"].isin(['Iris-setosa'])

In [None]:
# include only the items that are included within a given list of items
mask = iris_data["class"].isin(['Iris-setosa', 'Iris-virginica'])
iris_data[mask]

## Boolean selection within pandas

In [18]:
# And
# NOTE: iris_data is created by the pd.read_csv("iris.csv", ...) cell earlier in this notebook;
# run that cell first (or restart the kernel and run all cells), otherwise this raises NameError.
iris_data.loc[(iris_data['sepal_w'] < 3) & (iris_data['sepal_l'] > 5)].head()

NameError: name 'iris_data' is not defined

In [None]:
# Following the selection of rows, list the columns to show.

iris_data.loc[(iris_data['sepal_w'] < 3) & (iris_data['sepal_l'] > 5), ['sepal_w','class']].head() # [:5] 

In [None]:
# Or
iris_data.loc[(iris_data['sepal_w'] < 2.5) | (iris_data['sepal_w'] > 3.5)]

In [None]:
# Following the selection of rows, list the series of columns to show (inclusive).

iris_data.loc[(iris_data['sepal_w'] < 2.5) | (iris_data['sepal_w'] > 3.5), 'sepal_l':'petal_w']


In [None]:
# return only the values, not the DataFrame
   # capital X usually indicates all of the features that the algorithm will be given to learn from

X = iris_data.loc[(iris_data['sepal_w'] < 2.5) | (iris_data['sepal_w'] > 3.5), 'sepal_l':'petal_w'].values
X

In [None]:
# to return just the unique values use set() or .unique()

set(iris_data['class'])

In [None]:
iris_data['class'].unique()

In [None]:
# lowercase y usually indicates the classifications that you would like to predict.
    # Most algorithms require strings to be converted to numbers (you can use .map() to accomplish this).


y = iris_data['class'].map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})
y

## Archive cleaned and transformed DataFrame

In [None]:
# to save your cleaned data to file
# index=False keeps the default 0..n-1 row index out of the file; otherwise it is written
# as an extra unnamed first column that shows up as "Unnamed: 0" when the CSV is read back
df.to_csv("new_filename.csv", index=False)