# Pandas Tutorial 
### Pandas is a Python library commonly used for analyzing, filtering and manipulating data, especially tabular ("table like") data

In [0]:
import pandas as pd
from google.colab import files
import numpy as np
import matplotlib.pyplot as plt

### Dataset for tutorial and problem set:

    Pima Indian Heritage Diabetes

    Each person in the dataset is a female who is at least 21 years old
    
  #### Below are the variables for each person in the dataset
      'preg': number of pregnancies  
      'plas': plasma glucose concentration 
      'pres': blood pressure 
      'skin': skin thickness
      'test': Insulin
      'mass': BMI
      'pedi': diabetes pedigree function
      'age': age
      'class': '0' means does not have diabetes and '1' means has diabetes

Go to the link below to see the raw comma separated values (csv) data:

https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv


#### Read csv with pandas

In [0]:
# Location of the raw comma separated values file
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Column labels are supplied explicitly through `names`, in the order the variables are listed above
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]

# read_csv downloads and parses the file into a dataframe
data = pd.read_csv(url, names=names)

In [4]:
type(data)  # the DataFrame is pandas' primary data structure

pandas.core.frame.DataFrame

In [5]:
data.head(4)  # take a peek at the first 4 rows of the dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [6]:
data.shape   # (rows, columns) tuple of the dimensionality, just like numpy

(768, 9)

In [7]:
data.count()  # number of non-missing values in each column; explore the documentation and use tab completion to find other functions and attributes of dataframes

preg     768
plas     768
pres     768
skin     768
test     768
mass     768
pedi     768
age      768
class    768
dtype: int64

In [8]:
# shape is (rows, columns): one row per person, one column per variable
n_people, n_variables = data.shape
f"There are {n_people} people in the study with {n_variables} variables each "

'There are 768 people in the study with 9 variables each '

In [9]:
data.describe()  # a dataframe function which returns the basic summary statistics shown below (count, mean, std, min, quartiles, max) for each column

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


#### Slicing

 iloc: indexing with integers similar to numpy 
 basic format: i:j:k where i is the starting index, j is the stopping index (not included in the result), and k is the step

In [10]:
data.iloc[0:3]  # equivalent to data.iloc[0:3:1]; the stop position 3 is excluded, so rows 0, 1 and 2 are returned

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [11]:
# columns can be sliced by position as well (rows first, then columns)
data.iloc[0:3, 7:]  # equivalent to data.iloc[0:3:1, 7:9:1], i.e. the last two of the 9 columns

Unnamed: 0,age,class
0,50,1
1,31,0
2,32,1


 loc: indexing mainly with labels  
same basic i:j:k format but with labels; unlike iloc, the stopping label j is included in the result

In [12]:
data.loc[0:2, 'pres':'class']  # don't be fooled by the 0:2: these are row labels (which happen to be integers), not positions as in .iloc, and unlike .iloc the stop label 2 is included

Unnamed: 0,pres,skin,test,mass,pedi,age,class
0,72,35,0,33.6,0.627,50,1
1,66,29,0,26.6,0.351,31,0
2,64,0,0,23.3,0.672,32,1


#### Boolean Indexing/Masking

In [0]:
# Boolean pandas Series: one True/False value per row of the dataframe,
# True where that row's 'age' is greater than 39
data_age_boolean = data['age'] > 39

In [14]:
data['age'].head(3)  # of the first three ages, only the first value is greater than 39

0    50
1    31
2    32
Name: age, dtype: int64

In [15]:
data_age_boolean.head(3)  # so only the first value of the mask is True

0     True
1    False
2    False
Name: age, dtype: bool

In [16]:
data_age_boolean.count()  # one True/False entry per row, so the same size as the data dataframe

768

In [0]:
# Keep only the rows where the boolean mask is True, i.e. people older than 39
filtered_dataframe_by_age = data.loc[data['age'] > 39]

In [18]:
# the number of rows left after filtering is the number of people older than 39
f"{len(filtered_dataframe_by_age)} people are older than 39 in this study"

'207 people are older than 39 in this study'

In [19]:
# each condition needs its own parentheses because & binds more tightly than the comparison operators
data[(data['age'] > 39) & (data['preg'] > 2)].head(3)  # use '|' instead of & for an or operation 

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
9,8,125,96,0,0,0.0,0.232,54,1
12,10,139,80,0,0,27.1,1.441,57,0


In [20]:
# True only for rows that satisfy both conditions
older_with_pregnancies = (data['age'] > 39) & (data['preg'] > 2)
f"{len(data[older_with_pregnancies])} people are older than 39 and have had more than 2 pregnancies in this study"

'172 people are older than 39 and have had more than 2 pregnancies in this study'

#### Accounting for missing values

There are a few simple ways to handle missing values in Pandas: 


*   Delete rows with missing values
*   Fill missing values with appropriate or neutral value
* Note: NaN in pandas stands for "not a number" and is how missing entries are marked



The appropriate way of handling missing values depends on the context, so it is important to have all the methods listed above available in your coding arsenal

In [21]:
# Small frame with one missing entry per column; pandas stores both None and
# np.nan as NaN in these numeric columns
example_data = pd.DataFrame({
    'col1': [1, None, 2],
    'col2': [3, 4, np.nan],
})
example_data

Unnamed: 0,col1,col2
0,1.0,3.0
1,,4.0
2,2.0,


In [0]:
example_data['col1'].mean()  # handles as you would expect and ignores the missing value: the mean of 1 and 2 is 1.5

1.5

In [0]:
example_data['col1'].max() # the missing value is ignored here as well, so the max is 2

2.0

In [0]:
example_data.dropna()  # returns a new dataframe keeping only the rows without any NaN; example_data itself is not modified

Unnamed: 0,col1,col2
0,1.0,3.0


In [0]:
example_data.fillna(-1)  # returns a new dataframe with every NaN replaced by -1; pick a fill value appropriate for the context

Unnamed: 0,col1,col2
0,1.0,3.0
1,-1.0,4.0
2,2.0,-1.0
