# Pandas: Data Manipulation and Analysis in Python

In [2]:
# Make sure pandas is installed (via Anaconda or pip) before running this notebook
import pandas as pd
import numpy as np

### Getting the Data

* Pandas can read Excel, CSV, SQL tables, and fixed-width text files

In [3]:
# Load the spreadsheet into a DataFrame; the path is relative, so "Data.xlsx"
# must be in the notebook's working directory.
df = pd.read_excel("Data.xlsx")

### DataFrames
* Main data structure in Pandas
* A DataFrame is a collection of one-dimensional columns, each of which is a `Series`
* Similar to a table in a SQL database, or a data.frame in R

In [4]:
# Preview the first five rows of the DataFrame
df.head()

Unnamed: 0,form,section_NUM,item,type,correct,incorrect,omits,Total
0,K-50SA10,1,1,MC,80,12,8.0,100
1,K-50SA10,3,1,SPR,65,30,5.0,100
2,K-50SB07,1,1,MC,95,5,,100
3,K-50SC07,2,1,,70,28,2.0,100
4,K-50SB07,1,1,MC,50,37,13.0,100


In [5]:
# shape is a (rows, columns) tuple
print(df.shape)
# info() writes its report directly to stdout and returns None, so call it
# on its own -- wrapping it in print() appends a stray "None" line.
df.info()
# Pandas can infer the schema AND data types.
# Note that the 'omits' column is of floating point type: it contains missing
# values, which pandas represents as NaN (a float).

(16, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 8 columns):
form           16 non-null object
section_NUM    16 non-null int64
item           16 non-null int64
type           14 non-null object
correct        16 non-null int64
incorrect      16 non-null int64
omits          14 non-null float64
Total          16 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 1.1+ KB
None


#### Data Exploration

In [6]:
# Summary statistics for the numeric columns
print(df.describe())
# Restrict the mean to numeric columns: 'form' and 'type' hold strings, and
# pandas >= 2.0 raises a TypeError for them instead of silently skipping them.
print(df.mean(numeric_only=True))

In [7]:
# A column can be accessed as df['column'] or, equivalently, as df.column
# Distinct values found in the 'form' column
print(df.form.unique())
# Number of rows for each distinct 'form' value
print(df.form.value_counts())

['K-50SA10' 'K-50SB07' 'K-50SC07' '5MSA11']
K-50SA10    4
K-50SB07    4
K-50SC07    4
5MSA11      4
Name: form, dtype: int64


In [8]:
# rename() returns a new DataFrame with the column relabelled; df itself is
# not modified and still has the 'section_NUM' column afterwards.
df.head(3).rename(columns={'section_NUM': 'section'})
# You can also add new columns to the DataFrame

Unnamed: 0,form,section,item,type,correct,incorrect,omits,Total
0,K-50SA10,1,1,MC,80,12,8.0,100
1,K-50SA10,3,1,SPR,65,30,5.0,100
2,K-50SB07,1,1,MC,95,5,,100


### Querying/Subsetting a DataFrame
* Similar to SQL (SELECT, FROM, WHERE)

#### Selecting columns

In [13]:
# Select two columns by name (the SQL "SELECT correct, incorrect") and keep
# only the first three rows.
subset = df.loc[:, ['correct', 'incorrect']].head(3)
subset

Unnamed: 0,correct,incorrect
0,80,12
1,65,30
2,95,5


#### Selecting rows

In [33]:
# Keep only the rows whose 'correct' value is at least 80 (the SQL "WHERE")
subset1 = subset.loc[subset['correct'].ge(80)]
print(subset1)
# The surviving rows keep their original index labels; to renumber from 0:
#print(subset1.reset_index(drop=True))

   correct  incorrect
0       80         12
2       95          5


In [35]:
# Combine two row conditions with & (element-wise AND): type is 'MC' and
# omits is at most 10. reset_index(drop=True) renumbers the result from 0.
print(
    df.loc[df['type'].eq('MC') & df['omits'].le(10)]
    .reset_index(drop=True)
)

       form  section_NUM  item type  correct  incorrect  omits  Total
0  K-50SA10            1     1   MC       80         12    8.0    100
1    5MSA11            2     1   MC       75         15   10.0    100
2  K-50SA10            1     2   MC       55         40    5.0    100
3  K-50SB07            1     2   MC       70         28    2.0    100
4    5MSA11            2     2   MC       80         15    5.0    100


#### Aggregate queries with 'groupby'

In [42]:
# Mean 'correct' and 'incorrect' values for each form (the SQL "GROUP BY form")
df.groupby('form')[['correct', 'incorrect']].mean()

Unnamed: 0_level_0,correct,incorrect
form,Unnamed: 1_level_1,Unnamed: 2_level_1
5MSA11,57.5,30.0
K-50SA10,56.25,35.5
K-50SB07,75.0,21.25
K-50SC07,62.5,28.0
