# Pandas

## Importing modules

In [1]:
import pandas as pd
import numpy as np

## Create dataframe

In [15]:
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', ".", 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, ".", "."],
        'postTestScore': ["25,000", "94,000", 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25000
1,Molly,Jacobson,52,24,94000
2,Tina,.,36,31,57
3,Jake,Milner,24,.,62
4,Amy,Cooze,73,.,70


## Saving dataframe as csv

In [17]:
df.to_csv('example.csv')b

## Load a CSV

In [20]:
df = pd.read_csv('example.csv')
df

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4,25000
1,1,Molly,Jacobson,52,24,94000
2,2,Tina,.,36,31,57
3,3,Jake,Milner,24,.,62
4,4,Amy,Cooze,73,.,70


### Without Headers

In [23]:
df = pd.read_csv('example.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,,first_name,last_name,age,preTestScore,postTestScore
1,0.0,Jason,Miller,42,4,25000
2,1.0,Molly,Jacobson,52,24,94000
3,2.0,Tina,.,36,31,57
4,3.0,Jake,Milner,24,.,62
5,4.0,Amy,Cooze,73,.,70


### While specifying column names

In [31]:
df = pd.read_csv('example.csv', names=['UID', 'Pehla naam', 'aakhri naam', 'Umar', 'Pre-Test Score', 'Post-Test Score'])
df

Unnamed: 0,UID,Pehla naam,aakhri naam,Umar,Pre-Test Score,Post-Test Score
0,,first_name,last_name,age,preTestScore,postTestScore
1,0.0,Jason,Miller,42,4,25000
2,1.0,Molly,Jacobson,52,24,94000
3,2.0,Tina,.,36,31,57
4,3.0,Jake,Milner,24,.,62
5,4.0,Amy,Cooze,73,.,70


### With Index column to UID

In [27]:
df = pd.read_csv('example.csv', index_col='UID', names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df

Unnamed: 0_level_0,First Name,Last Name,Age,Pre-Test Score,Post-Test Score
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,first_name,last_name,age,preTestScore,postTestScore
0.0,Jason,Miller,42,4,25000
1.0,Molly,Jacobson,52,24,94000
2.0,Tina,.,36,31,57
3.0,Jake,Milner,24,.,62
4.0,Amy,Cooze,73,.,70


### Setting the index columns to First Name and Last name

In [33]:
df = pd.read_csv('example.csv', index_col=['First Name', 'Last Name'], names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,UID,Age,Pre-Test Score,Post-Test Score
First Name,Last Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
first_name,last_name,,age,preTestScore,postTestScore
Jason,Miller,0.0,42,4,25000
Molly,Jacobson,1.0,52,24,94000
Tina,.,2.0,36,31,57
Jake,Milner,3.0,24,.,62
Amy,Cooze,4.0,73,.,70


### Specifying"." as null values

In [41]:
df = pd.read_csv('example.csv', na_values=['.'])
pd.isnull(df)


Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,True,False
4,False,False,False,False,True,False


In [42]:
df = pd.read_csv('example.csv', na_values=['.'])
df

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4.0,25000
1,1,Molly,Jacobson,52,24.0,94000
2,2,Tina,,36,31.0,57
3,3,Jake,Milner,24,,62
4,4,Amy,Cooze,73,,70


### Specifying "." and "NA" as missing values i the Last Name column and "." as missing values in Pre-Test Score Column

In [39]:
sentinels = {'Last Name': ['.', 'NA'], 'Pre-Test Score': ['.']}
df = pd.read_csv('example.csv', na_values=sentinels)
df

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4,25000
1,1,Molly,Jacobson,52,24,94000
2,2,Tina,.,36,31,57
3,3,Jake,Milner,24,.,62
4,4,Amy,Cooze,73,.,70


### Skipping top 3 rows

In [44]:
df = pd.read_csv('example.csv', na_values=sentinels, skiprows=3)
df

Unnamed: 0,2,Tina,.,36,31,57
0,3,Jake,Milner,24,.,62
1,4,Amy,Cooze,73,.,70


### interpreting "," in strings around numbers as thousan seperators

In [47]:
df = pd.read_csv('example.csv', thousands=',')
df

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4,25000
1,1,Molly,Jacobson,52,24,94000
2,2,Tina,.,36,31,57
3,3,Jake,Milner,24,.,62
4,4,Amy,Cooze,73,.,70
