# What is pandas? 
pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
built on top of the Python programming language.

In [None]:
import pandas as pd
import numpy as np

# Data structures in pandas
- Series - "column"
- DataFrame - "table"

In [None]:
# Raw Python lists: one entry per student, kept in matching order.
name = ["Ah Meng", "Abu", "Ahmad", "Siva", "Ah Kau"]
grades = ["A", "B", "C", "A", "A"]

# Each list becomes a Series (a single "column") ...
name_series, grades_series = pd.Series(name), pd.Series(grades)

# ... and the Series are combined into a DataFrame (a "table"),
# keyed by the label each column should carry.
student_df = pd.DataFrame(
    data={
        "name": name_series,
        "grade": grades_series,
    }
)
student_df


# Index
- Used to reference (label) the rows
- By default, the index is numeric, starting from 0
- Any column (Series) can be set as the index

In [None]:
# Use the "name" column as the row labels. set_index returns a new
# DataFrame, so student_df itself keeps its default numeric index.
student_df_name_index = student_df.set_index(keys="name")
student_df_name_index

# Locating data
- “Coordinate system” for a DataFrame
- By referencing the row and column, we can get to the data that we want
- Rows and columns can be referenced by numeric position, or by label (word keys)
- `df.iloc` (position-based) and `df.loc` (label-based)


In [None]:
# iloc is position-based: [row position, column position].
# Column position 1 is "grade".
ahmeng_grade = student_df.iloc[0, 1] # A
abu_grade = student_df.iloc[1, 1] # B
all_grades = student_df.iloc[:, 1]  # every row of the grade column
print("Ah Meng's grade", ahmeng_grade)
print("Abu's grade", abu_grade)
print("All Grades")
print(all_grades)

In [None]:
# The same cells, this time addressed by label with loc: [row label, column label].
print(ahmeng_grade == student_df.loc[0, "grade"])
print(abu_grade == student_df.loc[1, "grade"])
# Comparing two Series is elementwise, so this prints one boolean per row.
print(all_grades == student_df.loc[:, "grade"])

In [None]:
# Boolean mask: True on every row whose grade is "A".
print(student_df["grade"] == "A")
# Indexing the DataFrame with that mask keeps only the True rows.
student_with_a = student_df[student_df["grade"] == "A"]
student_with_a

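## Difference between iloc and loc
- `iloc` slices by position and excludes the end (`0:3` → positions 0, 1, 2); `loc` slices by label and includes the end (`0:2` → labels 0, 1, 2)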

In [None]:
# iloc slices by position and excludes the end position,
# so 0:3 selects row positions 0, 1 and 2 of column position 0 ("name").
ahmeng_to_ahmad = student_df.iloc[0:3, 0]
ahmeng_to_ahmad

In [None]:
# loc slices by label and includes the end label,
# so 0:2 selects the rows labelled 0, 1 and 2 of the "name" column.
ahmeng_to_ahmad_take_two = student_df.loc[0:2, "name"]
ahmeng_to_ahmad_take_two

# Reading from a datasource
- Load data from CSV, SQL or JSON sources
- e.g. `pd.read_csv`

In [None]:
# Load a CSV file into a DataFrame.
# NOTE(review): relative path, presumably the sample_data/ folder that
# Google Colab provides; adjust it when running elsewhere.
california_housing = pd.read_csv("sample_data/california_housing_test.csv")
california_housing

# Adding and dropping columns
- Adding new columns is similar to adding a new key-value pair in dict
- Dropping columns (or rows) with the DataFrame `drop` method

In [None]:
# Assigning to a new column label adds the column, much like a dict key.
# The value is a boolean Series: True where the grade is "A".
student_df["passed"] = student_df["grade"].eq("A")

student_df

In [None]:
# drop returns a new DataFrame, so the result is assigned back.
# columns="passed" removes the column with that label.
student_df = student_df.drop(columns="passed")
student_df

In [None]:
# The same method drops rows: index=0 removes the row labelled 0.
student_df = student_df.drop(index=0)
student_df

# Merging dataframes

In [None]:
# Two small frames that share the column "b", used by the examples below.
df_a = pd.DataFrame(
    data={
        "a": [1, 2, 3],
        "b": [3, 4, 5],
    }
)
df_b = pd.DataFrame(
    data={
        "b": [3, 4, 5],
        "c": [1, 2, 3],
    }
)

In [None]:
# Place the frames side by side, aligned on the row index.
# Both "b" columns are kept, so the result has a duplicated column label.
pd.concat([df_a, df_b], axis="columns")

In [None]:
# Join the frames on matching values of "b" (an inner join by default),
# so the shared column appears only once in the result.
pd.merge(df_a, df_b, on="b")

# Too many rows
- Get the first n rows or last n rows
- head, tail

In [None]:
# (number of rows, number of columns)
california_housing.shape

In [None]:
# Show only the first three rows.
california_housing.head(n=3)

In [None]:
# Show the last rows; with no argument, tail returns the final 5.
california_housing.tail()

# Data exploration
- df.info
- df.describe

In [None]:
# Print a summary: column names, non-null counts, dtypes and memory usage.
california_housing.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for numeric columns.
california_housing.describe()

In [None]:
# Pairwise Pearson correlation between the columns.
california_housing.corr(method="pearson")

In [None]:
# Distinct grade values, in order of first appearance.
student_df["grade"].unique()

In [None]:
# How many students received each grade.
student_df["grade"].value_counts()

# Missing Values

In [None]:
# Add a numeric column. The list needs exactly one value per remaining row
# (four here, because row 0 was dropped earlier). np.nan is a missing mark.
student_df["mark"] = [96, 79, 68, np.nan]
student_df

In [None]:
# Show the frame without rows that contain a missing value.
# The result is not assigned, so student_df itself is unchanged.
student_df.dropna()

In [None]:
# Show the frame with every missing value replaced by -1.
# The result is not assigned, so student_df itself is unchanged.
student_df.fillna(value=-1)

# Aggregation

In [None]:
# Per grade, count the non-missing values in each remaining column.
student_df.groupby(by="grade").count()

In [None]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs, one per grade.
for idx, gp in student_df.groupby("grade"):
    print(idx, gp)

# Exporting datasets

In [None]:
# Write the DataFrame to a CSV file. By default the index is written as the
# first column (pass index=False to leave it out).
# NOTE(review): assumes the sample_data/ directory already exists; confirm.
student_df.to_csv("sample_data/student_data.csv")

In [None]:
# Convert to a nested dict; the default layout is {column: {index: value}}.
student_df.to_dict()

# Further readings and learning
- Kaggle tutorial - https://www.kaggle.com/learn/pandas
- pandas documentation - https://pandas.pydata.org/
- pandas cheatsheet [here](http://datacamp-community-prod.s3.amazonaws.com/f04456d7-8e61-482f-9cc9-da6f7f25fc9b) by datacamp.com