# What is Pandas?
python library for data manipulation and analysis

In [3]:
import pandas as pd
data_frame = pd.read_csv('data/friend_list.csv')

# What is DataFrame?
dataframe is a 2-dimensional labeled data structure with columns

In [4]:
data_frame.tail()

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


# What is Series?
Every single column in dataframe is series

In [5]:
type(data_frame.age)

pandas.core.series.Series

In [6]:
data_frame.job = data_frame.job.str.upper()
data_frame.head()

Unnamed: 0,name,age,job
0,John,20,STUDENT
1,Jenny,30,DEVELOPER
2,Nate,30,TEACHER
3,Julia,40,DENTIST
4,Brian,45,MANAGER


A **Series** is essentially a labeled (indexed) wrapper around a one-dimensional sequence such as a python list

In [7]:
# pd.Series is the public constructor; pd.core.series is an internal module path
s1 = pd.Series(['one', 'two', 'three'])
s2 = pd.Series([1, 2, 3])
# each Series becomes one column and the keyword names become the column names
pd.DataFrame(data=dict(word=s1, num=s2))

Unnamed: 0,word,num
0,one,1
1,two,2
2,three,3


# Why Pandas?

Very similar to Excel spreadsheet view,  
support various functions for data manipulation and analysis.  
Fast based on Numpy.  
Easy to manipulate data for your purpose

# Read File to DataFrame
A **Data frame** is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns.

by default, pandas support csv format

In [8]:
df = pd.read_csv('data/friend_list.csv')

In [9]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


you can read txt file like below, if the txt file data are comma separated

In [10]:
df = pd.read_csv('data/friend_list.txt')

In [11]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


if the txt file delimiter is not a comma, you can define the delimiter using the `delimiter` keyword argument

In [12]:
df = pd.read_csv('data/friend_list_tab.txt', delimiter = "\t")

In [13]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


if the data file doesn't have a header,  
use header = None like below, so the first row is not used as your column header

In [18]:
# header=None: the file has no header row, so every line is read as data and
# the columns get the default integer labels 0, 1, 2
df = pd.read_csv('data/friend_list_no_head.csv', header = None)
# without header=None the first data line would be consumed as the column names:
#df = pd.read_csv('data/friend_list_no_head.csv')

In [19]:
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


you can add column header after you create dataframe

In [20]:
df.columns = ['name', 'age', 'job']

In [21]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


you can create column header for no header data at once

In [22]:
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name', 'age', 'job'])

In [23]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


# Create DataFrame
when you want to create dataframe from your python code

## from dictionary

In [24]:
# one dict per friend: each dict becomes a row and its keys become the columns
friend_dict_list = [
    dict(name='Jone', age=20, job='student'),
    dict(name='Jenny', age=30, job='developer'),
    dict(name='Nate', age=30, job='teacher'),
]
df = pd.DataFrame(data=friend_dict_list)

In [25]:
df.head()

Unnamed: 0,name,age,job
0,Jone,20,student
1,Jenny,30,developer
2,Nate,30,teacher


if you need fixed column order, you can adjust column order like below,

In [30]:
df = df[['name', 'age', 'job']]

In [31]:
df.head()

Unnamed: 0,name,age,job
0,Jone,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## from OrderedDict
OrderedDict helps you to have fixed column order at once

In [32]:
from collections import OrderedDict

In [33]:
# columns are inserted in the order the dataframe should keep them
friend_ordered_dict = OrderedDict()
friend_ordered_dict['name'] = ['John', 'Jenny', 'Nate']
friend_ordered_dict['age'] = [20, 30, 30]
friend_ordered_dict['job'] = ['student', 'developer', 'teacher']

# each key becomes a column, each list holds that column's values
df = pd.DataFrame.from_dict(friend_ordered_dict)

In [34]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## from list

In [40]:
# column labels, in the same order as the values inside each record
column_name = ['name', 'age', 'job']
# one inner list per row
friend_list = [
    ['John', 20, 'student'],
    ['Jenny', 30, 'developer'],
    ['Nate', 30, 'teacher'],
]
df = pd.DataFrame.from_records(data=friend_list, columns=column_name)

In [41]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [66]:
friend_list = [ 
                ['name',['John', 'Jenny', 'Nate']],
                ['age',[20,30,30]],
                ['job',['student', 'developer', 'teacher']] 
              ]
df = pd.DataFrame.from_dict(dict(friend_list))

In [67]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


# Write DataFrame to File

here is one dataframe example with header

In [64]:
friend_list = [ 
                ['name',['John', 'Jenny', 'nate']],
                ['age',[20,30,30]],
                ['job',['student', 'developer', 'teacher']] 
              ]

df = pd.DataFrame.from_dict(dict(friend_list))

In [65]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,nate,30,teacher


you can create csv file using below command,

In [49]:
df.to_csv('friend_list_from_df.csv')

below is one example of a dataframe that **doesn't** have a header

In [50]:
friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]
df = pd.DataFrame.from_records(friend_list)

In [51]:
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


you can write csv file using below command,

In [52]:
df.to_csv('friend_list_from_df.csv')

you also can write txt file using same command

In [53]:
df.to_csv('friend_list_from_df.txt')

by default, header and index are True like below, even if you don't mention it in the command

In [56]:
df.to_csv('friend_list_from_df.csv', header = True, index = True)

**header = False** means you don't want to create column names. no 0,1,2 at column name   
**index = False** means you don't want to create row names.  no 0,1,2 at row name

In [55]:
df.to_csv('friend_list_from_df.csv', header = False, index = False)

you can specify the column names by giving **header** a list

In [None]:
df.to_csv('friend_list_from_df.csv', header = ['name', 'age', 'job'])

below is a dataframe that has **None** values

In [62]:
friend_list = [ 
                ['name',['John', None, 'nate']],
                ['age',[20,None,30]],
                ['job',['student', 'developer', 'teacher']] 
              ]
df = pd.DataFrame.from_dict(dict(friend_list))

In [63]:
df.head()

Unnamed: 0,name,age,job
0,John,20.0,student
1,,,developer
2,nate,30.0,teacher


In [None]:
df.to_csv('friend_list_from_df.csv')

**na_rep** replaces missing values (**None**) with the provided value in the written file

In [None]:
df.to_csv('friend_list_from_df.csv', na_rep = '-')

# Select Row

## by index

In [60]:
friend_list = [ 
                ['name',['John', 'Jenny', 'Nate']],
                ['age',[20,30,30]],
                ['job',['student', 'developer', 'teacher']] 
              ]
df = pd.DataFrame.from_dict(dict(friend_list))

In [61]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


select rows from index 1 to index 2

In [68]:
df[1:3]

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


select row index 0 and index 2

In [69]:
df.loc[[0,2]]

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher


In [70]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## by column condition

In [73]:
df_filtered = df[df.age < 25]

In [74]:
df_filtered

Unnamed: 0,name,age,job
0,John,20,student


In [75]:
df_filtered = df.query('age>25')

In [76]:
df_filtered

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [77]:
df_filtered = df[(df.age >25) & (df.name == 'Nate')]

In [78]:
df_filtered

Unnamed: 0,name,age,job
2,Nate,30,teacher


In [79]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


# Filter Column

## by index

In [80]:
friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]
df = pd.DataFrame.from_records(friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


select all rows, from column 0 to column 1

In [81]:
df.iloc[:, 0:2]

Unnamed: 0,0,1
0,John,20
1,Jenny,30
2,Nate,30


select all rows, column 0 and column 2

In [82]:
df.iloc[:,[0,2]]

Unnamed: 0,0,2
0,John,student
1,Jenny,developer
2,Nate,teacher


In [83]:
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## by column name

In [84]:
# you can create column header for no header data at once
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name', 'age', 'job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [85]:
df_filtered = df[['name', 'age']]
df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [86]:
df.filter(items=['age', 'job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [87]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [93]:
# select columns containing 'a'
df.filter(like='a',axis=1)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [98]:
# select columns using regex
df.filter(regex='b$',axis=1)

Unnamed: 0,job
0,student
1,developer
2,teacher
3,dentist
4,manager
5,intern


# Drop rows

## by row name (index name)

In [105]:
friend_dict_list = [{'age': 20, 'job': 'student'},
         {'age': 30, 'job': 'developer'},
         {'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list, index = ['John', 'Jenny', 'Nate'])

In [100]:
df.head()

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,30,teacher


### drop row
the result without the dropped rows is shown, but the original dataframe still keeps those rows

In [101]:
df.drop(['John', 'Nate'])

Unnamed: 0,age,job
Jenny,30,developer


In [102]:
df

Unnamed: 0,age,job
John,20,student
Jenny,30,developer
Nate,30,teacher


you can assign the result to dataframe to keep the dropped result like below,

In [103]:
df = df.drop(['John', 'Nate'])

In [106]:
df

Unnamed: 0,age,job
Jenny,30,developer


### drop row in place
The dropped row will be deleted from dataframe with inplace keyword parameter

In [None]:
friend_dict_list = [{'age': 20, 'job': 'student'},
         {'age': 30, 'job': 'developer'},
         {'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list, index = ['John', 'Jenny', 'Nate'])

In [None]:
df.drop(['John', 'Nate'], inplace = True)

In [None]:
df

## by row id (index number)

In [None]:
friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},
         {'name': 'Jenny', 'age': 30, 'job': 'developer'},
         {'name': 'Nate', 'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list)

In [None]:
df

you can drop rows by its index

In [None]:
df = df.drop(df.index[[0,2]])

In [None]:
df

## By Column value

In [None]:
friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},
         {'name': 'Jenny', 'age': 30, 'job': 'developer'},
         {'name': 'Nate', 'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list)
df

In [None]:
df = df[df.age != 30]

In [None]:
df

# Drop column

In [None]:
friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},
         {'name': 'Jenny', 'age': 30, 'job': 'developer'},
         {'name': 'Nate', 'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list)
df

In [None]:
df = df.drop('age', axis=1)

In [None]:
df

# Add Column / Update Column

In [None]:
friend_dict_list = [{'name': 'Jone', 'age': 15, 'job': 'student'},
         {'name': 'Jenny', 'age': 30, 'job': 'developer'},
         {'name': 'Nate', 'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list, columns = ['name', 'age', 'job'])
df

## Add New Column with default value

In [None]:
df['salary'] = 0

In [None]:
df

## Add New Column derived from existing value

In [None]:
friend_dict_list = [{'name': 'Jone', 'age': 15, 'job': 'student'},
         {'name': 'Jenny', 'age': 30, 'job': 'developer'},
         {'name': 'Nate', 'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(friend_dict_list, columns = ['name', 'age', 'job'])
df

## one liner adding column by true or false condition

In [None]:
import numpy as np
df['salary'] = np.where(df['job'] != 'student' , 'yes', 'no')

In [None]:
df

In [None]:
friend_dict_list = [{'name': 'John', 'midterm': 95, 'final': 85},
         {'name': 'Jenny', 'midterm': 85, 'final': 80},
         {'name': 'Nate', 'midterm': 10, 'final': 30}]
df = pd.DataFrame(friend_dict_list, columns = ['name', 'midterm', 'final'])
df

## column derived from adding two existing columns

In [None]:
df['total'] = df['midterm'] + df['final']

In [None]:
df

## column from existing column

In [None]:
df['average'] = df['total'] / 2

In [None]:
df

## column by conditional condition

In [None]:
# (minimum average, letter) pairs, checked from the highest cutoff down
grade_cutoffs = [(90, 'A'), (80, 'B'), (70, 'C')]

# the first cutoff an average reaches gives its letter; anything below 70 is an 'F'
grades = [
    next((letter for cutoff, letter in grade_cutoffs if average >= cutoff), 'F')
    for average in df['average']
]

df['grade'] = grades

In [None]:
df

## how to use apply function
apply function helps you code concisely.
the function will be applied to selected column(s) on all rows

In [None]:
def pass_or_fail(row):
    """Map a letter grade to 'Pass' or 'Fail'.

    row: a single grade value (despite the parameter name, not a whole row);
        this is what the function receives when used with Series.apply.
    Returns 'Fail' for the grade "F" and 'Pass' for anything else.
    """
    # no print here: under apply it would emit one line per element of the column
    return 'Pass' if row != "F" else 'Fail'

In [None]:
df.grade = df.grade.apply(pass_or_fail)

In [None]:
df

## info extraction using df.apply

In [None]:
date_list = [{'yyyy-mm-dd': '2000-06-27'},
         {'yyyy-mm-dd': '2002-09-24'},
         {'yyyy-mm-dd': '2005-12-20'}]
df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])
df

In [None]:
def extract_year(row):
    """Return the year part of a 'yyyy-mm-dd' string, i.e. the text before the first '-'."""
    year, _, _ = row.partition('-')
    return year

In [None]:
df['year'] = df['yyyy-mm-dd'].apply(extract_year)

In [None]:
df

## passing keyword parameter to apply function
you also can send parameter to apply function

In [None]:
def extract_year(year, current_year):
    return current_year - int(year)

In [None]:
df['age'] = df['year'].apply(extract_year, current_year=2018)

In [None]:
df

## passing multiple keyword parameter to apply function
you also can send multiple parameter to apply function

In [None]:
def get_introduce(age, prefix, suffix):
    """Build an introduction sentence: `prefix`, then `age` as text, then `suffix`."""
    pieces = [prefix, str(age), suffix]
    return "".join(pieces)

In [None]:
df['introduce'] = df['age'].apply(get_introduce, prefix="I am ", suffix=" years old")

In [None]:
df

## passing multiple columns to apply function
you can provide axis=1 in the apply function, so you send all column values to apply function

In [None]:
def get_introduce2(row):
    """Build a sentence from the `year` and `age` attributes of one dataframe row."""
    # !s converts each value with str(), exactly like an explicit str() call
    return "I was born in {!s} my age is {!s}".format(row.year, row.age)
df.introduce = df.apply(get_introduce2, axis=1)

df

## how to use map function
if you give function as parameter, it works same as apply function on the column

In [None]:
date_list = [{'yyyy-mm-dd': '2000-06-27'},
         {'yyyy-mm-dd': '2002-09-24'},
         {'yyyy-mm-dd': '2005-12-20'}]
df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])
df

In [None]:
def extract_year(row):
    """Return everything before the first '-' of a 'yyyy-mm-dd' string (the year)."""
    # only the first piece is needed, so split at most once
    return row.split('-', 1)[0]

In [None]:
df['year'] = df['yyyy-mm-dd'].map(extract_year)
df

if you give dictionary as parameter,  
column will be updated with new value like  
new value = dict['old value']

In [None]:
job_list = [{'age': 20, 'job': 'student'},
         {'age': 30, 'job': 'developer'},
         {'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(job_list)
df

In [None]:
df.job = df.job.map({"student":1,"developer":2,"teacher":3})
df

## Applymap
update all elements in the dataframe at once

In [None]:
x_y = [{'x': 5.5, 'y': -5.6},
         {'x': -5.2, 'y': 5.5},
         {'x': -1.6, 'y': -4.5}]
df = pd.DataFrame(x_y)
df

In [None]:
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is deprecated there),
# so use the new name when this pandas version has it and fall back to applymap otherwise
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
# np.around is called once for every single cell of the dataframe
df = elementwise(np.around)
df

# Add Row

In [None]:
friend_dict_list = [{'name': 'John', 'midterm': 95, 'final': 85},
         {'name': 'Jenny', 'midterm': 85, 'final': 80},
         {'name': 'Nate', 'midterm': 10, 'final': 30}]
df = pd.DataFrame(friend_dict_list, columns = ['name', 'midterm', 'final'])
df

In [None]:
df2 = pd.DataFrame([['Ben', 50,50]], columns = ['name', 'midterm', 'final'])

In [None]:
df2.head()

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of df2 below df,
# and ignore_index=True renumbers the combined rows from 0
pd.concat([df, df2], ignore_index=True)

# Group by
group by command helps to get more information from given data

In [None]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"}
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

In [None]:
groupby_major = df.groupby('major')

In [None]:
groupby_major.groups

below we can see that computer science has mostly male students, while economics has mostly female students

In [None]:
# iterating over a groupby object yields (group key, sub-dataframe) pairs
for major, members in groupby_major:
    member_count = str(len(members))
    print(": ".join([major, member_count]))
    print(members)
    print()

### group object to dataframe

In [None]:
df_major_cnt = pd.DataFrame({'count' : groupby_major.size()}).reset_index()
df_major_cnt

In [None]:
groupby_sex = df.groupby('sex')

below we can see that this school has a fairly balanced ratio of female to male students

In [None]:
for name, group in groupby_sex:
    print(name + ": " + str(len(group)))
    print(group)
    print()

In [None]:
df_sex_cnt = pd.DataFrame({'count' : groupby_sex.size()}).reset_index()
df_sex_cnt

# Drop Duplicate
sometimes you need to drop duplicate rows, and here is an elegant way to do it

In [None]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"},
                {'name': 'John', 'major': "Computer Science", 'sex': "male"},
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

## check if there is duplicated row

In [None]:
df.duplicated()

In [None]:
df = df.drop_duplicates()

In [None]:
df

In [None]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Nate', 'major': None, 'sex': "male"},
                {'name': 'John', 'major': "Computer Science", 'sex': None},
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

In [None]:
df.duplicated(['name'])

In [None]:
df.drop_duplicates(['name'], keep='last')

In [None]:
df

# how to manage None value?

In [None]:
school_id_list = [{'name': 'John', 'job': "teacher", 'age': 40},
                {'name': 'Nate', 'job': "teacher", 'age': 35},
                {'name': 'Yuna', 'job': "teacher", 'age': 37},
                {'name': 'Abraham', 'job': "student", 'age': 10},
                {'name': 'Brian', 'job': "student", 'age': 12},
                {'name': 'Janny', 'job': "student", 'age': 11},
                {'name': 'Nate', 'job': "teacher", 'age': None},
                {'name': 'John', 'job': "student", 'age': None}
         ]
df = pd.DataFrame(school_id_list, columns = ['name', 'job', 'age'])
df

## how to check if there is Null or NaN

In [None]:
df.info()

In [None]:
df.isna()

In [None]:
df.isnull()

## how to fill Null or NaN

In [None]:
# work on a copy: `tmp = df` would only create a second name for the same dataframe,
# so filling tmp would also overwrite the missing ages in df
tmp = df.copy()
tmp["age"] = tmp["age"].fillna(0)
tmp

In [None]:
# fill missing age with median age for each group (teacher, student)
# transform("median") returns a Series aligned with df holding each row's group median.
# The filled column is assigned back rather than calling fillna(inplace=True) on df["age"]:
# that form acts on a temporary Series and is not guaranteed to update df
# (pandas warns about it, and it has no effect under copy-on-write).
df["age"] = df["age"].fillna(df.groupby("job")["age"].transform("median"))

In [None]:
df

# Unique

In [None]:
job_list = [{'name': 'John', 'job': "teacher"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Fred', 'job': "teacher"},
                {'name': 'Abraham', 'job': "student"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Janny', 'job': "developer"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Obrian', 'job': "dentist"},
                {'name': 'Yuna', 'job': "teacher"},
                {'name': 'Rob', 'job': "lawyer"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Matt', 'job': "student"},
                {'name': 'Wendy', 'job': "banker"},
                {'name': 'Edward', 'job': "teacher"},
                {'name': 'Ian', 'job': "teacher"},
                {'name': 'Chris', 'job': "banker"},
                {'name': 'Philip', 'job': "lawyer"},
                {'name': 'Janny', 'job': "basketball player"},
                {'name': 'Gwen', 'job': "teacher"},
                {'name': 'Jessy', 'job': "student"}
         ]
df = pd.DataFrame(job_list, columns = ['name', 'job'])

unique() gives you the unique values of the column as an array

In [None]:
print( df.job.unique() )

value_counts() gives you the number of rows for each unique value in the column

In [None]:
df.job.value_counts()

# Concatenate two dataframe

In [None]:
l1 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Fred', 'job': "developer"}]

l2 = [{'name': 'Ed', 'job': "dentist"},
      {'name': 'Jack', 'job': "farmer"},
      {'name': 'Ted', 'job': "designer"}]
         
df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])

## pd.concat
below is to add second dataframe as new rows in first dataframe

In [None]:
frames = [df1, df2]
result = pd.concat(frames, ignore_index=True)

In [None]:
result

## df.append
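below is to add second dataframe as new rows in first dataframe  
note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; `pd.concat` (shown above) is the replacement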

In [None]:
l1 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Fred', 'job': "developer"}]

l2 = [{'name': 'Ed', 'job': "dentist"},
      {'name': 'Jack', 'job': "farmer"},
      {'name': 'Ted', 'job': "designer"}]

df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])
# df1.append(df2, ignore_index=True) no longer exists: DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0. pd.concat produces the same stacked rows, and
# ignore_index=True renumbers them from 0.
result = pd.concat([df1, df2], ignore_index=True)

In [None]:
result

## pd.concat
below is to add second dataframe as new columns in first dataframe

In [None]:
l1 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Jack', 'job': "developer"}]

l2 = [{'age': 25, 'country': "U.S"},
      {'age': 30, 'country': "U.K"},
      {'age': 45, 'country': "Korea"}]

df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['age', 'country'])
# axis=1 places df2's columns to the right of df1's, matching rows by index.
# ignore_index is deliberately not passed: along axis=1 it would replace the
# column names name/job/age/country with 0, 1, 2, 3.
result = pd.concat([df1, df2], axis=1)

In [None]:
result

# Concatenate two list as a dataframe

In [None]:
label = [1, 2, 3, 4, 5]
prediction = [1, 2, 2, 5, 5]

# two equal-length lists side by side: one column per list, one row per position
comparison = pd.DataFrame(dict(label=label, prediction=prediction))

comparison