
# What is Pandas?
Pandas is a python library that can be used to analyze and manipulate data.

Extensive support libraries (NumPy for numerical calculations, Pandas for data analytics, etc.)
## Import pandas library

In [2]:
import pandas as pd

## Series
A one-dimentional array capable of holding any data type. It is like a column in a database table

In [2]:
pd.Series(['Seoul', 'Incheon', 'Busan'])

0      Seoul
1    Incheon
2      Busan
dtype: object

## DataFrame
A two-dimentional data structure with data and label or rows and columns.

In [3]:
cities = pd.Series(['Seoul', 'Incheon', 'Busan'])
population = pd.Series([9988000, 2930000, 3429000])

k_data = pd.DataFrame({ 'City': cities, 'Population': population })

### Accessing DataFrame data
Similar to dictionaries, you can access data by adding key.

In [4]:
print('Get all data with City key')
print(k_data["City"])
print('Get data from City key with index of 1')
print(k_data["City"][1])

Get all data with City key
0      Seoul
1    Incheon
2      Busan
Name: City, dtype: object
Get data from City key with index of 1
Incheon


## Read CSV files
CSV also known as comma separated files. It is file that has tabular tabular format.

In [3]:
df = pd.read_csv('src/sk_data.csv')
print(df)

        city      lat       lng      country iso2 admin_name  capital  \
0    Bucheon  37.5000  126.7833  South Korea   KR   Gyeonggi    minor   
1      Busan  35.1800  129.0750  South Korea   KR      Busan    admin   
2   Changwon  35.2708  128.6631  South Korea   KR  Gyeongnam    admin   
3      Daegu  35.8717  128.6017  South Korea   KR      Daegu    admin   
4    Daejeon  36.3510  127.3850  South Korea   KR    Daejeon    admin   
5    Dongnae  35.2016  129.0848  South Korea   KR      Busan    minor   
6     Goyang  37.6500  126.8000  South Korea   KR   Gyeonggi    minor   
7    Gwangju  35.1653  126.8486  South Korea   KR    Gwangju    admin   
8    Incheon  37.4833  126.6333  South Korea   KR    Incheon    admin   
9      Seoul  37.5600  126.9900  South Korea   KR      Seoul  primary   
10     Suwon  37.2667  127.0167  South Korea   KR   Gyeonggi    admin   
11     Ulsan  35.5500  129.3167  South Korea   KR      Ulsan    admin   

    population  population_proper  
0     867678.0

### Access DataFrame Values
To access DataFrame values, we use iterrows() function to iterate rows in the DataFrame.

In [6]:
for index, row in df.iterrows():
    print(row['city'])

Bucheon
Busan
Changwon
Daegu
Daejeon
Dongnae
Goyang
Gwangju
Incheon
Seoul
Suwon
Ulsan


### Renaming columns
To rename a column, pandas library uses rename() function. We use inplace=True to apply changes to the DataFrame.

In [7]:
df.rename(columns = {'city':'city_list'}, inplace = True)

print("After modifying city column: ", df.columns)

After modifying city column:  Index(['city_list', 'lat', 'lng', 'country', 'iso2', 'admin_name', 'capital',
       'population', 'population_proper'],
      dtype='object')


### Filter columns
Filtering is used for finding a specific value the user provided. There are multiple ways to filter. First is filter()

In [8]:
city_pop = df.filter(items=['city_list', 'population'])
print(city_pop)

   city_list  population
0    Bucheon    867678.0
1      Busan   3453198.0
2   Changwon   1046054.0
3      Daegu   2461002.0
4    Daejeon   1475221.0
5    Dongnae         NaN
6     Goyang   1061929.0
7    Gwangju   1490092.0
8    Incheon   2936117.0
9      Seoul  23016000.0
10     Suwon   1234300.0
11     Ulsan   1166033.0


### Filter by column value

In [9]:
pop = df.loc[df['population'] == 867678.0]
print(pop)

  city_list   lat       lng      country iso2 admin_name capital  population  \
0   Bucheon  37.5  126.7833  South Korea   KR   Gyeonggi   minor    867678.0   

   population_proper  
0           792561.0  


### Filter with logical conditions

In [10]:
pop_higher = df.loc[df['population'] > 3453198.0]
print(pop_higher)

  city_list    lat     lng      country iso2 admin_name  capital  population  \
9     Seoul  37.56  126.99  South Korea   KR      Seoul  primary  23016000.0   

   population_proper  
9         10013781.0  


### Filter by missing values

In [11]:
not_null = df[df['population'].notnull()]
print(not_null)

   city_list      lat       lng      country iso2 admin_name  capital  \
0    Bucheon  37.5000  126.7833  South Korea   KR   Gyeonggi    minor   
1      Busan  35.1800  129.0750  South Korea   KR      Busan    admin   
2   Changwon  35.2708  128.6631  South Korea   KR  Gyeongnam    admin   
3      Daegu  35.8717  128.6017  South Korea   KR      Daegu    admin   
4    Daejeon  36.3510  127.3850  South Korea   KR    Daejeon    admin   
6     Goyang  37.6500  126.8000  South Korea   KR   Gyeonggi    minor   
7    Gwangju  35.1653  126.8486  South Korea   KR    Gwangju    admin   
8    Incheon  37.4833  126.6333  South Korea   KR    Incheon    admin   
9      Seoul  37.5600  126.9900  South Korea   KR      Seoul  primary   
10     Suwon  37.2667  127.0167  South Korea   KR   Gyeonggi    admin   
11     Ulsan  35.5500  129.3167  South Korea   KR      Ulsan    admin   

    population  population_proper  
0     867678.0           792561.0  
1    3453198.0          3453198.0  
2    1046054.0 

### Operators in DataFrame

In [12]:
df['population_subtract'] = df['population'] - df['population_proper']
print(df)

   city_list      lat       lng      country iso2 admin_name  capital  \
0    Bucheon  37.5000  126.7833  South Korea   KR   Gyeonggi    minor   
1      Busan  35.1800  129.0750  South Korea   KR      Busan    admin   
2   Changwon  35.2708  128.6631  South Korea   KR  Gyeongnam    admin   
3      Daegu  35.8717  128.6017  South Korea   KR      Daegu    admin   
4    Daejeon  36.3510  127.3850  South Korea   KR    Daejeon    admin   
5    Dongnae  35.2016  129.0848  South Korea   KR      Busan    minor   
6     Goyang  37.6500  126.8000  South Korea   KR   Gyeonggi    minor   
7    Gwangju  35.1653  126.8486  South Korea   KR    Gwangju    admin   
8    Incheon  37.4833  126.6333  South Korea   KR    Incheon    admin   
9      Seoul  37.5600  126.9900  South Korea   KR      Seoul  primary   
10     Suwon  37.2667  127.0167  South Korea   KR   Gyeonggi    admin   
11     Ulsan  35.5500  129.3167  South Korea   KR      Ulsan    admin   

    population  population_proper  population_subt

### Dropping column in DataFrame
drop() function is used to delete columns in a DataFrame

In [13]:
dropped = df.drop(['population_proper'], axis=1)
print(dropped)

   city_list      lat       lng      country iso2 admin_name  capital  \
0    Bucheon  37.5000  126.7833  South Korea   KR   Gyeonggi    minor   
1      Busan  35.1800  129.0750  South Korea   KR      Busan    admin   
2   Changwon  35.2708  128.6631  South Korea   KR  Gyeongnam    admin   
3      Daegu  35.8717  128.6017  South Korea   KR      Daegu    admin   
4    Daejeon  36.3510  127.3850  South Korea   KR    Daejeon    admin   
5    Dongnae  35.2016  129.0848  South Korea   KR      Busan    minor   
6     Goyang  37.6500  126.8000  South Korea   KR   Gyeonggi    minor   
7    Gwangju  35.1653  126.8486  South Korea   KR    Gwangju    admin   
8    Incheon  37.4833  126.6333  South Korea   KR    Incheon    admin   
9      Seoul  37.5600  126.9900  South Korea   KR      Seoul  primary   
10     Suwon  37.2667  127.0167  South Korea   KR   Gyeonggi    admin   
11     Ulsan  35.5500  129.3167  South Korea   KR      Ulsan    admin   

    population  population_subtract  
0     867678

## Group by column name
It is used to group large amount of data and compute operations

In [6]:
admin_pop = df.groupby(by=["admin_name", "population"]).sum()
admin_pop

Unnamed: 0_level_0,Unnamed: 1_level_0,city,lat,lng,country,iso2,capital,population_proper
admin_name,population,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Busan,3453198.0,Busan,35.18,129.075,South Korea,KR,admin,3453198.0
Daegu,2461002.0,Daegu,35.8717,128.6017,South Korea,KR,admin,2444412.0
Daejeon,1475221.0,Daejeon,36.351,127.385,South Korea,KR,admin,1475221.0
Gwangju,1490092.0,Gwangju,35.1653,126.8486,South Korea,KR,admin,1490092.0
Gyeonggi,867678.0,Bucheon,37.5,126.7833,South Korea,KR,minor,792561.0
Gyeonggi,1061929.0,Goyang,37.65,126.8,South Korea,KR,minor,1061929.0
Gyeonggi,1234300.0,Suwon,37.2667,127.0167,South Korea,KR,admin,1234300.0
Gyeongnam,1046054.0,Changwon,35.2708,128.6631,South Korea,KR,admin,1046054.0
Incheon,2936117.0,Incheon,37.4833,126.6333,South Korea,KR,admin,2936117.0
Seoul,23016000.0,Seoul,37.56,126.99,South Korea,KR,primary,10013781.0


### Concatenate values in DataFrame
concat() function is used in pandas to concatenate or link data together

In [14]:
df1 = pd.DataFrame([['John', 25], ['Joe', 23]],
                   columns=['Name', 'Age'])
df2 = pd.DataFrame([['Mary', 20], ['Anna', 21]],
                   columns=['Name', 'Age'])

print(df1)
print(df2)
concat_val = pd.concat([df1, df2], ignore_index=True)
print(concat_val)

   Name  Age
0  John   25
1   Joe   23
   Name  Age
0  Mary   20
1  Anna   21
   Name  Age
0  John   25
1   Joe   23
2  Mary   20
3  Anna   21


### Merge values in DataFrame
merge() function is used in pandas to combine common columns in a DataFrame.
* inner
* right
* outer
* inner

In [15]:
df_1 = pd.DataFrame([['John', 25], ['Joe', 23]],
                   columns=['Name', 'Age'])
df_2 = pd.DataFrame([['John', 20], ['Anna', 21]],
                   columns=['Name', 'Age'])

print(df_1)
print(df_2)
inner = df_1.merge(df_2, on='Name', how='inner')
print('inner')
print(inner)
left = df_1.merge(df_2, on='Name', how='left')
print('left')
print(left)
right = df_1.merge(df_2, on='Name', how='right')
print('right')
print(right)
outer = df_1.merge(df_2, on='Name', how='outer')
print('outer')
print(outer)

   Name  Age
0  John   25
1   Joe   23
   Name  Age
0  John   20
1  Anna   21
inner
   Name  Age_x  Age_y
0  John     25     20
left
   Name  Age_x  Age_y
0  John     25   20.0
1   Joe     23    NaN
right
   Name  Age_x  Age_y
0  John   25.0     20
1  Anna    NaN     21
outer
   Name  Age_x  Age_y
0  John   25.0   20.0
1   Joe   23.0    NaN
2  Anna    NaN   21.0


### Join values in DataFrame
join() function is used in pandas to insert columns from another DataFrame.

In [16]:
outer_join = df_1.join(df_2, lsuffix="_left", rsuffix="_right", how='outer')
print('outer')
print(outer_join)

outer
  Name_left  Age_left Name_right  Age_right
0      John        25       John         20
1       Joe        23       Anna         21
