# Data Manipulation in pandas
This notebook demonstrates common ways to edit pandas DataFrames: renaming, adding and dropping columns, handling missing values, and grouping.

In [14]:
import pandas as pd
import numpy as np

# we will use the famous gapminder dataset
from gapminder import gapminder
# Work on a copy: a plain `df = gapminder` would only alias the package's
# DataFrame, so any in-place edit made through `df` would also change `gapminder`.
df = gapminder.copy()

In [15]:
# let's preview the first five rows of the original DataFrame
df.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


Rename columns

In [16]:
# map each old column name to its new, human-readable label
column_labels = {'lifeExp': 'Life Expectancy'}
df = df.rename(columns=column_labels)

Add a new column (we fill it with a constant value for demonstration purposes)

In [17]:
# assign() returns a new frame with the constant column appended at the end
df = df.assign(new_column=1)

df.head(n=5)

Unnamed: 0,country,continent,year,Life Expectancy,pop,gdpPercap,new_column
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,1
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,1
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,1
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,1
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,1


Drop / delete columns

In [18]:
# drop() with the `columns` keyword returns a new frame without that column
df = df.drop(columns='new_column')

# alternative: the "del" statement removes a column from the frame in place
df['new_column2'] = 1
del df['new_column2']

## Handling missing values

Convert a placeholder character to NaN. Here we convert every '?' to NaN.

In [21]:
# add a column in which every entry is the placeholder '?'
df['new_column'] = '?'

# flag every cell of the frame that equals '?' and overwrite those cells with NaN
is_placeholder = df == '?'
df[is_placeholder] = np.nan

df.head(n=5)

Unnamed: 0,country,continent,year,Life Expectancy,pop,gdpPercap,new_column
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,


Print the number of NaNs in each column

In [22]:
# count the missing entries column by column (isna is the same method as isnull)
nan_counts = df.isna().sum()
print(nan_counts)

country               0
continent             0
year                  0
Life Expectancy       0
pop                   0
gdpPercap             0
new_column         1704
dtype: int64


Show, for each column, `True` if it contains any missing values and `False` otherwise

In [23]:
# reduce down the rows (axis=0, the default) to get one boolean per column
df.isna().any(axis=0)

country            False
continent          False
year               False
Life Expectancy    False
pop                False
gdpPercap          False
new_column          True
dtype: bool

Drop missing values

In [24]:
# drop every row that has a NaN in at least one column (the defaults, spelled out);
# 'new_column' is NaN in every row at this point, so no rows survive
df = df.dropna(axis=0, how='any')

Drop all rows that contain a specific value (the result shown below is empty because `dropna()` above already removed every row)

In [26]:
# keep only the rows whose 'new_column' entry is different from '?'
rows_to_keep = df['new_column'].ne('?')
df = df.loc[rows_to_keep]
df.head(n=5)

Unnamed: 0,country,continent,year,Life Expectancy,pop,gdpPercap,new_column


Let's get our standard dataset back.

In [64]:
# Take a fresh copy of the dataset: binding `df = gapminder` directly would
# alias the package's DataFrame, so later in-place edits would leak into it.
df = gapminder.copy()
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


Group by column values and apply an aggregation function — in this case, the mean

In [73]:
# average every remaining column within each (continent, country) group
group_keys = ['continent', 'country']
df_grouped = df.groupby(group_keys).mean()

In [75]:
# preview the first five groups of the aggregated frame
df_grouped.head(n=5)


Unnamed: 0_level_0,Unnamed: 1_level_0,year,lifeExp,pop,gdpPercap
continent,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,Algeria,1979.5,59.030167,19875410.0,4426.025973
Africa,Angola,1979.5,37.8835,7309390.0,3607.100529
Africa,Benin,1979.5,48.779917,4017497.0,1155.395107
Africa,Botswana,1979.5,54.5975,971186.2,5031.503557
Africa,Burkina Faso,1979.5,44.694,7548677.0,843.990665
