# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Three-row demo frame; every column is given as a {row_label: value} mapping.
df = pd.DataFrame({
    'Name': dict(enumerate(['John', 'Bob', 'Sheila'])),
    'Course': dict(enumerate(['Masters', 'Graduate', 'Graduate'])),
    'Age': dict(enumerate([27, 23, 21])),
})

# Unpivot 'Course' and 'Age' into (variable, value) rows, keeping 'Name' as the identifier.
df.melt(id_vars=['Name'], value_vars=['Course', 'Age'])

Unnamed: 0,Name,variable,value
0,John,Course,Masters
1,Bob,Course,Graduate
2,Sheila,Course,Graduate
3,John,Age,27
4,Bob,Age,23
5,Sheila,Age,21


## Rename Columns

In [None]:
# Column-oriented sample data: employee id (as text), job and city.
nip_dict = dict(
    NIP=['123', '124', '152', '137', '183'],
    job=['Accounting', 'Engineering', 'Engineering', 'HR', 'Data'],
    city=['Texas', 'Ohio', 'New York', 'Nevada', 'Florida'],
)

In [None]:
# One DataFrame column per key of nip_dict.
df_nip = pd.DataFrame(data=nip_dict)

In [None]:
df_nip.columns

In [None]:
# Replace all column labels positionally: NIP -> "nip", job -> "pekerjaan"
# (Indonesian for "job"), city -> "kota" (Indonesian for "city").
# The list must contain exactly one label per existing column.
df_nip.columns = ["nip", "pekerjaan", "kota"]
df_nip

## Indexing DataFrame

In [None]:
# Rebuild the sample frame from scratch so this section starts from the
# original column names (NIP, job, city).
nip_dict = dict(
    NIP=['123', '124', '152', '137', '183'],
    job=['Accounting', 'Engineering', 'Engineering', 'HR', 'Data'],
    city=['Texas', 'Ohio', 'New York', 'Nevada', 'Florida'],
)

df_nip = pd.DataFrame(data=nip_dict)
df_nip

In [None]:
df_nip.columns

In [None]:
# Column labels as a plain list in ascending order; df_nip itself is not modified.
sorted(df_nip.columns)

In [None]:
# Lower-case copy of every column label (the frame itself is left untouched).
new_columns = list(map(str.lower, df_nip.columns))
new_columns

In [None]:
# Lower-case the labels, then build a *new* sorted list with sorted();
# new_columns keeps its original order.
new_columns = list(map(str.lower, df_nip.columns))
sorted_new_columns = sorted(new_columns)
sorted_new_columns

In [None]:
# Same outcome as sorting the list in place: new_columns ends up holding the
# lower-cased labels in ascending order.
new_columns = sorted(map(str.lower, df_nip.columns))
new_columns

In [None]:
# Preview the frame with its columns in sorted order; nothing is assigned.
df_nip.reindex(columns=sorted(df_nip.columns))

In [None]:
# Keep the column-sorted version by rebinding df_nip to the reindexed result.
df_nip = df_nip.reindex(columns=sorted(df_nip.columns))
df_nip

In [None]:
# Promote the 'NIP' column to the row index.
# The result is rebound instead of using inplace=True: inplace brings no
# performance benefit and hides the state change from readers of the notebook.
df_nip = df_nip.set_index('NIP')

In [None]:
df_nip

In [None]:
# Returns a new frame without the row labelled '123'. The result is not
# assigned, so df_nip itself still contains that row.
df_nip.drop('123')

In [None]:
df_nip

In [None]:
# Preview the frame without the listed columns; the result is not assigned,
# so df_nip keeps both columns.
dropCols = ['city', 'job']
df_nip.drop(columns=dropCols)

In [None]:
df_nip

In [None]:
# Old label -> new label. "Pekerjaan" and "Kota" are Indonesian for "job" and "city".
rename_rules = {
    'job'  : 'Pekerjaan',
    'city' : 'Kota'
}

# Rename only the listed columns. The result is rebound instead of using
# inplace=True, which keeps the operation chainable and the state change visible.
df_nip = df_nip.rename(columns=rename_rules)

In [None]:
df_nip

# 2. Load datasets

In [None]:
# Employee -> department lookup (column-oriented).
job_dict = dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue', 'Tony'],
    job=['Accounting', 'Engineering', 'Engineering', 'HR', 'Data'],
)

In [None]:
# Hire year per employee. The key column is called 'employee' here (not 'name'),
# and the set of people differs from job_dict ('Cecile' instead of 'Tony').
hire_date_dict = dict(
    employee=['Lisa', 'Bob', 'Jake', 'Sue', 'Cecile'],
    hire_date=[2004, 2008, 2012, 2014, 2010],
)

In [None]:
# Salary per employee; contains 'Cecile' but not 'Lisa'.
salary_dict = dict(
    name=['Tony', 'Sue', 'Cecile', 'Bob', 'Jake'],
    salary=[150, 100, 120, 140, 160],
)

In [None]:
# Turn each of the three dictionaries into its own DataFrame.
df_job, df_hire, df_salary = (
    pd.DataFrame(columns_by_name)
    for columns_by_name in (job_dict, hire_date_dict, salary_dict)
)

# 3. Show Datasets

In [None]:
df_job

In [None]:
df_hire

In [None]:
df_salary

# 4. Join Table

## Copying problem

In [None]:
# Original list used to show the difference between aliasing and copying.
list_1 = [1, 2, 4]

In [None]:
# Plain assignment does not copy: list_2 is a second name for the same list object.
list_2 = list_1

In [None]:
list_2

In [None]:
# Mutating through list_2 changes the one shared list, so list_1 sees the new value too.
list_2[0] = 100
list_2

In [None]:
list_1

In [None]:
# Build an independent (shallow) copy so later edits to list_2 leave list_1 alone.
list_2 = list(list_1)

In [None]:
list_2, list_1

In [None]:
# list_2 is now a separate copy, so this assignment does not affect list_1.
list_2[0] = 1

In [None]:
list_2, list_1

## Merge method
Format: df = pd.merge(left_table, right_table, how="left", on=["col1", "col2"])

In [None]:
"""
 left: DataFrame | Series,
 right: DataFrame | Series,
 how: str = "inner",
 on: IndexLabel | None = None,
 left_on: IndexLabel | None = None,
 right_on: IndexLabel | None = None,
 left_index: bool = False,
 right_index: bool = False,
 sort: bool = False,
 suffixes: Suffixes = ("_x", "_y"),
 copy: bool = True,
 indicator: bool = False,
 validate: str | None = None) -> DataFrame
"""

In [None]:
# Show the three input frames, each preceded by its name as a plain-text heading.
for _heading, _frame in (
    ("df_job", df_job),
    ("\ndf_salary", df_salary),
    ("\ndf_hire", df_hire),
):
    print(_heading)
    display(_frame)

In [None]:
# Left join: keep every row of df_job and attach the salary where 'name' matches.
df_job.merge(df_salary, how="left", on=["name"])

In [None]:
# Same join with the tables swapped: now every row of df_salary is kept.
df_salary.merge(df_job, how="left", on=["name"])

In [None]:
# The key column has a different name in each table, so it is given per side
# with left_on / right_on.
df_job.merge(df_hire, how="left", left_on="name", right_on="employee")

In [None]:
# Right join: keep every row of df_salary (the right table) and attach hire dates.
df_hire.merge(df_salary, how="right", left_on="employee", right_on="name")

In [None]:
# No `how` given: merge falls back to its default, an inner join.
df_job.merge(df_salary, on=["name"])

In [None]:
# Inner join spelled out explicitly; only names present in both tables survive.
df_job.merge(df_salary, how='inner', on="name")

In [None]:
# Alternative to left_on/right_on: rename the key on the fly so both sides
# share the column name "name". rename() returns a copy, so df_hire is unchanged.
df_job.merge(
    df_hire.rename(columns={"employee": "name"}),
    on=["name"],
)

In [None]:
df_hire

In [None]:
# Two small frames with identical row labels ("X", "Y") and identical column
# names ("P", "Q"), used below to contrast merge with concat.
# pandas is already imported as `pd` in the notebook's first cell, so the
# redundant mid-notebook import has been dropped.
data_1 = {
    "P":['a', 'b'],
    "Q":[1, 2]
}

data_2 = {
    "P":['c', 'd'],
    "Q":[3, 4]
}

df1 = pd.DataFrame(data_1, index=["X", "Y"])
df2 = pd.DataFrame(data_2, index=["X", "Y"])

display(df1)
display(df2)

Unnamed: 0,P,Q
X,a,1
Y,b,2


Unnamed: 0,P,Q
X,c,3
Y,d,4


In [None]:
# Left join on "P". The two frames share no "P" values, so the right-hand "Q"
# column comes back empty; the suffixes tell the two "Q" columns apart.
pd.merge(df1, df2, on="P", how="left", suffixes=("_left", "_right"))

Unnamed: 0,P,Q_left,Q_right
0,a,1,
1,b,2,


In [None]:
# Column-wise concatenation: frames are placed side by side, aligned on row labels.
print("axis 1")
pd.concat([df1, df2], axis="columns")



axis 1


Unnamed: 0,P,Q,P.1,Q.1
X,a,1,c,3
Y,b,2,d,4


In [None]:
# Row-wise concatenation: frames are stacked and the original row labels are kept.
print("axis 0")
pd.concat([df1, df2], axis="index")

axis 0


Unnamed: 0,P,Q
X,a,1
Y,b,2
X,c,3
Y,d,4


## Join Method
Format: df = df1.join(df2, how="left", on=["col1", "col2"])

In [None]:
df_job

In [None]:
df_salary

In [None]:
# Returns a copy of df_salary indexed by "name". The result is not assigned,
# so df_salary itself keeps its default integer index.
df_salary.set_index("name")

In [None]:
df_salary

In [None]:
# wrong way!
# DataFrame.join matches the `on` column of the caller against the *index* of
# the other frame. df_salary still has its default integer index here, so the
# string values in df_job["name"] cannot be matched against it and pandas
# rejects the call. The error is caught and printed so that this deliberate
# failure does not stop a "Restart Kernel -> Run All".
try:
    df_job.join(
        df_salary,
        how = 'left',
        on  = "name"
    )
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# Index-to-index join: both frames are indexed by "name" first, so join can
# align them on that shared index.
df_job.set_index("name").join(other=df_salary.set_index("name"), how='left')

In [None]:
# Column-to-index join: the "name" column of df_job is matched against the
# "name" index of the re-indexed salary frame.
df_job.join(other=df_salary.set_index("name"), on=["name"], how='left')

# Concatenate Dataframe
axis = 1 -> column-wise <br>
axis = 0 -> row-wise <br>
Syntax: df = pd.concat([df1, df2], axis = ...)

In [None]:
# A second batch of employees with the same columns as df_job ...
df_job2 = pd.DataFrame(dict(
    name=['Drake', 'Brody', 'Thomas', 'Sylphy', 'Penny'],
    job=['Data', 'HR', 'Data', 'Accounting', 'Engineer'],
))

# ... and a salary-only frame with one row per employee in df_job2 (no key column).
df_salary2 = pd.DataFrame(dict(salary=[150, 100, 120, 140, 160]))

In [None]:
df_job2

In [None]:
df_salary2

## row-wise (axis=0)

In [None]:
df_job

In [None]:
# Row-wise stack; ignore_index=True discards the old labels and numbers the rows 0..n-1.
pd.concat([df_job, df_job2], axis="index", ignore_index=True)

In [None]:
# Row-wise stack that keeps each frame's own row labels, so labels repeat.
df_concat = pd.concat([df_job, df_job2], axis="index")

In [None]:
df_concat

In [None]:
# drop() removes *every* row carrying the given label. df_concat was built
# without ignore_index, so label 0 occurs once per source frame and both rows go.
# The result is not assigned; df_concat itself is unchanged.
df_concat.drop(0, axis = 0)

In [None]:
# Stack the rows, then replace the repeated labels with a fresh 0..n-1 index
# (drop=True discards the old labels instead of keeping them as a column).
pd.concat([df_job, df_job2], axis="index").reset_index(drop=True)

## column-wise (axis=1)

In [None]:
df_job2

In [None]:
df_salary2

In [None]:
# Side-by-side concatenation: rows are paired up by their index labels.
pd.concat([df_job2, df_salary2], axis="columns")

In [None]:
# First batch: job and salary are linked through the shared "name" key (inner join).
job_salary1 = df_job.merge(df_salary, on="name")

# Second batch: no key column in df_salary2, so the frames are glued side by side.
job_salary2 = pd.concat([df_job2, df_salary2], axis="columns").reset_index(drop=True)

# Stack both batches and renumber the rows.
pd.concat([job_salary1, job_salary2], axis="index").reset_index(drop=True)

# Append Dataframe
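Format: df = df1.append(df2) (deprecated since pandas 1.4 and removed in pandas 2.0; on current versions use df = pd.concat([df1, df2]) instead)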

In [None]:
#df_job
#df_job2

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# calling it raises AttributeError on current versions. pd.concat stacks the
# rows of both frames the same way; ignore_index=True renumbers the rows
# 0..n-1, which is what append(...).reset_index(drop=True) produced.
pd.concat([df_job, df_job2], ignore_index=True)

# Indexing in DF

In [None]:
# Six-employee roster used for the positional (.iloc) and label-based (.loc)
# indexing examples that follow.
job = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue', 'Tony', "John"],
    job=['Accounting', 'Engineering', 'Engineering', 'Accounting', 'Data', "Data"],
    hire_date=[2004, 2008, 2012, 2014, 2010, 2018],
    salary=[150, 100, 120, 140, 160, 110],
))

In [None]:
job

In [None]:
# Rows for Bob through Sue, selected by position.
# Positional slices exclude the stop value, so :4 yields positions 0-3.
job.iloc[:4]

In [None]:
# Rows for Bob through Sue, selected by label.
# Label slices include the stop value, so :3 yields labels 0-3.
job.loc[:3]

In [None]:
# Name, job and salary for Bob, Lisa and John.
# `job` has the default 0..5 row labels, so these labels coincide with row
# positions 0, 2, 5; the named columns sit at column positions 0, 1, 3.
job.loc[[0, 2, 5], ["name", "job", "salary"]]

In [None]:
# Every even integer in [0, 100), generated directly with a step of 2.
even_indices = list(range(0, 100, 2))
even_indices

In [None]:
job

In [None]:
# Copy of `job` indexed by employee name, so rows can be selected by name with .loc.
# `job` itself is left unchanged because the result is bound to a new variable.
job2 = job.set_index("name")
job2

In [None]:
# Label slices on both axes; .loc slices include both endpoints, so the
# "Tony" row and the "hire_date" column are part of the result.
job2.loc["Bob":"Tony", "job":"hire_date"]

In [None]:
# Explicit label lists pick exactly the named rows and columns, in the order given.
job2.loc[["Bob","Lisa","Tony"], ["job","salary"]]

In [None]:
# Rows labelled 0 through 4 and columns "name" through "hire_date" in a single
# .loc call (label slices include both endpoints). This replaces the chained
# job.iloc[0:5].loc[:, "name":"hire_date"] form: chaining builds an
# intermediate frame and is the pattern that triggers SettingWithCopyWarning
# as soon as it is used for assignment.
job.loc[0:4, "name":"hire_date"]

# Pivoting Table
Format: table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], aggfunc={'D': 'mean', 'E': 'mean'}, fill_value=0)

In [None]:
job

In [None]:
# Average salary per job title: one output row per distinct value of "job".
job.pivot_table(index=["job"], values=["salary"], aggfunc="mean")

In [None]:
# Salary laid out as name (rows) x job (columns). margins=True appends an
# "All" row and column with the totals. Cells without data stay NaN; pass
# fill_value=0 to show zeros there instead.
job.pivot_table(
    index=["name"],
    columns=["job"],
    values=["salary"],
    aggfunc="sum",
    margins=True,
)

In [None]:
# Scratch preview of list repetition: evaluates to a 2-tuple whose first item
# holds three 4-element lists and whose second item is an empty list.
[["english"] * 4, ["math"] * 4, ["physics"] * 4], []

In [None]:
# Long-format gradebook: 4 students x 3 classes = 12 rows.
# "grade" and "active" hold the same numbers: the base values
# 10 / 100 / 1000 / 10000 multiplied by 1, 2 and 3 for the three classes.
score = pd.DataFrame({
    "student": 3 * ["Andy", "Bernie", "Cindy", "Deb"],
    "school" : 6 * ["Z", "Y"],
    "class"  : [subject for subject in ("english", "math", "physics") for _ in range(4)],
    "grade"  : [factor * base for factor in range(1, 4) for base in (10, 100, 1000, 10000)],
    "active" : [factor * base for factor in range(1, 4) for base in (10, 100, 1000, 10000)],
})
score

In [None]:
# One row per (student, school) pair, one column per class, cells = mean grade.
# Add margins=True to append "All" totals.
score.pivot_table(
    index=["student", "school"],
    columns=["class"],
    values="grade",
    aggfunc="mean",
)

In [None]:
# A different aggregation per value column: mean of "grade", max of "active".
# The aggregations are named by string rather than passed as np.mean / np.max:
# recent pandas versions emit a FutureWarning for NumPy callables here and
# recommend the string names, which select the same built-in reductions.
pd.pivot_table(
    score,
    values  = ['grade', 'active'],
    index   = ["student", "school"],
    aggfunc = {'grade': "mean", 'active': "max"},
    fill_value = 0
)

# Melting Table
Format: pd.melt(df, id_vars=["col1", "col2"], var_name="var_name", value_name="value_name")

In [None]:
# Wide-format staff table that the next cells melt into long format.
d1 = dict(
    Name=["Adi", "Lisa", "Budi"],
    ID=[1, 2, 3],
    Role=["CEO", "Editor", "Author"],
    Salary=[10, 20, 30],
)

df = pd.DataFrame(data=d1)

In [None]:
df

In [None]:
# Keep "Name" and "ID" as identifiers; every other column is unpivoted into
# the default "variable" / "value" pair of columns.
df_melted = df.melt(id_vars=["Name", "ID"])
df_melted

In [None]:
# Wide layout: one salary column per month. Melted into long form in the next cells.
job = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue', 'Tony', "John"],
    job=['Accounting', 'Engineering', 'Engineering', 'Accounting', 'Data', "Data"],
    salary_january=[100, 130, 160, 180, 130, 190],
    salary_february=[150, 100, 120, 140, 160, 110],
))

In [None]:
job

In [None]:
# Unpivot the per-month salary columns: the former column name goes into
# "month" and the cell value into "salary".
job.melt(id_vars=["name", "job"], var_name="month", value_name="salary")

# Lambda functions
Format -> lambda x: process(x)

In [None]:
def func(a):
    """Return ``a`` increased by 10 (named-function counterpart of the lambda shown next)."""
    increased = a + 10
    return increased

In [None]:
# Anonymous one-argument function that adds 10, bound to the name x so it can be called.
x = lambda a : a + 10
print(x(5))

In [None]:
# A lambda may take several arguments: add 10 to a, then divide the sum by b.
x = lambda a, b : (a + 10) / b
print(x(5, 3))

# Lambda in Pandas

## Apply

Format:  
  df.apply(function, axis)   
  df["column_name"].apply(function)  (a single column is a Series; Series.apply takes no axis argument)

In [None]:
# Raw performance scores per employee; the cells below convert them to percentages.
df = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue', 'Tony', "John"],
    performance_score=[476, 364, 459, 410, 303, 433],
))

df

In [None]:
def my_func(a, max_score=500):
    """Express a raw score as a percentage of the maximum attainable score.

    Parameters
    ----------
    a : number
        Raw score (anything that supports ``/`` and ``*`` with numbers works).
    max_score : number, default 500
        Score that corresponds to 100 %. The default reproduces the original
        hard-coded divisor, so existing one-argument calls behave as before.

    Returns
    -------
    The value of ``(a / max_score) * 100``.
    """
    return (a / max_score) * 100

In [None]:
# Row-wise apply: with axis=1 the lambda receives one row at a time and reads
# that row's 'performance_score' to compute the percentage of 500.
df["Percentage"] = df.apply(
    lambda row: row['performance_score'] / 500 * 100,
    axis=1,
)
df

In [None]:
# Same result via a single column: here the lambda receives each score value directly.
df["Percentage"] = df['performance_score'].apply(lambda score_value: score_value / 500 * 100)
df

In [None]:
# apply() also accepts a named function: my_func is called once per score value.
df['new_col'] = df['performance_score'].apply(my_func)
df

## Assign

Format: df.assign(new_column_name = function or series, ...)

In [None]:
# Start again from the raw scores so the assign() example is not affected by
# the columns added in the apply() section.
df = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue', 'Tony', "John"],
    performance_score=[476, 364, 459, 410, 303, 433],
))

df

In [None]:
# assign() returns a new frame with the extra column. The lambda receives the
# whole DataFrame, so the arithmetic runs on the full column at once.
df = df.assign(Percentage=lambda frame: frame['performance_score'] / 500 * 100)
df

## Map

In [None]:
df

In [None]:
# Upper-case every name with the vectorised .str accessor and store the result
# back in the same column.
df['name'] = df['name'].str.upper()
df

In [None]:
# Example Series with three strings and one missing value (np.nan).
s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
s

In [None]:
# Value -> replacement lookup. Series.map returns a new Series; entries that
# have no key in the dict (here 'rabbit' and the missing value) come back as NaN.
mapping_rules = dict(cat='kitty', dog='puppy')

s.map(mapping_rules)

In [None]:
s

# Aggregating and Grouping Dataframe

count – Number of non-null observations.  
sum – Sum of values.  
mean – Mean of values.  
median – Arithmetic median of values.  
min – Minimum.  
max – Maximum.  
mode – Mode.  
std – Standard deviation.  
var – Variance

In [None]:
# Roster with two numeric columns, used for the aggregation and groupby examples.
job = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue', 'Tony', "John"],
    job=['Accounting', 'Engineering', 'Engineering', 'Accounting', 'Data', "Data"],
    performance_score=[476, 364, 459, 410, 303, 433],
    salary=[150, 100, 120, 140, 160, 110],
))

In [None]:
job

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
job.describe()

In [None]:
# Apply the same four statistics to each selected column: one output row per
# statistic, one output column per input column.
to_agg_cols = ["performance_score", "salary"]
job[to_agg_cols].aggregate(["mean", "std", "var", "median"])

In [None]:
# A dict chooses different statistics per column; combinations that were not
# requested show up as NaN in the result.
job.aggregate(dict(
    performance_score=["min", "max"],
    salary=["mean", "std"],
))

In [None]:
# Format: df.groupby("col1").agg()

In [None]:
# Split the rows by job title, then compute every listed statistic for every
# viewed column within each group.
groupby_cols = ["job"]
view_cols    = ["performance_score", "salary"]
agg_list     = ["mean", "std", "median"]

job.groupby(groupby_cols)[view_cols].aggregate(agg_list)

In [None]:
# Per-job mean of the viewed columns.
job.groupby(groupby_cols)[view_cols].agg("mean")

In [None]:
# Per-job standard deviation of the viewed columns.
job.groupby(groupby_cols)[view_cols].agg("std")

In [None]:
# Per-job median of the viewed columns.
job.groupby(groupby_cols)[view_cols].agg("median")

In [None]:
# Column -> list of statistics, applied within each job group. The result has
# two-level column labels of the form (column, statistic).
col_agg = dict(
    performance_score=["min", "max"],
    salary=["mean", "std"],
)

job.groupby(groupby_cols).aggregate(col_agg)