# Data Transformation using Pandas
### Data Science Pipeline Workshop 11 Juni 2022
- Author : Randy Galawana
- Email  : randy_galawana1@telkomsel.co.id

&copy; Telkomsel 2022

In [None]:
import pandas as pd

In [None]:
# Load the IMDB dataset directly from the workshop's GitHub repository.
df = pd.read_csv(
    filepath_or_buffer='https://github.com/galawana/tselworkshop1/raw/main/data/imdb_database.csv',
)

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Preview the first rows of the dataset.
df.head()

### Pivot

In [None]:
# Bucket each movie's country: keep the four named countries and label every
# other value (including missing ones) as 'Others'. Done with the vectorised
# isin/where pair rather than a per-row Python lambda.
df['country_group'] = df['country'].where(
    df['country'].isin(['USA', 'Hong Kong', 'Japan', 'UK']),
    'Others',
)
# Budget scaled down by one million (budget_USD -> millions).
df['budget_USD_mio'] = df['budget_USD'] / 1_000_000

In [None]:
# One row per title_year, one column per country_group, cells hold the
# aggregated budget_USD_mio (pivot_table aggregates with the mean by default).
# reset_index() turns title_year back into a regular column.
df_pivot = (
    df
    .pivot_table(
        index='title_year',
        columns='country_group',
        values='budget_USD_mio',
    )
    .reset_index()
)

In [None]:
# Year/country-group combinations with no data become 0 instead of NaN.
df_pivot = df_pivot.fillna(0)

In [None]:
# Keep only the rows whose title_year is later than 1990.
df_pivot.loc[df_pivot['title_year'] > 1990]

### Melt

In [None]:
# Take 10 random rows, restricted to the identifier columns plus three
# attribute columns that will be melted below.
df_test = df.loc[
    :, ['movie_ID', 'movie_title', 'content_rating', 'language', 'color']
].sample(n=10)

In [None]:
# Display the sampled rows (wide format, before melting).
df_test

In [None]:
# Wide -> long: movie_ID and movie_title stay as identifier columns, every
# other column becomes an ('option', 'value') pair on its own row.
df_melt = df_test.melt(
    id_vars=['movie_ID', 'movie_title'],
    var_name='option',
    value_name='value',
)

In [None]:
# Show the first 10 rows of the long-format result.
df_melt.head(10)

### Stack
Moves the column labels into the innermost level of the row index. It is most useful on a multi-indexed DataFrame, e.g. a pivoted DataFrame.

In [None]:
# Pivot again, this time with a two-level row index (title_year, color) and
# one column per country_group; the index is kept (no reset_index).
df_pivot = df.pivot_table(
    index=['title_year', 'color'],
    columns='country_group',
    values='budget_USD_mio',
)

In [None]:
# Move the country_group column labels into the innermost index level, so the
# result is indexed by (title_year, color, country_group).
# dropna=False keeps the combinations whose value is NaN.
# NOTE(review): recent pandas releases deprecate the `dropna` argument of
# stack() in favour of the `future_stack` implementation - confirm against the
# installed pandas version.
df_stack = df_pivot.stack(level=-1, dropna=False)
df_stack

### Unstack
Works on indexed data, e.g. the result of a stack or a pivot; it moves an index level back into the columns.

In [None]:
# Pivot the 'color' index level into columns.
df_stack.unstack(level='color')

In [None]:
# Pivot the 'title_year' index level into columns.
df_stack.unstack(level='title_year')

### Merge / Join

In [None]:
# Load the listings file and keep only the columns used in the join examples.
df_listings = pd.read_csv('data/airbnb_listings.csv').loc[
    :, ['id', 'host_id', 'host_name', 'price', 'name', 'minimum_nights']
]

In [None]:
# Print the column dtypes and non-null counts of the listings table.
df_listings.info()

In [None]:
# (rows, columns) of the listings table.
df_listings.shape

In [None]:
# Load the reviews and keep a random 40% sample of the rows.
df_reviews = pd.read_csv('data/airbnb_reviews.csv').sample(frac=0.4)
# Rename the 'date' column to 'review_date'.
df_reviews = df_reviews.rename(columns={'date': 'review_date'})
df_reviews.info()

In [None]:
# (rows, columns) of the sampled reviews table.
df_reviews.shape

#### Inner Join

In [None]:
# Rename 'id' to 'listing_id' so the listings table exposes the key column
# name that the merges below join on.
df_listings = df_listings.rename(columns={'id': 'listing_id'})


In [None]:
# Inner join: keep only the rows whose listing_id exists in both tables.
df_join_inner = pd.merge(df_reviews, df_listings, how='inner', on='listing_id')

In [None]:
# Preview the first rows of the inner-join result.
df_join_inner.head()

In [None]:
# When the row count equals that of df_reviews, every listing_id in df_reviews
# has a matching row in df_listings.
df_join_inner.shape

In [None]:
# Simulate incomplete data: keep a random 60% of the listings so that some
# reviews may no longer have a matching listing.
df_listings_sample = df_listings.sample(frac=0.6)

In [None]:
# An inner join keeps only the reviews whose listing_id is also present in
# the sampled listings, so the row count is expected to drop.
df_join_inner2 = pd.merge(df_reviews, df_listings_sample, how='inner', on='listing_id')
df_join_inner2.shape

#### Left Join

In [None]:
# Left join: every row of the left DataFrame (df_reviews) is kept in the output.
df_join_left = pd.merge(df_reviews, df_listings_sample, how='left', on='listing_id')
df_join_left.shape

In [None]:
# Rows that have no matching key in the listings sample get null values in
# the columns coming from the listings table.
df_join_left.info()

#### Right Join

In [None]:
# Right join: every row of the right DataFrame (df_listings_sample) is kept in the output.
df_join_right = df_reviews.merge(df_listings_sample, on='listing_id', how='right')
df_join_right.shape

In [None]:
# Rows that have no matching key in df_reviews get null values in the
# columns coming from the reviews table.
df_join_right.info()

#### Outer Join

In [None]:
# Outer join: every row from both the left and the right DataFrame is kept.
df_join_outer = pd.merge(df_reviews, df_listings_sample, how='outer', on='listing_id')
df_join_outer.shape

In [None]:
# Inspect the non-null counts; after an outer join, nulls can appear in the
# columns of either source table.
df_join_outer.info()

### Union / Concat

In [None]:
# First user-data part, read from the workshop repository.
df1 = pd.read_parquet(
    path='https://github.com/galawana/tselworkshop1/raw/main/data/userdata1.parquet',
)
# Column labels, then (rows, columns), one per line.
print(df1.columns, df1.shape, sep='\n')


In [None]:
# Second user-data part, read from the workshop repository.
df2 = pd.read_parquet(
    path='https://github.com/galawana/tselworkshop1/raw/main/data/userdata2.parquet',
)
# Column labels, then (rows, columns), one per line.
print(df2.columns, df2.shape, sep='\n')


In [None]:
# Stack the rows of df2 underneath df1 (row-wise union). The original row
# labels of each part are kept because ignore_index is left at its default.
df_concat = pd.concat(objs=[df1, df2])

In [None]:
# (rows, columns) of the concatenated result.
df_concat.shape

In [None]:
# Preview the first rows of the concatenated result.
df_concat.head()

### Transpose

In [None]:
# First rows of df in their original orientation.
df.head()

In [None]:
# Transpose the preview: the column labels become the row index and each
# original row becomes a column.
df.head().T

### Groupby / Aggregate

In [None]:
# Per-country summary using named aggregation: number of non-null movie_ID
# values, plus the summed budget, gross and net amounts.
df.groupby('country').agg(
    count=('movie_ID', 'count'),
    budget=('budget_USD', 'sum'),
    gross=('gross_USD', 'sum'),
    net=('net_USD', 'sum'),
)