In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datatable as dt

# Widen pandas' display limits so the long and wide frames below render without truncation.
# display.max_rows is set once: it used to be set to 500 and then immediately
# overridden with 999, so 999 is the value that was actually in effect.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#### Reading in the airquality data

In [33]:
# Read the raw air-quality CSV with datatable's reader; converted to pandas in the next cell.
airquality_csv = "../datasets/airquality.csv"
airquality = dt.fread(airquality_csv)

In [35]:
# Convert the datatable result into a pandas DataFrame for the rest of the notebook.
# Guarded so that re-running this cell after the conversion is a no-op rather than
# an AttributeError (a pandas DataFrame has no .to_pandas()).
if not isinstance(airquality, pd.DataFrame):
    airquality = airquality.to_pandas()

In [6]:
# Summarise dtypes and non-null counts; Ozone and Solar.R are the columns with missing values.
airquality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 7 columns):
Date       153 non-null object
Ozone      116 non-null float64
Solar.R    146 non-null float64
Wind       153 non-null float64
Temp       153 non-null int32
Month      153 non-null int32
Day        153 non-null int32
dtypes: float64(3), int32(3), object(1)
memory usage: 6.7+ KB


# Melting data 

In [7]:
# Preview the wide layout: one row per Date, one column per measurement.
airquality.head()

Unnamed: 0,Date,Ozone,Solar.R,Wind,Temp,Month,Day
0,1973-05-01,41.0,190.0,7.4,67,5,1
1,1973-05-02,36.0,118.0,8.0,72,5,2
2,1973-05-03,12.0,149.0,12.6,74,5,3
3,1973-05-04,18.0,313.0,11.5,62,5,4
4,1973-05-05,,,14.3,56,5,5


In [9]:
# Reshape wide -> long: keep Date as the identifier and stack the four
# measurement columns into the default 'variable' / 'value' pair.
airquality_melt = pd.melt(
    airquality,
    id_vars='Date',
    value_vars=['Ozone', 'Solar.R', 'Wind', 'Temp'],
)

In [10]:
# Long layout with pd.melt's default column names ('variable', 'value').
airquality_melt.head()

Unnamed: 0,Date,variable,value
0,1973-05-01,Ozone,41.0
1,1973-05-02,Ozone,36.0
2,1973-05-03,Ozone,12.0
3,1973-05-04,Ozone,18.0
4,1973-05-05,Ozone,


In [11]:
# Same melt as before, but with descriptive names for the two generated columns.
measurement_columns = ['Ozone', 'Solar.R', 'Wind', 'Temp']
airquality_melt = pd.melt(
    airquality,
    id_vars='Date',
    value_vars=measurement_columns,
    var_name='measurement',
    value_name='reading',
)
airquality_melt.head()

Unnamed: 0,Date,measurement,reading
0,1973-05-01,Ozone,41.0
1,1973-05-02,Ozone,36.0
2,1973-05-03,Ozone,12.0
3,1973-05-04,Ozone,18.0
4,1973-05-05,Ozone,


# Pivot() : UN-melting data

In [13]:
# While melting takes a set of columns and turns it into a single column, 
# pivoting will create a new column for each unique value in a specified column.

In [15]:
# .pivot_table() has an index parameter that specifies the columns you do not want pivoted;
# it plays the same role as the id_vars parameter of pd.melt().
# The two other parameters you need are columns (the column whose unique values become the new columns)
# and values (the column whose entries fill those new columns).


In [16]:
# Re-check the long frame (Date / measurement / reading) that is about to be pivoted.
airquality_melt.head()

Unnamed: 0,Date,measurement,reading
0,1973-05-01,Ozone,41.0
1,1973-05-02,Ozone,36.0
2,1973-05-03,Ozone,12.0
3,1973-05-04,Ozone,18.0
4,1973-05-05,Ozone,


In [25]:
# Un-melt: give every distinct 'measurement' its own column again, one row per Date.
airquality_by_date = airquality_melt.pivot_table(index='Date', columns='measurement', values='reading')
airquality_by_date.head()

measurement,Ozone,Solar.R,Temp,Wind
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1973-05-01,41.0,190.0,67.0,7.4
1973-05-02,36.0,118.0,72.0,8.0
1973-05-03,12.0,149.0,74.0,12.6
1973-05-04,18.0,313.0,62.0,11.5
1973-05-05,,,56.0,14.3


In [36]:
# Preview the wide frame again before reshaping it on Month/Day instead of Date.
airquality.head()

Unnamed: 0,Date,Ozone,Solar.R,Wind,Temp,Month,Day
0,1973-05-01,41.0,190.0,7.4,67,5,1
1,1973-05-02,36.0,118.0,8.0,72,5,2
2,1973-05-03,12.0,149.0,12.6,74,5,3
3,1973-05-04,18.0,313.0,11.5,62,5,4
4,1973-05-05,,,14.3,56,5,5


In [37]:
# Drop the Date column so that Month and Day act as the identifier columns from here on.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run once Date has already been removed.
airquality = airquality.drop(columns='Date', errors='ignore')

In [40]:
# value_vars is omitted, so every column other than Month/Day is stacked into
# the 'measurement' / 'reading' pair.
airquality_melt = pd.melt(
    airquality,
    id_vars=['Month', 'Day'],
    var_name='measurement',
    value_name='reading',
)

In [42]:
# Long layout keyed by Month and Day.
airquality_melt.head()

Unnamed: 0,Month,Day,measurement,reading
0,5,1,Ozone,41.0
1,5,2,Ozone,36.0
2,5,3,Ozone,12.0
3,5,4,Ozone,18.0
4,5,5,Ozone,


In [57]:
# Pivot back to wide form; passing two index columns yields a (Month, Day) row index.
airquality_pivot = airquality_melt.pivot_table(
    values='reading',
    index=['Month', 'Day'],
    columns='measurement',
)

In [58]:
# Display the pivoted frame: one row per (Month, Day), one column per measurement.
airquality_pivot

Unnamed: 0_level_0,measurement,Ozone,Solar.R,Temp,Wind
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1,41.0,190.0,67.0,7.4
5,2,36.0,118.0,72.0,8.0
5,3,12.0,149.0,74.0,12.6
5,4,18.0,313.0,62.0,11.5
5,5,,,56.0,14.3
5,6,28.0,,66.0,14.9
5,7,23.0,299.0,65.0,8.6
5,8,19.0,99.0,59.0,13.8
5,9,8.0,19.0,61.0,20.1
5,10,,194.0,69.0,8.6


In [65]:
# Inspect the row index produced by the pivot.
# NOTE(review): the saved output below was captured at execution count 65, i.e. after the
# reset_index cell (count 62) had already run, which is why it shows a RangeIndex.
# Run top to bottom on a fresh kernel, this cell shows the (Month, Day) MultiIndex instead.
airquality_pivot.index

RangeIndex(start=0, stop=153, step=1)

In [53]:
# Select the row for Month 5, Day 1 with a tuple key; this relies on the (Month, Day)
# index, so it must run before the index is reset.
airquality_pivot.loc[(5,1),:]

measurement
Ozone       41.0
Solar.R    190.0
Temp        67.0
Wind         7.4
Name: (5, 1), dtype: float64

In [62]:
# Move the Month/Day index levels back into ordinary columns.
# Reassign rather than use inplace=True, and only reset while Month/Day are still the
# index levels, so that re-running this cell does not insert a stray 'index' column.
if list(airquality_pivot.index.names) == ['Month', 'Day']:
    airquality_pivot = airquality_pivot.reset_index()

In [64]:
# After the reset the frame has a default RangeIndex, one entry per row.
airquality_pivot.index

RangeIndex(start=0, stop=153, step=1)

In [74]:
# Stack the long frame on top of itself so that every (Month, Day, measurement)
# key occurs twice; the original row labels are kept as-is.
airquality_dup = pd.concat([airquality_melt] * 2, axis=0)

In [75]:
# Preview the duplicated long frame.
airquality_dup.head()

Unnamed: 0,Month,Day,measurement,reading
0,5,1,Ozone,41.0
1,5,2,Ozone,36.0
2,5,3,Ozone,12.0
3,5,4,Ozone,18.0
4,5,5,Ozone,


In [76]:
# 612 rows = 153 source rows x 4 melted measurement columns.
airquality_melt.shape

(612, 4)

In [77]:
# Twice the row count of airquality_melt, confirming that every row was duplicated.
airquality_dup.shape

(1224, 4)

In [81]:
# Each (Month, Day, measurement) key now has two identical readings; aggfunc='mean'
# collapses them, so the table matches the pivot of the un-duplicated frame.
airquality_dup.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading', aggfunc='mean')

Unnamed: 0_level_0,measurement,Ozone,Solar.R,Temp,Wind
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1,41.0,190.0,67.0,7.4
5,2,36.0,118.0,72.0,8.0
5,3,12.0,149.0,74.0,12.6
5,4,18.0,313.0,62.0,11.5
5,5,,,56.0,14.3
5,6,28.0,,66.0,14.9
5,7,23.0,299.0,65.0,8.6
5,8,19.0,99.0,59.0,13.8
5,9,8.0,19.0,61.0,20.1
5,10,,194.0,69.0,8.6


# Split() and Get()

In [82]:
# Load the long-format Ebola dataset (Indicator / Country / Date / value).
ebola_csv = "../datasets/ebola_data_db_format.csv"
ebola = pd.read_csv(ebola_csv)

In [83]:
# Preview the first rows of the Ebola data.
ebola.head()

Unnamed: 0,Indicator,Country,Date,value
0,"Cumulative number of confirmed, probable and s...",Guinea,2015-03-10,3285.0
1,Cumulative number of confirmed Ebola cases,Guinea,2015-03-10,2871.0
2,Cumulative number of probable Ebola cases,Guinea,2015-03-10,392.0
3,Cumulative number of suspected Ebola cases,Guinea,2015-03-10,22.0
4,"Cumulative number of confirmed, probable and s...",Guinea,2015-03-10,2170.0
