# Data Load and Wrangle

## The goal of the exercise is to load a dataset, clean and transform and compute descriptive statistics

In [1]:
# This piece of code enables to display multiple output within a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

<div class="alert alert-block alert-warning">
    
## Task Instructions 
### Load one of the built in dataset
### Transform the data as required 
### From a perspective of a journalist interested in writing a story about the data, **_ask and answer_** **3** questions that summarize the sample. 
    
#### Please *_only use summary statistics_* (mean, median, mode, standard deviation, variance, range,..). The statistics can be computed by groups.

#### Reference :https://kolesnikov.ga/Datasets_in_Python/

</div>

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm 
co2 = sm.datasets.co2
print(co2.DESCRLONG)


Atmospheric CO2 from Continuous Air Samples at Mauna Loa Observatory, Hawaii, U.S.A.

Period of Record: March 1958 - December 2001

Methods: An Applied Physics Corporation (APC) nondispersive infrared gas analyzer was used to obtain atmospheric CO2 concentrations, based on continuous data (four measurements per hour) from atop intake lines on several towers. Steady data periods of not less than six hours per day are required; if no such six-hour periods are available on any given day, then no data are used that day. Weekly averages were calculated for most weeks throughout the approximately 44 years of record. The continuous data for year 2000 is compared with flask data from the same site in the graphics section.


In [6]:
# Journalist: "How many samples were recorded?"
print(co2.NOTE)


::

    Number of observations: 2225
    Number of variables: 2
    Variable name definitions:

        date - sample date in YYMMDD format
        co2 - CO2 Concentration ppmv

    The data returned by load_pandas contains the dates as the index.



In [None]:
# answer: "2225 samples were taken."

In [3]:
dataset_co2=co2.load_pandas()



In [4]:
df_co2=dataset_co2.data
df_co2.head()
    

Unnamed: 0,co2
1958-03-29,316.1
1958-04-05,317.3
1958-04-12,317.6
1958-04-19,317.5
1958-04-26,316.4


In [5]:
sm.datasets.co2.load_pandas().data

Unnamed: 0,co2
1958-03-29,316.1
1958-04-05,317.3
1958-04-12,317.6
1958-04-19,317.5
1958-04-26,316.4
...,...
2001-12-01,370.3
2001-12-08,370.8
2001-12-15,371.2
2001-12-22,371.3


In [6]:
dataset_co2 = co2.load_pandas()
df_co2 = dataset_co2.data
df_co2.head()

Unnamed: 0,co2
1958-03-29,316.1
1958-04-05,317.3
1958-04-12,317.6
1958-04-19,317.5
1958-04-26,316.4


In [10]:
df_co2.mean()



co2    340.142247
dtype: float64

In [11]:
# Journalist: "what was the highest co2 value recorded in this data set?"
df_co2.describe()
#answer: "the highest level found was 373.9"

Unnamed: 0,co2
count,2225.0
mean,340.142247
std,17.003885
min,313.0
25%,324.8
50%,338.3
75%,354.8
max,373.9


In [14]:
df_co2['recordeddate']=df_co2.index
df_co2.head()


Unnamed: 0,co2,recordeddate
1958-03-29,316.1,1958-03-29
1958-04-05,317.3,1958-04-05
1958-04-12,317.6,1958-04-12
1958-04-19,317.5,1958-04-19
1958-04-26,316.4,1958-04-26


In [15]:
df_co2.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2284 entries, 1958-03-29 to 2001-12-29
Freq: W-SAT
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   co2           2225 non-null   float64       
 1   recordeddate  2284 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1)
memory usage: 53.5 KB


In [17]:
df_co2['recordedyear']=df_co2['recordeddate'].dt.year
df_co2.head()


Unnamed: 0,co2,recordeddate,recordedyear
1958-03-29,316.1,1958-03-29,1958
1958-04-05,317.3,1958-04-05,1958
1958-04-12,317.6,1958-04-12,1958
1958-04-19,317.5,1958-04-19,1958
1958-04-26,316.4,1958-04-26,1958


In [18]:
df_co2['recordeddate']


1958-03-29   1958-03-29
1958-04-05   1958-04-05
1958-04-12   1958-04-12
1958-04-19   1958-04-19
1958-04-26   1958-04-26
                ...    
2001-12-01   2001-12-01
2001-12-08   2001-12-08
2001-12-15   2001-12-15
2001-12-22   2001-12-22
2001-12-29   2001-12-29
Freq: W-SAT, Name: recordeddate, Length: 2284, dtype: datetime64[ns]

In [20]:
df_co2['recordedmonth']=df_co2['recordeddate'].dt.month
df_co2.head()

Unnamed: 0,co2,recordeddate,recordedyear,recordedymonth,recordedmonth
1958-03-29,316.1,1958-03-29,1958,3,3
1958-04-05,317.3,1958-04-05,1958,4,4
1958-04-12,317.6,1958-04-12,1958,4,4
1958-04-19,317.5,1958-04-19,1958,4,4
1958-04-26,316.4,1958-04-26,1958,4,4


In [21]:
# Journalist: "what were the highest and lowest levels of co2 found in this database?"
bins=[300, 325,350,400]
names=['low','medium','high']
df_co2['co2_cat']=pd.cut(df_co2['co2'], bins, labels=names)
df_co2.head()


Unnamed: 0,co2,recordeddate,recordedyear,recordedymonth,recordedmonth,co2_cat
1958-03-29,316.1,1958-03-29,1958,3,3,low
1958-04-05,317.3,1958-04-05,1958,4,4,low
1958-04-12,317.6,1958-04-12,1958,4,4,low
1958-04-19,317.5,1958-04-19,1958,4,4,low
1958-04-26,316.4,1958-04-26,1958,4,4,low


In [22]:
df_co2.tail()

Unnamed: 0,co2,recordeddate,recordedyear,recordedymonth,recordedmonth,co2_cat
2001-12-01,370.3,2001-12-01,2001,12,12,high
2001-12-08,370.8,2001-12-08,2001,12,12,high
2001-12-15,371.2,2001-12-15,2001,12,12,high
2001-12-22,371.3,2001-12-22,2001,12,12,high
2001-12-29,371.5,2001-12-29,2001,12,12,high


In [None]:
# answer: "the highest levels of Co2 were from December 2001, and the lowest were in April of 1958"

In [24]:
# Journalist:"where is the divide in between high and low co2 levels?"
df_co2.median()


  df_co2.median()


co2                338.3
recordedyear      1980.0
recordedymonth       7.0
recordedmonth        7.0
dtype: float64

In [None]:
#answer: The median occured in 1980 with a value of 338.3

In [25]:
#journalist: "may I please see the data with now the exact days instead of just year and month. "
df_co2['recordedday']=df_co2['recordeddate'].dt.day
df_co2.head()

Unnamed: 0,co2,recordeddate,recordedyear,recordedymonth,recordedmonth,co2_cat,recordedday
1958-03-29,316.1,1958-03-29,1958,3,3,low,29
1958-04-05,317.3,1958-04-05,1958,4,4,low,5
1958-04-12,317.6,1958-04-12,1958,4,4,low,12
1958-04-19,317.5,1958-04-19,1958,4,4,low,19
1958-04-26,316.4,1958-04-26,1958,4,4,low,26


In [28]:
df_co2.tail()


Unnamed: 0,co2,recordeddate,recordedyear,recordedymonth,recordedmonth,co2_cat,recordedday
2001-12-01,370.3,2001-12-01,2001,12,12,high,1
2001-12-08,370.8,2001-12-08,2001,12,12,high,8
2001-12-15,371.2,2001-12-15,2001,12,12,high,15
2001-12-22,371.3,2001-12-22,2001,12,12,high,22
2001-12-29,371.5,2001-12-29,2001,12,12,high,29
