# Data Wrangling

## Libraries

### Install

In [1]:
#pip install missingno

### Imports

In [2]:
#Import the relevant modules
import os
import glob

import pandas as pd
import numpy as np

import datetime as dt
import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from pathlib import Path

import missingno as msno


## Read Data

In [3]:
# Project root: the parent of the current working directory
ROOT_DIR = os.path.normpath(os.sep.join([os.getcwd(), os.pardir]))

# Inclusive date window ('YYYY-MM-DD' strings) used to filter each dataset
fromDate, toDate = '2000-01-01', '2022-10-20'

In [4]:
def get_file_name(file_name):
    '''Return the bare name of a file: no directories and no extension.

    file_name is a file name or a full path. Both the backslash and the
    forward slash are treated as directory separators, so the result does
    not depend on the platform the path was built on. The last path
    component is cut at its first dot, so 'a.b.csv' yields 'a'.'''
    # Normalise the separators first: splitting on the backslash alone
    # returns the whole path when it is written with forward slashes.
    name_ext = file_name.replace("\\", "/").split("/")[-1]
    # Everything from the first dot onwards is treated as the extension
    name = name_ext.split(".")[0]
    return name

In [5]:
# File names of the CSV datasets to load (bare names; the folder is
# prepended in a later cell)
my_datasets_csv = [
    'eia_data_raw.csv',
    'fred_data_raw.csv',
    'offline_data_raw.csv']

In [6]:
# Project root: the parent of the current working directory
ROOT_DIR = os.path.normpath(os.getcwd() + os.sep + os.pardir)

# Folder that holds the interim datasets. os.path.join uses the separator of
# the running platform (on Windows the value is the same as before); the
# empty last component keeps the trailing separator.
path = os.path.join(ROOT_DIR, 'data', 'interim', '')

# Replace every entry with its full path, in place. Only the bare file name
# is joined onto the folder, so running this cell a second time does not
# prefix the folder twice.
my_datasets_csv[:] = [
    os.path.join(path, os.path.basename(name)) for name in my_datasets_csv
]

In [7]:
# Collects one filtered dataframe per CSV file, in the order of my_datasets_csv
data = []

for f in my_datasets_csv:
    # Show which dataset is being loaded
    print(get_file_name(f))

    # Load the file and discard its 'Unnamed: 0' column
    new_data = pd.read_csv(f).drop(columns=['Unnamed: 0'])

    # Keep only the rows whose 'Date' lies in [fromDate, toDate]
    on_or_after_start = new_data['Date'] >= fromDate
    on_or_before_end = new_data['Date'] <= toDate
    new_data = new_data.loc[on_or_after_start & on_or_before_end]

    # Report the size after filtering, then store the dataframe
    print(new_data.shape)
    data.append(new_data)

eia_data_raw
(274, 20)
fred_data_raw
(274, 8)
offline_data_raw
(274, 7)


## Merge dataframes

In [8]:
# Locate the dataframe with the most rows; on a tie the earliest one is kept
max_idx, max_val = 0, data[0].shape[0]

for i, frame in enumerate(data):
    size = frame.shape[0]
    if size > max_val:
        max_idx, max_val = i, size

In [9]:
# Start from the dataframe with the longest time series
df = data[max_idx]

# Fold every other dataframe into it, matching rows on 'Date'
for i, df_temp in enumerate(data):
    if i == max_idx:
        continue
    df = pd.merge_ordered(df, df_temp, on='Date')

# Use 'Date' as the index. The result is re-bound instead of modified in
# place: when there is a single dataset, df is still the very object stored
# in data, and an in-place set_index would strip its 'Date' column, so
# running this cell again would fail with a KeyError.
df = df.set_index('Date', drop=True)

In [10]:
# Sort the rows by index label, modifying df in place
df.sort_index(axis = 0, inplace = True)

#### Set Date format for index


In [11]:
# Parse the index labels as datetimes and use the result as the new index

df.index = pd.to_datetime(df.index)

## Explore features

### Check data types and counts

In [13]:
# (rows, columns) of the merged dataframe
df.shape

(274, 32)

In [None]:
# Print the dataframe summary (column dtypes and non-null counts)
df.info()

### Display head and tail of data

In [None]:
# Helper to preview both ends of a dataframe
def display_df(df, n_records):
    '''Show the last and then the first n_records rows of df.

    Each slice is rounded to one decimal and transposed before it is passed
    to display(). A line stating n_records and the total number of rows is
    printed ahead of each slice.'''
    size = len(df)
    last_rows = df.tail(n_records).round(1).T
    first_rows = df.head(n_records).round(1).T

    print(f'{n_records} / {size} records from the dataframe tail:')
    display(last_rows)

    print(f'\n{n_records} / {size} records from the dataframe head:')
    display(first_rows)

In [None]:
# Show the last 7 and the first 7 records of the merged dataframe
display_df(df, 7)

## Missing data

In [None]:
def count_missing(df):
    '''Summarise the missing values of every column of df.

    Returns a DataFrame indexed by column name with two columns: 'count'
    (number of null values) and '%' (percentage of rows that are null).
    Columns without any missing value are left out, and the rows are ordered
    from the most to the fewest missing values.'''

    nulls = df.isnull()
    missing = pd.concat([nulls.sum(), 100 * nulls.mean()], axis=1)
    missing.columns = ['count', '%']
    missing = missing.loc[missing['count'] > 0]
    # sort_values returns a new frame, so the result has to be kept;
    # calling it without assigning silently leaves the table unsorted.
    # mergesort is stable: columns with equal counts keep their order.
    missing = missing.sort_values(by='count', ascending=False, kind='mergesort')

    return missing

### Missing data statistics

In [None]:
# Count the missing values per column of the merged dataframe and show the table
missing_stats = count_missing(df)
display(missing_stats)

### Missing data plot

In [None]:
# Visualize the missing data using the "missingno" library
msno.matrix(df)
plt.show()

## Treat missing data

### 1. Set tolerances

In [None]:
### Tolerance for missing data %

# Explain how the two thresholds decide the treatment of a column
print("""
Drop Columns:\t missing data (%) > tolerance_drop 
Drop Rows:\t tolerance_impute < missing data (%) < tolerance_drop
Impute Data:\t missing data (%) < tolerance_impute
""")

# Thresholds, in percent of missing values per column
tolerance_drop, tolerance_impute = 10, 1

# Echo the chosen thresholds
for label, value in (
    ('tolerance_drop =', tolerance_drop),
    ('tolerance_impute =', tolerance_impute),
):
    print(label, value)

### 2. Drop Columns

#### Columns with NaN to drop

It is recommended to drop the following features, whose share of missing data exceeds the drop tolerance (%), and to look for other data sources for them.

In [None]:
print('Columns with NaN to drop:')

# Rows of the missing-data table whose percentage exceeds the drop tolerance
drop_criteria = missing_stats['%'] > tolerance_drop
columns_to_drop = missing_stats[drop_criteria]

display(columns_to_drop.round(2))

#### Drop column and store

In [None]:
# Build a copy of df without the columns listed in columns_to_drop;
# df itself is left unchanged
dropped_col = df.drop(columns_to_drop.index, axis='columns')

# Compare the shapes before and after the drop
print('Original data:', df.shape)
print('Missing droppped:', dropped_col.shape)

### 3. Impute missing data

In [None]:
print('Columns with NaN to impute:')

# Rows of the missing-data table whose percentage is below the impute tolerance
impute_criteria = missing_stats['%'] < tolerance_impute
columns_to_impute = missing_stats[impute_criteria]

display(columns_to_impute.round(2))

#### Plot where data is missing

In [None]:
def plot_timeseries(df, target_feature):
    '''Plot one column of df against the index and mark its missing values.

    The column target_feature is drawn as a line. Every row where it is null
    gets a red marker placed at the column's median, because a missing value
    has no y position of its own.'''

    fig, ax = plt.subplots(figsize=(8, 2))

    # Line for the feature itself
    sns.lineplot(x=df.index, y=target_feature, data=df)

    # Rows with a gap, and the height at which they are marked
    missing_rows = df[df[target_feature].isnull()]
    marker_height = df[target_feature].median()
    sns.scatterplot(
        x=missing_rows.index,
        y=marker_height,
        data=missing_rows,
        marker="o",
        color="r",
    )

    # Automatic date ticks on the x axis, labelled as YYYY-MM-DD
    x_axis = ax.xaxis
    x_axis.set_major_locator(mdates.AutoDateLocator())
    x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    # Slant the date labels, which tend to be too long to sit side by side
    fig.autofmt_xdate()

    plt.show()

In [None]:
# Plot every column that is 'TO BE IMPUTED' together with the location of its
# missing values. A plain loop is used because the call is made only for its
# side effect: a list comprehension builds a throwaway list of None that the
# notebook then echoes as the cell output.
for item in columns_to_impute.index:
    plot_timeseries(dropped_col, item)

In [None]:
# Impute with a forward fill followed by a backward fill.
# Series.ffill()/bfill() are used because fillna(method=...) is deprecated
# in recent pandas releases.
for item in columns_to_impute.index:
    # Forward fill: carry the last known value forward
    forward_filled = dropped_col[item].ffill()
    # Backward fill: covers gaps at the start of the series, which have no
    # earlier value to carry forward
    dropped_col[item] = forward_filled.bfill()

In [None]:
# Shape of the dataframe after imputation
print('Missing imputed:', dropped_col.shape)

### Drop Rows

In [None]:
# Recount the missing data on the dataframe left after the column drop and imputation
missing_stats = count_missing(dropped_col)

In [None]:
# Columns that still contain NaN; their incomplete rows are dropped next
print('Columns with NaN to drop row:')
display(missing_stats.round(2))

In [None]:
# Keep only the rows without any NaN; dropped_col itself is left unchanged
dropped_row = dropped_col.dropna(axis='index')

In [None]:
# Shape of the dataframe after the incomplete rows were removed
print('Missing droppped:', dropped_row.shape)

### Use df with no NaNs

In [None]:
# From here on, df refers to the dataframe without NaNs
df = dropped_row

## Explore data

### Check final DataFrame

In [None]:
# Display the last 7 and the first 7 records of the cleaned dataframe
display_df(df, 7)

### Target feature

In [None]:
# Line chart of the target feature on a wide figure
plt.figure(figsize=(16, 7))

# Series.plot draws on the current axes and returns them
ax = df['wti_price'].plot()
ax.set_title('WTI Spot Price')
ax.set_xlabel('Date')

plt.show()

### Summary Statistics

In [None]:
# Summary statistics, rounded to 2 decimals, with one row per feature
df.describe().round(2).T

### Histograms

In [None]:
# Plot a histogram of every feature
# hspace=0.5 adds vertical space between the subplot rows to keep the plots legible
# The trailing semicolon suppresses the notebook echo of the last expression
df.hist(figsize=(15,15))
plt.subplots_adjust(hspace=0.5);


## Store data

In [None]:
# Target folder: 'data/interim' under the project root (the same folder the
# raw CSVs were read from). os.path.join uses the separator of the running
# platform, so the file is not written under a literal backslash name on
# non-Windows systems; the empty last component keeps the trailing separator.
path = os.path.join(ROOT_DIR, 'data', 'interim', '')

# Full name of the output file
file_save = os.path.join(path, 'step2_data_wrangling_wti_price.csv')
df.to_csv(file_save)

print('Save:\n', file_save)