## Importing Data with NumPy

In [2]:
import numpy as np

### np.loadtxt() vs np.genfromtxt()

In [None]:
# loadtxt() implies the data is ready to be directly imported and used
# genfromtxt() indicates that the function creates the dataset from the text file

In [6]:
lending_co_data_numeric1 = np.loadtxt('Lending-Company-Numeric-Data.csv', delimiter = ',')
lending_co_data_numeric1

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [8]:
lending_co_data_numeric2 = np.genfromtxt('Lending-Company-Numeric-Data.csv', delimiter = ',')
lending_co_data_numeric2

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [11]:
np.array_equal(lending_co_data_numeric1, lending_co_data_numeric2) # Identical dtasets

# loadtxt() is faster, but it breajs when we feed it incomplete or ill-formated datasets
# genfromtxt() is slower, but can handle missing values

True

In [14]:
lending_co_data_numeric_nan = np.loadtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter = ';')
lending_co_data_numeric_nan

ValueError: could not convert string '' to float64 at row 11, column 4.

In [17]:
lending_co_data_numeric_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter = ';')
lending_co_data_numeric_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [20]:
lending_co_data_numeric_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                            delimiter = ';',
                                            dtype = np.str_)
lending_co_data_numeric_nan

array([['2000', '40', '365', '3121', '4241', '13621'],
       ['2000', '40', '365', '3061', '4171', '15041'],
       ['1000', '40', '365', '2160', '3280', '15340'],
       ...,
       ['', '40', '365', '4201', '5001', '16600'],
       ['1000', '40', '365', '2080', '3320', '15600'],
       ['2000', '40', '365', '4601', '4601', '16600']], dtype='<U5')

### Partial Cleaning While Importing

In [21]:
# Basic import for comparison
lending_co_data_numeric_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter = ';')
lending_co_data_numeric_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [23]:
lending_co_data_numeric_nan = np.genfromtxt(
    'Lending-Company-Numeric-Data-NAN.csv', 
    delimiter = ';', 
    skip_header = 2) # Can be used to skip comments and notes from dataset owners

lending_co_data_numeric_nan

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [24]:
lending_co_data_numeric_nan = np.genfromtxt(
    'Lending-Company-Numeric-Data-NAN.csv', 
    delimiter = ';', 
    skip_footer = 2) # Same as before but at the bottom of the dataset

lending_co_data_numeric_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  3401.,    nan, 16600.],
       [ 2000.,    40.,   365.,    nan,  5440., 16600.],
       [   nan,    40.,   365.,  4201.,  5001., 16600.]])

In [31]:
lending_co_data_numeric_nan = np.genfromtxt(
    'Lending-Company-Numeric-Data-NAN.csv', 
    delimiter = ';', 
    usecols = (5,0,1)) 
    # Returns only the first column, to select specific columns specify them together as a tuple. 
    # Orders can be altered

lending_co_data_numeric_nan

array([[13621.,  2000.,    40.],
       [15041.,  2000.,    40.],
       [15340.,  1000.,    40.],
       ...,
       [16600.,    nan,    40.],
       [15600.,  1000.,    40.],
       [16600.,  2000.,    40.]])

In [32]:
lending_co_data_numeric_nan = np.genfromtxt(
    'Lending-Company-Numeric-Data-NAN.csv', 
    delimiter = ';', 
    usecols = (5,0,1),
    skip_header = 2,
    skip_footer = 2) 

lending_co_data_numeric_nan

array([[15340.,  1000.,    40.],
       [15321.,  2000.,    40.],
       [13720.,  2000.,    50.],
       ...,
       [16600.,  2000.,    40.],
       [16600.,  2000.,    40.],
       [16600.,    nan,    40.]])

In [8]:
# Tuple Assignment (Assigning each column to a tuple)
lending_co_data_5, lending_co_data_0, lending_co_data_1 = np.genfromtxt(
    'Lending-Company-Numeric-Data-NAN.csv', 
    delimiter = ';', 
    usecols = (5,0,1),
    skip_header = 2,
    skip_footer = 2,
    unpack = True) 

print("Column #4:", lending_co_data_1)
print("Column #1:", lending_co_data_2)
print("Column #2:", lending_co_data_3)

Column #4: [40. 40. 50. ... 40. 40. 40.]
Column #1: [1000. 2000. 2000. ... 2000. 2000.   nan]
Column #2: [40. 40. 50. ... 40. 40. 40.]


In [None]:
### Takeaways:

### The "np.genfromtxt()" is a NumPy function to load data from a an existing text file. While it is slower to loadtxt(),
### it can handle missing values as specified. 

### The "skip_header" and "skip_footer" properties from that function can be used to skip the specified rows of 
### data respectively.

### The "usecols" property allows to specify which column from the dataset/established table will be assigned/used.

### The "unpack" property allows to use tuple assignment to store the columns from the dataset and use them separately.