<a href="https://colab.research.google.com/github/cleysonl/ML_Bootcamp_CLL/blob/master/Data_Processing_and_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Processing and Analysis**

In [0]:
import datetime
import random
from random import randrange
import numpy as np
import pandas as pd

In [0]:
def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1
        
        

def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """

    # sentinels
    startDate = datetime.datetime(2016, 1, 1, 13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    # base list of attributes
    data_dict = {
        'Serial No':
        np.arange(row_count) + serial_number_sentinel,
        'Date':
        np.random.permutation(
            pd.to_datetime([
                x.strftime("%d-%m-%Y")
                for x in _random_date(startDate, row_count)
            ]).date),
        'User ID':
        np.random.permutation(
            np.random.randint(0, row_count, size=int(row_count / 10)) +
            user_id_sentinel).tolist() * 10,
        'Product ID':
        np.random.permutation(
            np.random.randint(0, row_count, size=int(row_count / 10)) +
            product_id_sentinel).tolist() * 10,
        'Quantity Purchased':
        np.random.permutation(np.random.randint(1, 42, size=row_count)),
        'Price':
        np.round(
            np.abs(np.random.randn(row_count) + 1) * price_sentinel,
            decimals=2),
        'User Type':
        np.random.permutation(
            [chr(random.randrange(97, 97 + 3 + 1)) for i in range(row_count)])
    }

    # introduce missing values
    for index in range(int(np.sqrt(row_count))):
        data_dict['Price'][np.argmax(
            data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(
            data_dict['User Type'] == random.choice(
                data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(
            data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.
                                          choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.
                                         choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(
            data_dict['User ID']))] = -101

    # create data frame
    df = pd.DataFrame(data_dict)

    return df

## **Import dependencies**

In [0]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import preprocessing

pd.options.mode.chained_assignment = None

## **Generate dataset**

In [12]:
# Generate a dataset with 1000 rows
df = generate_sample_data(row_count=1000)
df.shape

(1000, 7)

### **Analyze generated Dataset**

In [13]:
df.head()

Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,1000,,-101,0,6,3690.5,n
1,-1,2016-01-01,5400,1054,7,159.62,n
2,1002,,5304,124,3,4280.63,n
3,1003,2016-03-02,5493,903,6,2157.57,n
4,1004,,5141,241,22,6850.55,n


**Dataframe stats**

In [14]:
print('Number of rows:', df.shape[0])

Number of rows: 1000


In [15]:
print('Number of columns:', df.shape[1])

Number of columns: 7


In [17]:
print('Column data types: \n', df.dtypes)

Column data types: 
 Serial No               int64
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int64
Price                 float64
User Type              object
dtype: object


In [19]:
print('Columns with missing values:', df.columns[df.isnull().any()].tolist())

Columns with missing values: ['Date', 'Price']


In [21]:
print('Number of rows with Missing values:', len(pd.isnull(df).any(1).nonzero()[0].tolist()))

Number of rows with Missing values: 60


  """Entry point for launching an IPython kernel.


**General Stats**

In [22]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Serial No             1000 non-null int64
Date                  969 non-null object
User ID               1000 non-null int64
Product ID            1000 non-null int64
Quantity Purchased    1000 non-null int64
Price                 969 non-null float64
User Type             1000 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 54.8+ KB
None


In [23]:
print(df.describe())

         Serial No      User ID   Product ID  Quantity Purchased        Price
count  1000.000000  1000.000000  1000.000000          1000.00000   969.000000
mean   1454.085000  5485.533000   641.354000            21.01600  2358.206770
std     384.806125   342.345178   291.477563            11.92451  1607.567586
min      -1.000000  -101.000000     0.000000             1.00000    11.860000
25%    1228.750000  5228.000000   376.000000            11.00000  1090.010000
50%    1484.500000  5506.000000   664.000000            21.00000  2198.970000
75%    1740.250000  5722.500000   909.000000            31.00000  3366.410000
max    1999.000000  5999.000000  1095.000000            41.00000  9500.200000


###

 ### **Standardize Columns**