In [1]:
import pandas as pd

# 1.&nbsp; Import a CSV file into a DataFrame


In [2]:
# Google Drive share link for the orders file.
url = "https://drive.google.com/file/d/1Vu0q91qZw6lqhIqbjoXYvYAQTmVHh6uZ/view?usp=share_link" # orders.csv
# The Drive file id is the second-to-last "/"-separated segment of the share link.
file_id = url.split("/")[-2]
# Build the direct-download URL and read it straight into a DataFrame.
path = "https://drive.google.com/uc?export=download&id=" + file_id
orders = pd.read_csv(path)

In [None]:
# Limit how many rows pandas renders when a DataFrame is displayed.
pd.set_option("display.max_rows", 10)

In [3]:
# Summarize the raw frame: columns, non-null counts, dtypes and memory usage.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


There are 5 missing values in `total_paid`.

# 2.&nbsp; Clean up missing & duplicates 

## 2.1 Check for Missing Values

In [4]:
# Count the missing values in each column.
orders.isna().sum()

order_id        0
created_date    0
total_paid      5
state           0
dtype: int64

Because so few values are missing (5 out of 226,909 rows), we will simply drop these rows; we have enough data without them.

In [6]:
# Keep only the rows where total_paid is present. The explicit .copy() makes
# the result an independent DataFrame, so later column assignments on `orders`
# are not treated as writes to a selection of the original frame
# (which can raise SettingWithCopyWarning).
orders = orders.loc[orders["total_paid"].notna()].copy()

## 2.2 Check for Duplicates

In [5]:
# Count fully duplicated rows (rows identical to an earlier row in every column).
orders.duplicated().sum()

0

In [7]:
# Count the distinct values in each column; a count equal to the number of
# rows means that column contains no repeated values.
orders.nunique()

order_id        226904
created_date    224823
total_paid       31236
state                5
dtype: int64

In [9]:
# (number of rows, number of columns)
orders.shape

(226904, 4)

[DataFrame.size](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.size.html) returns the total number of values that the DataFrame has (the number of rows multiplied by the number of columns):

In [11]:
# Total number of cells (rows x columns) in the frame.
orders.size

907616

We can check that `.size` and `.shape` agree:

In [12]:
# Rows multiplied by columns should equal the total cell count from .size.
n_rows, n_cols = orders.shape
n_rows * n_cols == orders.size

True

# 3.&nbsp; Data types

* `created_date` should be a datetime datatype

In [13]:
# Look at 10 randomly chosen rows of the dataset (no seed is set, so the
# selection differs on every run).
orders.sample(10)

Unnamed: 0,order_id,created_date,total_paid,state
74671,374217,2017-07-07 13:07:27,79.99,Shopping Basket
181597,482001,2018-01-04 15:43:54,1329.0,Place Order
196259,496742,2018-01-23 16:24:28,79.99,Pending
191829,492311,2018-01-16 18:36:59,1849.59,Place Order
223459,523951,2018-03-09 18:24:54,368.99,Shopping Basket
9484,308955,2017-01-13 17:25:52,59.99,Shopping Basket
46845,346338,2017-04-15 16:01:42,272.99,Cancelled
226628,527121,2018-03-14 11:59:58,1316.0,Shopping Basket
33412,332896,2017-03-07 11:58:51,819.8,Shopping Basket
81785,381359,2017-07-24 13:52:42,43.98,Shopping Basket


## 3.1 `created_date` should become datetime datatype

In [14]:
# Parse the created_date strings into pandas datetimes, replacing the column
# (its dtype goes from object to datetime64[ns]).
orders["created_date"] = pd.to_datetime(orders["created_date"])

In [15]:
# Re-check the dtypes to confirm that created_date is now datetime64[ns].
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226904 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   order_id      226904 non-null  int64         
 1   created_date  226904 non-null  datetime64[ns]
 2   total_paid    226904 non-null  float64       
 3   state         226904 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 8.7+ MB


In [16]:
# Take an independent copy so that `orders_cl` is its own cleaned frame;
# plain assignment would only create a second name for the same object,
# and any later change to one would silently show up in the other.
orders_cl = orders.copy()

In [17]:
# Show the 5 rows with the highest total_paid.
# NOTE(review): the top values are all around 214,5xx-214,7xx and in state
# "Shopping Basket" - these look like outliers; confirm before analysis.
orders_cl.nlargest(5, "total_paid")

Unnamed: 0,order_id,created_date,total_paid,state
96028,395611,2017-09-01 20:56:34,214747.53,Shopping Basket
2478,301934,2017-01-03 19:27:13,214746.63,Shopping Basket
183927,484334,2018-01-07 15:58:26,214714.31,Shopping Basket
61659,361200,2017-06-01 11:55:02,214642.47,Shopping Basket
40557,340045,2017-03-28 13:27:52,214517.58,Shopping Basket


In [18]:
# Show the 5 rows with the lowest total_paid.
# NOTE(review): the minimum is 0.0, including a "Completed" order - confirm
# whether zero-value orders are expected.
orders_cl.nsmallest(5, "total_paid")

Unnamed: 0,order_id,created_date,total_paid,state
150,296010,2017-01-09 23:47:00,0.0,Completed
264,299605,2017-01-01 10:33:46,0.0,Shopping Basket
308,299651,2017-01-01 12:23:58,0.0,Shopping Basket
377,299731,2017-01-01 14:52:18,0.0,Shopping Basket
380,299734,2017-01-01 14:55:20,0.0,Shopping Basket


Don't forget to download/save your new DataFrames. Also, give them an obvious name, so that you know they are the cleaned version and not the original DataFrame.

In [19]:
# Write the cleaned frame to disk first, so the CSV is produced in any
# environment (index=False keeps the row index out of the file).
orders_cl.to_csv("orders_cl.csv", index=False)

# google.colab only exists inside Colab. Trigger the browser download there;
# elsewhere, leave the file in the working directory instead of failing.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab: orders_cl.csv was saved to the working directory.")
else:
    files.download("orders_cl.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>