In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('dirty_cafe_sales.csv')
df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [None]:
df.shape

(10000, 8)

#standardizing column names

In [None]:
df.columns = df.columns.str.strip().str.replace(' ','_').str.lower()

#Handling duplicates

In [None]:
df['transaction_id'].nunique() #NO duplicate entries found even at schema level

10000

#Fixing data types

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    10000 non-null  object
 1   item              9667 non-null   object
 2   quantity          9862 non-null   object
 3   price_per_unit    9821 non-null   object
 4   total_spent       9827 non-null   object
 5   payment_method    7421 non-null   object
 6   location          6735 non-null   object
 7   transaction_date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [None]:
'''
except for the key identifiers, like transaction id, keep its dtype the way it is given by default. this is because it is not used for
any math purposes and hence is not even explicitly counted to be a numeric column. ONLY care for numeric columns when changing dtypes as
obj is by default.
'''

'''
in each and every column, when values are missing, it has 'ERROR', 'UNKNOWN', or 'NaN' placed there.
'''
#numeric
df['quantity'].unique()
df['price_per_unit'].unique()
df['total_spent'].unique()

#datetime
df['transaction_date'].unique()

#categorical
df['item'].unique()
df['location'].unique()
df['payment_method'].unique()

array(['Credit Card', 'Cash', 'UNKNOWN', 'Digital Wallet', 'ERROR',
       'Missing'], dtype=object)

In [None]:
df['quantity'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
df['price_per_unit'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
df['total_spent'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
df['transaction_date'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
df['item'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
df['location'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
df['payment_method'].replace(['ERROR', 'UNKNOWN', 'Missing'], np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['quantity'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price_per_unit'].replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte

In [None]:
#to float:-
df['quantity']=df['quantity'].astype('float')
df['price_per_unit']=df['price_per_unit'].astype('float')
df['total_spent']=df['total_spent'].astype('float')

#to datetime
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9086 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transaction_id    9086 non-null   object        
 1   item              9086 non-null   object        
 2   quantity          9086 non-null   float64       
 3   price_per_unit    9086 non-null   float64       
 4   total_spent       9086 non-null   float64       
 5   payment_method    6222 non-null   object        
 6   location          9086 non-null   object        
 7   transaction_date  9086 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 638.9+ KB


#handling missing values

In [None]:
df.isnull().mean()*100

Unnamed: 0,0
transaction_id,0.0
item,0.0
quantity,0.0
price_per_unit,0.0
total_spent,0.0
payment_method,31.521021
location,0.0
transaction_date,0.0


In [None]:
#handling missing values in categorical columns:-
df['item'].fillna(df['item'].mode()[0], inplace=True)
df['payment_method'].fillna('Missing', inplace=True)
df['location'].fillna('Missing', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['item'].fillna(df['item'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['payment_method'].fillna('Missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are s

In [None]:
#handling missing values in numerical columns:-
'''
NOTE:-whenever handling missing values in numerical columns, check for those columns which are derived from 2 or more columns. as in here, total_spent = price_per_unit*quantity.
under such situations, first impute the missing values of all the non derived columns and then, use the formula for derived columns in order to fill in their missing values.
'''
df.dropna(subset=['quantity'],inplace=True)
df['price_per_unit'].skew()
df['price_per_unit'].fillna(df['price_per_unit'].mean(), inplace=True)
df['total_spent'] = df['price_per_unit']*df['quantity']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price_per_unit'].fillna(df['price_per_unit'].mean(), inplace=True)


In [None]:
#handling missing values in datetime columns:-
df.dropna(subset=['transaction_date'], inplace=True)

In [None]:
df.isnull().mean()*100

Unnamed: 0,0
transaction_id,0.0
item,0.0
quantity,0.0
price_per_unit,0.0
total_spent,0.0
payment_method,0.0
location,0.0
transaction_date,0.0


#detecting and handling outliers

In [None]:
df['quantity'].skew() #normal
df['price_per_unit'].skew()#normal
df['total_spent'].skew()#normal

np.float64(0.8117151664654069)

In [None]:
'''
NO outliers present
'''

max_lim = df['quantity'].mean()+3*df['quantity'].std()
min_lim = df['quantity'].mean()-3*df['quantity'].std()

df[df['quantity']>max_lim]['transaction_id'].count()
df[df['quantity']<min_lim]['transaction_id'].count()

np.int64(0)

In [None]:
'''
NO outliers present
'''
max_lim = df['price_per_unit'].mean()+3*df['price_per_unit'].std()
min_lim = df['price_per_unit'].mean()-3*df['price_per_unit'].std()

df[df['price_per_unit']>max_lim]['transaction_id'].count()
df[df['price_per_unit']<min_lim]['transaction_id'].count()

np.int64(0)

In [None]:
'''
NO outliers present
'''
max_lim = df['total_spent'].mean()+3*df['total_spent'].std()
min_lim = df['total_spent'].mean()-3*df['total_spent'].std()

df[df['total_spent']>max_lim]['transaction_id'].count()
df[df['total_spent']<min_lim]['transaction_id'].count()

np.int64(0)

In [None]:
df['quantity']=df['quantity'].astype('int')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9086 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transaction_id    9086 non-null   object        
 1   item              9086 non-null   object        
 2   quantity          9086 non-null   int64         
 3   price_per_unit    9086 non-null   float64       
 4   total_spent       9086 non-null   float64       
 5   payment_method    9086 non-null   object        
 6   location          9086 non-null   object        
 7   transaction_date  9086 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 638.9+ KB
