# Loading the data and removing NaN

In [20]:
import pandas as pd

# Read the raw coffee-sales export and discard the 'card' column up front.
df = pd.read_csv('../../testDatasets/coffee_sales.csv').drop(columns='card')

# Report the row count on either side of the NaN filter so any loss is visible.
print(f"Length of DataFrame: {len(df)}")
df = df.dropna()  # defaults: axis=0, how='any' -> drop every row containing a NaN
print(f"Length of DataFrame after dropping rows with any NaN values: {len(df)}")
df.head()

Length of DataFrame: 3636
Length of DataFrame after dropping rows with any NaN values: 3636


Unnamed: 0,date,datetime,cash_type,money,coffee_name
0,2024-03-01,2024-03-01 10:15:50.520,card,38.7,Latte
1,2024-03-01,2024-03-01 12:19:22.539,card,38.7,Hot Chocolate
2,2024-03-01,2024-03-01 12:20:18.089,card,38.7,Hot Chocolate
3,2024-03-01,2024-03-01 13:46:33.006,card,28.9,Americano
4,2024-03-01,2024-03-01 13:48:14.626,card,38.7,Latte


# Dealing with 'datetime' & 'date'

In [21]:
import pandas as pd
import numpy as np

# Parse the 'datetime' strings into pandas datetimes, then express each one as
# whole seconds since the Unix epoch (1970-01-01 00:00:00).
df['datetime'] = pd.to_datetime(df['datetime'])

# Vectorised epoch conversion (the recipe from the pandas time-series guide)
# instead of calling Timestamp.timestamp() row by row through .apply().
# Floor division discards the sub-second part and yields an integer column.
# NOTE(review): this assumes the parsed values are timezone-naive, as in the
# sample data; tz-aware values would need a UTC-localised epoch -- confirm.
df['timestamp'] = (df['datetime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

# Optional check that 'date' carries no information beyond 'datetime'.
# Both sides are compared as midnight-normalised datetimes:
# assert (df['datetime'].dt.normalize() == pd.to_datetime(df['date'])).all()

# The numeric 'timestamp' replaces both raw columns. Because they are dropped
# here, this cell cannot be re-run on its own; re-run the loading cell first.
df = df.drop(columns=['date', 'datetime'])
df.head()


Unnamed: 0,cash_type,money,coffee_name,timestamp
0,card,38.7,Latte,1709288150
1,card,38.7,Hot Chocolate,1709295562
2,card,38.7,Hot Chocolate,1709295618
3,card,28.9,Americano,1709300793
4,card,38.7,Latte,1709300894


# Dealing with 'cash_type'

In [23]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual 'cash_type' labels with the integer codes produced by a
# fitted LabelEncoder; `le` keeps the fitted encoder available afterwards.
le = LabelEncoder().fit(df['cash_type'])
df['cash_type'] = le.transform(df['cash_type'])

# Preview the rows whose encoded payment type is anything other than 1.
df.loc[df['cash_type'].ne(1)].head()

Unnamed: 0,cash_type,money,coffee_name,timestamp
0,0,38.7,Latte,1709288150
1,0,38.7,Hot Chocolate,1709295562
2,0,38.7,Hot Chocolate,1709295618
3,0,28.9,Americano,1709300793
4,0,38.7,Latte,1709300894


# Dealing with 'coffee_name'

In [24]:
# One-hot encode 'coffee_name': every distinct drink becomes its own indicator column.
df_encoded = pd.get_dummies(data=df, columns=['coffee_name'])

In [26]:
# Preview the first five rows of the one-hot encoded frame.
df_encoded.head(n=5)

Unnamed: 0,cash_type,money,timestamp,coffee_name_Americano,coffee_name_Americano with Milk,coffee_name_Cappuccino,coffee_name_Cocoa,coffee_name_Cortado,coffee_name_Espresso,coffee_name_Hot Chocolate,coffee_name_Latte
0,0,38.7,1709288150,False,False,False,False,False,False,False,True
1,0,38.7,1709295562,False,False,False,False,False,False,True,False
2,0,38.7,1709295618,False,False,False,False,False,False,True,False
3,0,28.9,1709300793,True,False,False,False,False,False,False,False
4,0,38.7,1709300894,False,False,False,False,False,False,False,True


In [29]:
# Normalise every column label to lowercase.
df_encoded.columns = df_encoded.columns.map(str.lower)
df_encoded.head()

Unnamed: 0,cash_type,money,timestamp,coffee_name_americano,coffee_name_americano with milk,coffee_name_cappuccino,coffee_name_cocoa,coffee_name_cortado,coffee_name_espresso,coffee_name_hot chocolate,coffee_name_latte
0,0,38.7,1709288150,False,False,False,False,False,False,False,True
1,0,38.7,1709295562,False,False,False,False,False,False,True,False
2,0,38.7,1709295618,False,False,False,False,False,False,True,False
3,0,28.9,1709300793,True,False,False,False,False,False,False,False
4,0,38.7,1709300894,False,False,False,False,False,False,False,True


In [30]:
# Persist the cleaned, encoded frame; the index is left out of the file.
df_encoded.to_csv(path_or_buf='../../testDatasets/coffee_sales_cleaned.csv', index=False)