# Transformation and cleaning
### Purpose: clean and format raw order data from an external CSV source and write the result to a new, cleaned CSV file

## 01 Basic data entry

In [1]:
# Root folder of the project data (absolute path on the author's machine)
basepath = r'C:\Users\admin\Codes\Portfolio\02_Etude\01_Project_Data_manipulating\subproject_02\data_02'
# Sub-folder of basepath that holds the raw file, and the name of the raw file
inputfolder, inputfile = 'Input', 'Orders_2011.csv'

## 02 Read the input file

In [2]:
# Imports used by this cell: os assembles the file path, pandas loads the data
import os

import pandas as pd

# Read the raw file into a dataframe (parsed as semicolon-separated, UTF-8 encoded).
# The path is assembled with os.path.join instead of hard-coded '\\' separators,
# so the separator always matches the platform the notebook runs on.
df = pd.read_csv(os.path.join(basepath, inputfolder, inputfile), sep = ';', encoding = 'utf-8')
print('Data is in the DataFrame.')

df

Data is in the DataFrame.


Unnamed: 0,OrderDate,Country,LineTotal,ProductName,ProductNumber,Class
0,2011.05.31 0:00,AU,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
1,2011.05.31 0:00,CA,357827,"Road-150 Red, 62",BK-R93R-62,H
2,2011.05.31 0:00,FR,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
3,2011.05.31 0:00,US,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
4,2011.05.31 0:00,DE,2024994,"Mountain-100 Black, 42",BK-M82B-42,H
...,...,...,...,...,...,...
5711,2011.12.31 0:00,US,357827,"Road-150 Red, 56",BK-R93R-56,H
5712,2011.12.31 0:00,US,357827,"Road-150 Red, 62",BK-R93R-62,H
5713,2011.12.31 0:00,US,357827,"Road-150 Red, 48",BK-R93R-48,H
5714,2011.12.31 0:00,US,357827,"Road-150 Red, 52",BK-R93R-52,H


## 03 Cleaning data

In [3]:
# Trim leading and trailing whitespace from every value of the "Class" column
df = df.assign(Class = df['Class'].str.strip())
df

Unnamed: 0,OrderDate,Country,LineTotal,ProductName,ProductNumber,Class
0,2011.05.31 0:00,AU,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
1,2011.05.31 0:00,CA,357827,"Road-150 Red, 62",BK-R93R-62,H
2,2011.05.31 0:00,FR,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
3,2011.05.31 0:00,US,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
4,2011.05.31 0:00,DE,2024994,"Mountain-100 Black, 42",BK-M82B-42,H
...,...,...,...,...,...,...
5711,2011.12.31 0:00,US,357827,"Road-150 Red, 56",BK-R93R-56,H
5712,2011.12.31 0:00,US,357827,"Road-150 Red, 62",BK-R93R-62,H
5713,2011.12.31 0:00,US,357827,"Road-150 Red, 48",BK-R93R-48,H
5714,2011.12.31 0:00,US,357827,"Road-150 Red, 52",BK-R93R-52,H


In [4]:
# Cast the "OrderDate" column to datetime64[ns]; no explicit date format is given,
# so the conversion relies on pandas' own parsing of the stored values
df = df.astype({'OrderDate': 'datetime64[ns]'})
df

Unnamed: 0,OrderDate,Country,LineTotal,ProductName,ProductNumber,Class
0,2011-05-31,AU,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
1,2011-05-31,CA,357827,"Road-150 Red, 62",BK-R93R-62,H
2,2011-05-31,FR,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
3,2011-05-31,US,339999,"Mountain-100 Silver, 44",BK-M82S-44,H
4,2011-05-31,DE,2024994,"Mountain-100 Black, 42",BK-M82B-42,H
...,...,...,...,...,...,...
5711,2011-12-31,US,357827,"Road-150 Red, 56",BK-R93R-56,H
5712,2011-12-31,US,357827,"Road-150 Red, 62",BK-R93R-62,H
5713,2011-12-31,US,357827,"Road-150 Red, 48",BK-R93R-48,H
5714,2011-12-31,US,357827,"Road-150 Red, 52",BK-R93R-52,H


In [5]:
# Turn the text values of the "LineTotal" column into floats:
#   1. swap the decimal comma for a decimal point,
#   2. parse to numbers (values that cannot be parsed become NaN),
#   3. cast the result to float.
df['LineTotal'] = (
    pd.to_numeric(df['LineTotal'].str.replace(',', '.'), errors = 'coerce')
      .astype(float)
)

df

Unnamed: 0,OrderDate,Country,LineTotal,ProductName,ProductNumber,Class
0,2011-05-31,AU,3399.9900,"Mountain-100 Silver, 44",BK-M82S-44,H
1,2011-05-31,CA,3578.2700,"Road-150 Red, 62",BK-R93R-62,H
2,2011-05-31,FR,3399.9900,"Mountain-100 Silver, 44",BK-M82S-44,H
3,2011-05-31,US,3399.9900,"Mountain-100 Silver, 44",BK-M82S-44,H
4,2011-05-31,DE,2024.9940,"Mountain-100 Black, 42",BK-M82B-42,H
...,...,...,...,...,...,...
5711,2011-12-31,US,3578.2700,"Road-150 Red, 56",BK-R93R-56,H
5712,2011-12-31,US,3578.2700,"Road-150 Red, 62",BK-R93R-62,H
5713,2011-12-31,US,3578.2700,"Road-150 Red, 48",BK-R93R-48,H
5714,2011-12-31,US,3578.2700,"Road-150 Red, 52",BK-R93R-52,H


In [6]:
# Split "ProductName" at its last ', ' into the product name and a new "Size" column.
# rsplit with n = 1 always yields at most two parts, so a name that contains more than
# one ', ' no longer produces extra columns (which made the two-column assignment raise
# a ValueError). For names with a single ', ' the result is the same as a plain split.
df[['ProductName', 'Size']] = df['ProductName'].str.rsplit(', ', n = 1, expand = True)

# Where the part after the comma contains a colour name ('Black', 'Red' or 'Blue'), replace that colour with '-'
df['Size'] = df['Size'].str.replace(r'(Black|Red|Blue)', '-', regex = True)
df

Unnamed: 0,OrderDate,Country,LineTotal,ProductName,ProductNumber,Class,Size
0,2011-05-31,AU,3399.9900,Mountain-100 Silver,BK-M82S-44,H,44
1,2011-05-31,CA,3578.2700,Road-150 Red,BK-R93R-62,H,62
2,2011-05-31,FR,3399.9900,Mountain-100 Silver,BK-M82S-44,H,44
3,2011-05-31,US,3399.9900,Mountain-100 Silver,BK-M82S-44,H,44
4,2011-05-31,DE,2024.9940,Mountain-100 Black,BK-M82B-42,H,42
...,...,...,...,...,...,...,...
5711,2011-12-31,US,3578.2700,Road-150 Red,BK-R93R-56,H,56
5712,2011-12-31,US,3578.2700,Road-150 Red,BK-R93R-62,H,62
5713,2011-12-31,US,3578.2700,Road-150 Red,BK-R93R-48,H,48
5714,2011-12-31,US,3578.2700,Road-150 Red,BK-R93R-52,H,52


In [7]:
# Status message: marks the end of the cleaning section
print('Data cleaning done.')

Data cleaning done.


## 04 Create dimension table

In [8]:
# Build the "df_sizes" dimension table from the distinct values of the "Size" column.
# Note: "df" itself is re-bound to the Size-sorted frame by this step.
df = df.sort_values(by = 'Size')

# keep only the "Size" column, one row per distinct value (first occurrence in sorted order),
# and renumber the index from 0
df_sizes = df[ ['Size'] ].drop_duplicates().reset_index(drop = True)

# 1-based ID derived from the fresh index, placed as the first column
df_sizes.insert(0, 'SizeID', df_sizes.index + 1)

df_sizes

Unnamed: 0,SizeID,Size
0,1,-
1,2,38
2,3,42
3,4,44
4,5,46
5,6,48
6,7,52
7,8,56
8,9,58
9,10,60


In [9]:
# Status message: marks the end of the dimension-table section
print('Dimension table has been created.')

Dimension table has been created.


## 05 Export to a cleaned output file

In [10]:
import os # needed for joining the path and creating the folder below

# Order the rows by "OrderDate", then "Country", and renumber the index from 0
df = df.sort_values(by = ['OrderDate', 'Country']).reset_index(drop = True)

# Output location (relative to basepath) and the output file name derived from the input file name
outputfolder = r'Output\2011'
outputfile = inputfile.replace('.csv', '_cleaned.csv')

# Make sure the output folder exists; an already existing folder is not an error
os.makedirs(os.path.join(basepath, outputfolder), exist_ok = True)

df_export = df # the frame handed to the CSV export

In [11]:
# Write the dataframe to a semicolon-separated, UTF-8 encoded CSV file without the index column.
# The file path is assembled with os.path.join (instead of hard-coded '\\' separators), the same
# way the output folder is created, so the file always lands inside that folder.
import os

df_export.to_csv(os.path.join(basepath, outputfolder, outputfile),
                 index = False, sep = ';', encoding = 'utf-8')

print('Exported to CSV!')

Exported to CSV!


In [12]:
# Final status message: the notebook has run to the end
print('Program ended. A cleaned and formatted new csv file has been given.')

Program ended. A cleaned and formatted new csv file has been given.
