In [19]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
# Load the two raw CSV files (paths are relative to the working directory) into DataFrames
dataset1, dataset2 = map(pd.read_csv, ('dataset1.csv', 'dataset2.csv'))

In [21]:
# Preview dataset1: display its first 5 rows
dataset1.head(n=5)

Unnamed: 0,name,price
0,William Dixon,109.03728
1,Kristen Horn,262.524652
2,Kimberly Chang,187.007258
3,Mary Ball,283.174648
4,Benjamin Craig,143.871582


In [22]:
# Preview dataset2: display its first 5 rows
dataset2.head(n=5)

Unnamed: 0,name,price
0,William Garcia,258.1809089
1,Barbara Freeman,141.890534
2,Rebecca Zimmerman,293.373272
3,Patricia Velasquez,249.9479246
4,Ronnie Clark,272.908659


In [23]:
# Report the (rows, columns) shape of each dataset
for label, frame in (("dataset1", dataset1), ("dataset2", dataset2)):
    print(f"Shape of {label} is {frame.shape}")

Shape of dataset1 is (5000, 2)
Shape of dataset2 is (5000, 2)


In [24]:
# Preprocessing 1: Split the name field into first_name, and last_name
# Only names made of exactly two whitespace-separated words are split: the first
# word goes to 'first name' and the second to 'last name'. Every other row
# (missing name, a single word, three or more words) is left untouched, which
# means NaN in both columns on a fresh run.


def split_name(frame: pd.DataFrame) -> pd.DataFrame:
    """Add 'first name' and 'last name' columns to ``frame`` in place.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a string column called 'name'.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, now carrying the two columns.
    """
    # Tokenise once and reuse the result instead of re-splitting for every assignment.
    words = frame['name'].str.split()
    is_two_words = words.str.len() == 2
    # .loc aligns the right-hand side on the index, so only the masked rows get a value.
    frame.loc[is_two_words, 'first name'] = words.str[0]
    frame.loc[is_two_words, 'last name'] = words.str[-1]
    return frame


# For dataset 1
split_name(dataset1)
print(f"After spliting name into first and last name, dataset1: \n\n {dataset1.head()}")

# For dataset 2
split_name(dataset2)
# The label names dataset2 here (it used to say dataset1 although dataset2 is printed).
print(f"\n\nAfter spliting name into first and last name, dataset2: \n\n {dataset2.head()}")

After spliting name into first and last name, dataset1: 

              name       price first name last name
0   William Dixon  109.037280    William     Dixon
1    Kristen Horn  262.524652    Kristen      Horn
2  Kimberly Chang  187.007258   Kimberly     Chang
3       Mary Ball  283.174648       Mary      Ball
4  Benjamin Craig  143.871582   Benjamin     Craig


After spliting name into first and last name, dataset1: 

                  name        price first name  last name
0      William Garcia  258.1809089    William     Garcia
1     Barbara Freeman   141.890534    Barbara    Freeman
2   Rebecca Zimmerman   293.373272    Rebecca  Zimmerman
3  Patricia Velasquez  249.9479246   Patricia  Velasquez
4        Ronnie Clark   272.908659     Ronnie      Clark


In [25]:
# Inspect the dtype of every column currently in dataset1
dataset1.dtypes

name           object
price         float64
first name     object
last name      object
dtype: object

In [26]:
# Inspect the dtype of every column currently in dataset2.
# NOTE(review): the recorded output lists 'price' as object here, so the column has to be
# converted to a numeric dtype before any numeric comparison — confirm on re-run.
dataset2.dtypes

name          object
price         object
first name    object
last name     object
dtype: object

In [27]:
# Preprocessing 2: Remove any zeros prepended to the price field

# Work on the textual form of the price so padded values such as "007.50" can be cleaned.
# Only zeros that are directly followed by another digit are removed: this keeps the
# integer-part zero of "0.5" and a lone "0" intact, whereas a plain lstrip('0') would
# turn them into ".5" and "" (an empty string cannot be converted back to a float).
LEADING_ZEROS = r'^0+(?=\d)'

dataset1["price"] = dataset1["price"].astype('str').str.replace(LEADING_ZEROS, '', regex=True)
dataset2["price"] = dataset2["price"].astype('str').str.replace(LEADING_ZEROS, '', regex=True)

In [28]:
# Preprocessing 3: Delete any rows which do not have a name
# A row is discarded when its 'name' value is null; all other rows are kept unchanged.

dataset1, dataset2 = (
    frame.dropna(axis=0, subset=['name'])
    for frame in (dataset1, dataset2)
)

In [29]:
# Preprocessing 4: Create a new field named above_100, which is true if the price is strictly greater than 100
# The comparison needs numeric prices, so the column is cast to float64 first. The
# comparison itself already yields a boolean Series, so no np.where wrapper is needed.

PRICE_THRESHOLD = 100.0

print("Checking dtypes of Price column in both dataset: \n\n")
print(f"dataset1 dtypes:\n {dataset1.dtypes}\n\n")
print(f"dataset2 dtypes: \n{dataset2.dtypes}\n\n")

print("Changing dtype of dataset1['price'] and dataset2['price'] to float64\n\n")
# astype raises ValueError if any price cannot be parsed as a number.
dataset1["price"] = dataset1["price"].astype("float64")
dataset2["price"] = dataset2["price"].astype("float64")

# Strictly greater than: a price of exactly 100 gives False, and so does NaN.
dataset1['above_100'] = dataset1['price'] > PRICE_THRESHOLD
dataset2['above_100'] = dataset2['price'] > PRICE_THRESHOLD

print("conversion finished")

Checking dtypes of Price column in both dataset: 


dataset1 dtypes:
 name          object
price         object
first name    object
last name     object
dtype: object


dataset2 dtypes: 
name          object
price         object
first name    object
last name     object
dtype: object


Changing dtype of dataset1['price'] and dataset2['price'] to float64


conversion finished


In [30]:
# Show the first 10 rows of each processed dataset

preview1 = dataset1.head(10)
preview2 = dataset2.head(10)

print(f"dataset1:\n\n {preview1}")
print(f"\ndataset2:\n\n {preview2}")

dataset1:

                 name       price first name last name  above_100
0      William Dixon  109.037280    William     Dixon       True
1       Kristen Horn  262.524652    Kristen      Horn       True
2     Kimberly Chang  187.007258   Kimberly     Chang       True
3          Mary Ball  283.174648       Mary      Ball       True
4     Benjamin Craig  143.871582   Benjamin     Craig       True
5       Cathy Werner   61.508991      Cathy    Werner      False
6       Brandon Bell   48.637309    Brandon      Bell      False
7        Paul Farley   12.389465       Paul    Farley      False
8     Sarah Mcdaniel  151.595447      Sarah  Mcdaniel       True
9  Caroline Anderson   75.621521   Caroline  Anderson      False

dataset2:

                  name       price first name  last name  above_100
0      William Garcia  258.180909    William     Garcia       True
1     Barbara Freeman  141.890534    Barbara    Freeman       True
2   Rebecca Zimmerman  293.373272    Rebecca  Zimmerman    

In [31]:
# Persist both processed datasets as CSV files; index=False leaves the row index out of the files
for frame, out_path in ((dataset1, "processed_dataset1.csv"), (dataset2, "processed_dataset2.csv")):
    frame.to_csv(out_path, index=False)