In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's "diamonds" example dataset into a DataFrame
diamonds = sns.load_dataset("diamonds")

In [3]:
# Preview the first five rows to see the columns and some sample values
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Inspect every column's dtype before converting anything
diamonds.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

Chapter outcomes:
- Convert to string
- Convert to numeric
- Convert to category

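To convert data from one type to another, we use __astype()__

astype() is available on both a __Series__ and a whole __DataFrame__; in this chapter we call it on one __Series__ (a single column) at a time

Therefore, select __the column__ you want to convert and call astype() on it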

# 1. Convert to String

In [5]:
# Cast the category-typed color column to plain strings;
# pandas reports a string column as dtype 'object'
diamonds['color'] = diamonds['color'].astype('str')

In [6]:
# Confirm the change: color should now be reported as object
diamonds.dtypes

carat       float64
cut        category
color        object
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

As you can see, the color column is now __object type__

# 2. Convert to Numeric

- astype() can also be used to convert to numeric
- The problem arises when a column that stores numbers as __strings__ contains __missing values__ written as text that cannot be parsed as a number

Example

In [7]:
# Work on a copy so the experiments below leave `diamonds` untouched
diamonds_sub_missing = diamonds.copy()

In [8]:
# Store carat as strings so the column can hold a non-numeric placeholder
diamonds_sub_missing['carat'] = diamonds_sub_missing['carat'].astype('str')

In [9]:
# Double-check the dtypes of all columns: carat should now be object (string)
diamonds_sub_missing.dtypes

carat        object
cut        category
color        object
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [10]:
# Overwrite carat in a handful of rows (index labels 1, 4, 9, 13, 15) with the
# text 'missing' to simulate non-numeric placeholders in a numeric column
diamonds_sub_missing.loc[[1,4,9,13,15], 'carat'] = 'missing'

In [11]:
# Show the first 20 rows; the overwritten carat cells now read 'missing'
diamonds_sub_missing.head(20)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,missing,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,missing,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,missing,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [12]:
# astype() cannot parse the placeholder text 'missing' as a number, so the
# conversion raises ValueError. The error is the point of this cell, so catch
# it and print the message: the failure is still shown, but it no longer stops
# the remaining cells during "Restart & Run All".
try:
    diamonds_sub_missing['carat'].astype('float')
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: 'missing'

To solve this problem, we will use __pd.to_numeric(data_column, errors='coerce')__, which converts every value it cannot parse into NaN instead of raising an error

In [13]:
# Parse carat back to numbers; errors='coerce' replaces values that cannot be
# parsed (the 'missing' strings) with NaN instead of raising
diamonds_sub_missing['carat'] = pd.to_numeric(diamonds_sub_missing['carat'], errors='coerce')

In [14]:
# carat is float64 again now that pd.to_numeric has parsed it
diamonds_sub_missing.dtypes

carat       float64
cut        category
color        object
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [15]:
# The rows that held 'missing' now show NaN in the carat column
diamonds_sub_missing.head(20)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


# 3. Convert to Category

Convert data into category when:
- data takes on limited, and usually fixed, number of possible values 
    - examples are gender, social class, blood type, country affiliation, observation time or rating via Likert scales
- data might have an order (e.g. ‘strongly agree’ vs ‘agree’ or ‘first observation’ vs. ‘second observation’)
- data takes a lot of memory

__Converting to category is NOT SO IMPORTANT, so don't think too much about it__

In [16]:
# Convert color from string (object) back to the category dtype
diamonds['color'] = diamonds['color'].astype('category')

In [17]:
# Confirm the change: color should be category again
diamonds.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object