In [2]:
# Load the diamonds dataset (diamonds_ece219.csv) into a DataFrame

import pandas as pd

# NOTE(review): the column listing printed further down includes an
# 'Unnamed: 0' column, which looks like an index saved by an earlier export —
# confirm, and consider reading with index_col=0 if it is not a real feature.
df = pd.read_csv(filepath_or_buffer='diamonds_ece219.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,color,clarity,carat,cut,symmetry,polish,depth_percent,table_percent,length,width,depth,girdle_min,girdle_max,price
0,0,E,VVS2,0.09,Excellent,Very Good,Very Good,62.7,59.0,2.85,2.87,1.79,M,M,200
1,1,E,VVS2,0.09,Very Good,Very Good,Very Good,61.9,59.0,2.84,2.89,1.78,STK,STK,200
2,2,E,VVS2,0.09,Excellent,Very Good,Very Good,61.1,59.0,2.88,2.9,1.77,TN,M,200
3,3,E,VVS2,0.09,Excellent,Very Good,Very Good,62.0,59.0,2.86,2.88,1.78,M,STK,200
4,4,E,VVS2,0.09,Very Good,Very Good,Excellent,64.9,58.5,2.79,2.83,1.82,STK,STK,200


In [3]:
# Print the column index of `df` to see which fields were loaded from the CSV
print(df.columns)

Index(['Unnamed: 0', 'color', 'clarity', 'carat', 'cut', 'symmetry', 'polish',
       'depth_percent', 'table_percent', 'length', 'width', 'depth',
       'girdle_min', 'girdle_max', 'price'],
      dtype='object')


## Feature Engineering

### Handling Categorical Features

In [4]:
# Collect the object-dtype columns of `df`; these are treated as the
# categorical variables in the cells that follow.
categorical = df.select_dtypes(include='object')

# Plain-text preview of the first rows of the categorical subset
categorical_preview = categorical.head()
print(categorical_preview)


  color clarity        cut   symmetry     polish girdle_min girdle_max
0     E    VVS2  Excellent  Very Good  Very Good          M          M
1     E    VVS2  Very Good  Very Good  Very Good        STK        STK
2     E    VVS2  Excellent  Very Good  Very Good         TN          M
3     E    VVS2  Excellent  Very Good  Very Good          M        STK
4     E    VVS2  Very Good  Very Good  Excellent        STK        STK


In [8]:
# Inspect every categorical column and record which encoding to use for it


def recommend_encoding(name, n_unique):
    """Return the encoding recommendation text for one categorical column.

    Args:
        name: Column name; known diamond attributes get a fixed recommendation.
        n_unique: Number of distinct values in the column; only consulted for
            columns that are not matched by name.

    Returns:
        A human-readable recommendation string.
    """
    # Diamond color grades are ordered from D (best) to Z (worst)
    if name == 'color':
        return "Ordinal encoding - natural ordering from D (colorless) to Z (yellow)"
    # Clarity grades are ordered as well
    if name == 'clarity':
        return "Ordinal encoding - natural ordering from highest clarity to lowest"
    # Quality grades shared by cut, symmetry and polish
    if name in ('cut', 'symmetry', 'polish'):
        return "Ordinal encoding - natural ordering from highest to lowest quality"
    # Girdle columns are treated as unordered categories here
    if 'girdle' in name:
        return "One-hot encoding - multiple categories without simple linear relationship"
    # Fallback for any other column: decide on cardinality alone
    if n_unique <= 10:
        return "One-hot encoding - low cardinality"
    return "Consider ordinal encoding or dimension reduction techniques"


encoding_recommendations = {}

for col_name, col_values in categorical.items():
    distinct = col_values.unique()
    n_distinct = len(distinct)

    # Report the column and its distinct values
    print(f"\n{col_name}:")
    print(f"  - Unique values ({n_distinct}): {sorted(distinct)}")

    # Store and report the recommendation for this column
    advice = recommend_encoding(col_name, n_distinct)
    encoding_recommendations[col_name] = advice
    print(f"  - Recommendation: {advice}")


color:
  - Unique values (10): ['D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']
  - Recommendation: Ordinal encoding - natural ordering from D (colorless) to Z (yellow)

clarity:
  - Unique values (10): ['I1', 'I2', 'I3', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']
  - Recommendation: Ordinal encoding - natural ordering from highest clarity to lowest

cut:
  - Unique values (2): ['Excellent', 'Very Good']
  - Recommendation: Ordinal encoding - natural ordering from highest to lowest quality

symmetry:
  - Unique values (2): ['Excellent', 'Very Good']
  - Recommendation: Ordinal encoding - natural ordering from highest to lowest quality

polish:
  - Unique values (2): ['Excellent', 'Very Good']
  - Recommendation: Ordinal encoding - natural ordering from highest to lowest quality

girdle_min:
  - Unique values (10): ['M', 'STK', 'STN', 'TK', 'TN', 'VTK', 'VTN', 'XTK', 'XTN', 'unknown']
  - Recommendation: One-hot encoding - multiple categories without simple linear relationship

In [9]:
# Apply encoding recommendations to categorical features


def ordinal_encode(values, ordered_levels):
    """Map each label in `values` to its position in `ordered_levels`.

    Args:
        values: pandas Series of category labels.
        ordered_levels: Sequence of labels; a label's index in this sequence
            becomes its code (the first entry maps to 0).

    Returns:
        A Series of codes aligned with `values`. Entries that are missing in
        `values` remain missing in the result.

    Raises:
        ValueError: If `values` contains a non-missing label that is absent
            from `ordered_levels`. A plain `Series.map` would turn such labels
            into NaN without any warning.
    """
    codes = values.map({level: rank for rank, level in enumerate(ordered_levels)})
    # Labels that were present in the input but have no code after mapping
    unmapped = values[codes.isna() & values.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {values.name!r} has labels with no ordinal rank: {list(unmapped)}"
        )
    return codes


# 1. Ordinal encoding for features with natural ordering
# Color: D (colorless/best) to Z (yellow/worst)
color_order = ['D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
# Clarity: highest clarity to lowest
clarity_order = ['FL', 'IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1', 'I2', 'I3']
# Cut, Polish, Symmetry: highest to lowest quality
quality_order = ['Excellent', 'Very Good', 'Good', 'Fair', 'Poor']

# The encoded columns are added to `df` itself (as before), so both `df` and
# `df_encoded` carry them.
df['color_encoded'] = ordinal_encode(df['color'], color_order)
df['clarity_encoded'] = ordinal_encode(df['clarity'], clarity_order)
# cut, polish and symmetry share one grading scale; the loop order fixes the
# order in which the new columns are appended.
for quality_column in ('cut', 'polish', 'symmetry'):
    df[f'{quality_column}_encoded'] = ordinal_encode(df[quality_column], quality_order)

# 2. One-hot encoding for girdle_min and girdle_max
girdle_min_encoded = pd.get_dummies(df['girdle_min'], prefix='girdle_min')
girdle_max_encoded = pd.get_dummies(df['girdle_max'], prefix='girdle_max')

# Concatenate encoded features with original dataframe (original categorical
# columns are kept alongside their encodings)
df_encoded = pd.concat([df, girdle_min_encoded, girdle_max_encoded], axis=1)

# Show the first few rows of the encoded dataframe
df_encoded.head()

Unnamed: 0.1,Unnamed: 0,color,clarity,carat,cut,symmetry,polish,depth_percent,table_percent,length,...,girdle_max_M,girdle_max_STK,girdle_max_STN,girdle_max_TK,girdle_max_TN,girdle_max_VTK,girdle_max_VTN,girdle_max_XTK,girdle_max_XTN,girdle_max_unknown
0,0,E,VVS2,0.09,Excellent,Very Good,Very Good,62.7,59.0,2.85,...,True,False,False,False,False,False,False,False,False,False
1,1,E,VVS2,0.09,Very Good,Very Good,Very Good,61.9,59.0,2.84,...,False,True,False,False,False,False,False,False,False,False
2,2,E,VVS2,0.09,Excellent,Very Good,Very Good,61.1,59.0,2.88,...,True,False,False,False,False,False,False,False,False,False
3,3,E,VVS2,0.09,Excellent,Very Good,Very Good,62.0,59.0,2.86,...,False,True,False,False,False,False,False,False,False,False
4,4,E,VVS2,0.09,Very Good,Very Good,Excellent,64.9,58.5,2.79,...,False,True,False,False,False,False,False,False,False,False


In [10]:
# Write the encoded dataframe to diamonds_encoded.csv, leaving out the row index
df_encoded.to_csv(path_or_buf='diamonds_encoded.csv', index=False)

In [1]:
# NOTE(review): scratch cell that only prints a fixed string; its execution
# count (In [1]) is out of sequence with the cells above, so it was presumably
# run after a kernel restart as a sanity check — consider deleting it.
print('test')

test
