# WELDDB DATASET - PRINCIPAL COMPONENT ANALYSIS

This notebook performs a principal component analysis (PCA) of the welddb dataset to understand feature importance and to identify dimensionality-reduction opportunities.

## 1. Import Libraries and Setup

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Plot defaults: seaborn "whitegrid" theme and a 12x8 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

## 2. Define Column Names and Load Data

In [None]:
# Define column names based on welddb.info
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Column labels for the data file, assembled group by group.
# Order matters: later cells slice the first 30 entries out as features.
column_names = (
    # Composition columns given in weight percent
    ['carbon_wt_pct', 'silicon_wt_pct', 'manganese_wt_pct', 'sulphur_wt_pct',
     'phosphorus_wt_pct', 'nickel_wt_pct', 'chromium_wt_pct', 'molybdenum_wt_pct',
     'vanadium_wt_pct', 'copper_wt_pct', 'cobalt_wt_pct', 'tungsten_wt_pct']
    # Composition columns given in parts per million
    + ['oxygen_ppm', 'titanium_ppm', 'nitrogen_ppm', 'aluminium_ppm', 'boron_ppm',
       'niobium_ppm', 'tin_ppm', 'arsenic_ppm', 'antimony_ppm']
    # Welding parameters (the remaining feature columns)
    + ['current_a', 'voltage_v', 'ac_dc', 'electrode_polarity',
       'heat_input_kj_per_mm', 'interpass_temp_c', 'weld_type',
       'pwht_temp_c', 'pwht_time_h']
    # Mechanical properties
    + ['yield_strength_mpa', 'ultimate_tensile_strength_mpa',
       'elongation_pct', 'reduction_area_pct']
    # Impact toughness / hardness measurements
    + ['charpy_temp_c', 'charpy_toughness_j', 'hardness_kg_per_mm2', 'fatt50_pct']
    # Microstructure fractions
    + ['primary_ferrite_pct', 'ferrite_second_phase_pct', 'acicular_ferrite_pct',
       'martensite_pct', 'ferrite_carbide_aggregate_pct']
    # Weld identifier
    + ['weld_id']
)

# Load the data: whitespace-delimited, no header row (labels come from
# column_names), and a bare 'N' entry is read as a missing value
print("Loading data from welddb/welddb.data...")
data = pd.read_csv(
    'welddb/welddb.data',
    sep=r'\s+',
    header=None,
    names=column_names,
    na_values='N',
)
print(f"Dataset shape: {data.shape}")
print(f"Number of samples: {data.shape[0]}")
# The column count covers the target properties and the weld ID as well as the
# model inputs, so it is reported as columns rather than features
print(f"Number of columns: {data.shape[1]}")

Loading data from welddb/welddb.data...
Dataset shape: (1652, 44)
Number of samples: 1652
Number of features: 44


## 3. Data Cleaning and Preprocessing

In [15]:
# Handle non-numeric values (e.g., '<0.002' should be treated as a small number)
def clean_numeric(val):
    """Convert censored readings such as '<0.002' or '>10' to numeric estimates.

    Parameters
    ----------
    val : object
        A single cell value: a number, a string, or a missing value.

    Returns
    -------
    object
        * ``np.nan`` when ``val`` is missing, or when it is a '<' / '>' string
          whose remainder cannot be parsed as a number.
        * Half the stated limit for '<limit' strings.
        * 1.5 times the stated limit for '>limit' strings.
        * ``val`` unchanged in every other case, so the caller can still
          coerce leftovers (e.g. with ``pd.to_numeric(..., errors='coerce')``).
    """
    if pd.isna(val):
        return np.nan
    if isinstance(val, str):
        # Tolerate padding around the token so ' <0.002' is still recognised
        text = val.strip()
        # Substitution factor applied to the reported limit, keyed by prefix
        factor = {'<': 0.5, '>': 1.5}.get(text[:1])
        if factor is not None:
            try:
                return float(text[1:]) * factor
            except ValueError:
                # A malformed limit (e.g. a bare '<') must not abort a whole
                # column-wise .apply(); treat it as missing instead
                return np.nan
    return val

# Split the columns into model inputs (features) and measured outcomes (targets)
print("Identifying feature columns and potential target columns...")

# Categorical inputs; these are one-hot encoded further down
categorical_features = ['ac_dc', 'electrode_polarity', 'weld_type']

# The first 30 columns hold chemical composition and welding parameters;
# whatever in that range is not categorical is treated as a numeric feature
numeric_feature_cols = [
    name for name in column_names[:30] if name not in categorical_features
]

# Candidate target variables, grouped by the kind of property they describe
target_cols = {
    'Mechanical Properties': [
        'yield_strength_mpa', 'ultimate_tensile_strength_mpa',
        'elongation_pct', 'reduction_area_pct',
    ],
    'Impact Toughness': [
        'charpy_temp_c', 'charpy_toughness_j',
        'hardness_kg_per_mm2', 'fatt50_pct',
    ],
    'Microstructure': [
        'primary_ferrite_pct', 'ferrite_second_phase_pct',
        'acicular_ferrite_pct', 'martensite_pct',
        'ferrite_carbide_aggregate_pct',
    ],
}

# Flat list of every target column, in group order
all_target_cols = [name for group in target_cols.values() for name in group]

# Clean each numeric column (features and targets): resolve '<x' / '>x'
# entries first, then coerce anything that is still non-numeric to NaN
for col in numeric_feature_cols + all_target_cols:
    if col not in data.columns:
        continue
    data[col] = pd.to_numeric(data[col].apply(clean_numeric), errors='coerce')

# One-hot encode the categorical features. The first level of each feature is
# dropped so the dummies of one feature are not perfectly collinear.
# NOTE(review): get_dummies is called without dummy_na, so a row whose category
# is missing gets all-zero dummies, the same encoding as the dropped reference
# level. Confirm this is intended.
print("\nConverting categorical features to dummy variables...")
categorical_dummies = []
for cat_col in (name for name in categorical_features if name in data.columns):
    dummies = pd.get_dummies(data[cat_col], prefix=cat_col, drop_first=True, dtype=float)
    categorical_dummies.append(dummies)
    print(f"  {cat_col}: {dummies.shape[1]} dummy variable(s) created")

# Stack the per-feature dummies side by side (empty frame if there are none)
categorical_df = (
    pd.concat(categorical_dummies, axis=1) if categorical_dummies else pd.DataFrame()
)
dummy_feature_names = categorical_df.columns.tolist()

# Report what was selected
print(f"\nNumeric features ({len(numeric_feature_cols)}):")
print(f"{numeric_feature_cols}")
print(f"\nCategorical dummy features ({len(dummy_feature_names)}):")
print(f"{dummy_feature_names}")
print("\nPotential target variables:")
for category, targets in target_cols.items():
    print(f"  - {category}: {targets}")

Identifying feature columns and potential target columns...

Converting categorical features to dummy variables...
  ac_dc: 1 dummy variable(s) created
  electrode_polarity: 2 dummy variable(s) created
  weld_type: 9 dummy variable(s) created

Numeric features (27):
['carbon_wt_pct', 'silicon_wt_pct', 'manganese_wt_pct', 'sulphur_wt_pct', 'phosphorus_wt_pct', 'nickel_wt_pct', 'chromium_wt_pct', 'molybdenum_wt_pct', 'vanadium_wt_pct', 'copper_wt_pct', 'cobalt_wt_pct', 'tungsten_wt_pct', 'oxygen_ppm', 'titanium_ppm', 'nitrogen_ppm', 'aluminium_ppm', 'boron_ppm', 'niobium_ppm', 'tin_ppm', 'arsenic_ppm', 'antimony_ppm', 'current_a', 'voltage_v', 'heat_input_kj_per_mm', 'interpass_temp_c', 'pwht_temp_c', 'pwht_time_h']

Categorical dummy features (12):
['ac_dc_DC', 'electrode_polarity_-', 'electrode_polarity_0', 'weld_type_GMAA', 'weld_type_GTAA', 'weld_type_MMA', 'weld_type_NGGMA', 'weld_type_NGSAW', 'weld_type_SA', 'weld_type_SAA', 'weld_type_ShMA', 'weld_type_TSA']

Potential target vari

## 4. Data Quality Analysis

In [None]:
print("Analyzing data quality...")

# Work on a copy of the numeric feature columns
X_numeric = data[numeric_feature_cols].copy()

# Count and percentage of missing entries per numeric feature
print("\nMissing values per numeric feature:")
n_rows = len(X_numeric)
missing_counts = X_numeric.isna().sum()
missing_pct = (missing_counts / n_rows * 100).round(2)

# Tabulate only the features that have gaps, highest missing share first
missing_df = pd.DataFrame({
    'Feature': missing_counts.index,
    'Missing Count': missing_counts.to_numpy(),
    'Missing %': missing_pct.to_numpy(),
})
has_gaps = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing %', ascending=False)
print(missing_df.to_string(index=False))

# Missing share and observed levels of each categorical feature
print("\nCategorical features analysis:")
for cat_col in categorical_features:
    if cat_col not in data.columns:
        continue
    column = data[cat_col]
    n_missing = column.isna().sum()
    pct_missing = n_missing / len(data) * 100
    unique_vals = column.dropna().unique()
    print(f"  {cat_col}: {n_missing} missing ({pct_missing:.2f}%), {len(unique_vals)} unique values: {list(unique_vals)}")

Analyzing data quality...

Missing values per numeric feature:
          Feature  Missing Count  Missing %
  tungsten_wt_pct           1577      95.46
    cobalt_wt_pct           1523      92.19
      arsenic_ppm           1418      85.84
     antimony_ppm           1392      84.26
          tin_ppm           1356      82.08
        boron_ppm           1148      69.49
    copper_wt_pct           1074      65.01
    nickel_wt_pct            955      57.81
      niobium_ppm            900      54.48
  chromium_wt_pct            868      52.54
molybdenum_wt_pct            859      52.00
    aluminium_ppm            747      45.22
  vanadium_wt_pct            724      43.83
     titanium_ppm            717      43.40
     nitrogen_ppm            469      28.39
       oxygen_ppm            396      23.97
        current_a            248      15.01
        voltage_v            248      15.01
 interpass_temp_c             38       2.30
      pwht_temp_c             13       0.79
      pwht_ti

In [17]:
# Drop numeric features whose missing share (in percent) exceeds the cut-off
threshold = 50
features_to_keep = [name for name, pct in missing_pct.items() if pct <= threshold]
kept_lookup = set(features_to_keep)
features_removed = [name for name in numeric_feature_cols if name not in kept_lookup]
print(f"Removing {len(features_removed)} features with >{threshold}% missing values:")
print(f"{features_removed}")

# Keep only the surviving columns and summarise the resulting feature count
X_numeric = X_numeric[features_to_keep]
n_numeric_kept = len(features_to_keep)
n_dummy = len(dummy_feature_names)
print(f"\nRemaining numeric features: {n_numeric_kept}")
print(f"Categorical dummy features: {n_dummy}")
print(f"Total features for PCA: {n_numeric_kept + n_dummy}")

Removing 11 features with >50% missing values:
['nickel_wt_pct', 'chromium_wt_pct', 'molybdenum_wt_pct', 'copper_wt_pct', 'cobalt_wt_pct', 'tungsten_wt_pct', 'boron_ppm', 'niobium_ppm', 'tin_ppm', 'arsenic_ppm', 'antimony_ppm']

Remaining numeric features: 16
Categorical dummy features: 12
Total features for PCA: 28


## 5. Imputation and Standardization

In [18]:
# Fill the remaining gaps in the numeric features with each column's median
print("Imputing missing values with median strategy...")
imputer = SimpleImputer(strategy='median')
X_numeric_imputed = pd.DataFrame(
    imputer.fit_transform(X_numeric),
    columns=features_to_keep,
    index=X_numeric.index,
)
n_remaining_missing = X_numeric_imputed.isnull().sum().sum()
print(f"Numeric imputation complete. No missing values: {n_remaining_missing == 0}")

# Append the dummy columns (if any) to the imputed numeric block
print("\nCombining numeric and categorical features...")
if not dummy_feature_names:
    X_imputed = X_numeric_imputed
    all_feature_names = features_to_keep
else:
    # Select the dummy rows by the numeric frame's index so both blocks line up
    categorical_df_aligned = categorical_df.loc[X_numeric_imputed.index]
    X_imputed = pd.concat([X_numeric_imputed, categorical_df_aligned], axis=1)
    all_feature_names = features_to_keep + dummy_feature_names

print(f"Combined feature matrix shape: {X_imputed.shape}")
print(f"Total features: {len(all_feature_names)}")

# Summary statistics for the first 20 columns, one feature per row
print("\nFeature statistics (first 20 features):")
first_20_stats = X_imputed.iloc[:, :20].describe().T
print(first_20_stats.round(4))

Imputing missing values with median strategy...
Numeric imputation complete. No missing values: True

Combining numeric and categorical features...
Combined feature matrix shape: (1652, 28)
Total features: 28

Feature statistics (first 20 features):
                       count      mean       std      min       25%      50%  \
carbon_wt_pct         1652.0    0.0755    0.0239    0.029    0.0618    0.074   
silicon_wt_pct        1652.0    0.3286    0.1125    0.040    0.2700    0.320   
manganese_wt_pct      1652.0    1.2028    0.3821    0.270    0.9400    1.270   
sulphur_wt_pct        1652.0    0.0095    0.0112    0.001    0.0060    0.007   
phosphorus_wt_pct     1652.0    0.0129    0.0196    0.002    0.0070    0.010   
vanadium_wt_pct       1652.0    0.0431    0.1941    0.000    0.0024    0.005   
oxygen_ppm            1652.0  437.4207  128.8406  132.000  399.0000  423.000   
titanium_ppm          1652.0   62.3042   75.2513    0.000   38.0000   42.000   
nitrogen_ppm          1652.0  

In [19]:
# Put every feature on a common scale before PCA
print("Standardizing features (zero mean, unit variance)...")
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
print("Standardization complete.")

Standardizing features (zero mean, unit variance)...
Standardization complete.


## 6. PCA

In [20]:
# Fit a PCA with all components retained and project the scaled data
print("Performing Principal Component Analysis...")
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Share of variance per component and its running total
explained_var = pca.explained_variance_ratio_
cumulative_var = explained_var.cumsum()

# Summary table limited to the first ten components (fewer if fewer exist)
print("\nExplained variance by principal components:")
n_shown = min(10, len(explained_var))
shown_var = explained_var[:n_shown]
pca_summary = pd.DataFrame({
    'PC': [f'PC{k}' for k in range(1, n_shown + 1)],
    'Explained Variance': shown_var,
    'Explained Variance %': (shown_var * 100).round(2),
    'Cumulative %': (cumulative_var[:n_shown] * 100).round(2),
})
print(pca_summary.to_string(index=False))

# Smallest number of components whose cumulative share reaches 90% / 95%:
# argmax returns the index of the first True, hence the +1 to get a count
n_components_90, n_components_95 = (
    np.argmax(cumulative_var >= level) + 1 for level in (0.90, 0.95)
)
print(f"\nNumber of components explaining 90% variance: {n_components_90}")
print(f"Number of components explaining 95% variance: {n_components_95}")

Performing Principal Component Analysis...

Explained variance by principal components:
  PC  Explained Variance  Explained Variance %  Cumulative %
 PC1            0.156391                 15.64         15.64
 PC2            0.099832                  9.98         25.62
 PC3            0.078958                  7.90         33.52
 PC4            0.061636                  6.16         39.68
 PC5            0.052064                  5.21         44.89
 PC6            0.048151                  4.82         49.70
 PC7            0.042102                  4.21         53.91
 PC8            0.041455                  4.15         58.06
 PC9            0.038457                  3.85         61.90
PC10            0.037272                  3.73         65.63

Number of components explaining 90% variance: 18
Number of components explaining 95% variance: 21


## 7. Feature Contributions to Principal Components

In [21]:
# For each leading component, list the features with the largest weights
print("Feature contributions to principal components:")
n_top_components = 5
n_top_features = 10

for i in range(n_top_components):
    print(f"\nPrincipal Component {i+1} (explains {explained_var[i]*100:.2f}% variance):")
    # Signed weights of this component, one per feature
    loadings = pca.components_[i]
    # Rank features by the magnitude of their weight, keeping the sign alongside
    feature_importance = pd.DataFrame({
        'Feature': all_feature_names,
        'Loading': loadings,
        'Abs_Loading': np.abs(loadings)
    }).sort_values('Abs_Loading', ascending=False)

    print(f"Top {n_top_features} contributing features:")
    top_rows = feature_importance.head(n_top_features)
    for name, loading, magnitude in zip(
        top_rows['Feature'], top_rows['Loading'], top_rows['Abs_Loading']
    ):
        # The sign is printed separately from the magnitude
        sign = '+' if loading > 0 else '-'
        print(f"  {sign} {name:30s}: {magnitude:.4f}")

Feature contributions to principal components:

Principal Component 1 (explains 15.64% variance):
Top 10 contributing features:
  + current_a                     : 0.4502
  + heat_input_kj_per_mm          : 0.4262
  - weld_type_MMA                 : 0.3964
  + voltage_v                     : 0.3668
  + weld_type_SA                  : 0.3066
  + weld_type_TSA                 : 0.2741
  + carbon_wt_pct                 : 0.2048
  + sulphur_wt_pct                : 0.1756
  + phosphorus_wt_pct             : 0.1410
  + interpass_temp_c              : 0.1392

Principal Component 2 (explains 9.98% variance):
Top 10 contributing features:
  + pwht_temp_c                   : 0.4131
  - silicon_wt_pct                : 0.3512
  + pwht_time_h                   : 0.3310
  - titanium_ppm                  : 0.2995
  - manganese_wt_pct              : 0.2917
  + nitrogen_ppm                  : 0.2871
  - aluminium_ppm                 : 0.2808
  + oxygen_ppm                    : 0.2034
  - ac_dc_DC      

## 8. Target Variable Analysis

In [22]:
print("Analyzing potential target variables:")

# Width of the name column: at least 20 characters, widened to the longest
# target name so that long names no longer push their row out of alignment
name_width = max(
    [20] + [len(name) for group in target_cols.values() for name in group]
)

# For every target: number and share of rows with a value, plus mean and std
for category, targets in target_cols.items():
    print(f"\n{category}:")
    available_targets = [t for t in targets if t in data.columns]
    for target in available_targets:
        target_data = data[target]
        n_valid = target_data.notna().sum()
        pct_valid = (n_valid / len(data) * 100)
        if n_valid > 0:
            mean_val = target_data.mean()
            std_val = target_data.std()
            print(f"  {target:{name_width}s}: {n_valid:4d} samples ({pct_valid:5.1f}%), "
                  f"mean={mean_val:8.2f}, std={std_val:8.2f}")
        else:
            print(f"  {target:{name_width}s}: No data available")

Analyzing potential target variables:

Mechanical Properties:
  yield_strength_mpa  :  780 samples ( 47.2%), mean=  508.56, std=   92.87
  ultimate_tensile_strength_mpa:  738 samples ( 44.7%), mean=  594.39, std=   88.64
  elongation_pct      :  700 samples ( 42.4%), mean=   26.28, std=    4.90
  reduction_area_pct  :  705 samples ( 42.7%), mean=   71.80, std=    8.93

Impact Toughness:
  charpy_temp_c       :  879 samples ( 53.2%), mean=  -34.61, std=   34.74
  charpy_toughness_j  :  879 samples ( 53.2%), mean=   87.69, std=   50.12
  hardness_kg_per_mm2 :   80 samples (  4.8%), mean=  215.70, std=   24.66
  fatt50_pct          :   31 samples (  1.9%), mean=  -31.10, std=   43.64

Microstructure:
  primary_ferrite_pct :   98 samples (  5.9%), mean=   19.16, std=   11.00
  ferrite_second_phase_pct:   90 samples (  5.4%), mean=   25.96, std=   21.28
  acicular_ferrite_pct:   90 samples (  5.4%), mean=   52.83, std=   23.48
  martensite_pct      :   89 samples (  5.4%), mean=    0.34, st