# Semi-Supervised Learning

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) parts rather than the raw strings:
# string comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly
# pass. The regex also tolerates suffixes such as "0.20rc1" or "1.3.dev0".
assert tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
) >= (0, 20)

# Numpy arrays are used to store training and test data.
import numpy as np

# Pandas is used to manipulate tabular data.
import pandas as pd

# Seaborn is used to plot relevant representations of the data we're handling.
import seaborn as sns

# Matplotlib is used to plot graphs.
%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt
# Style options for plots.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998).
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
# Load the weld database from a CSV file; the path is relative to the
# notebook's working directory.
welddb_ssl = pd.read_csv('welddb/welddb_yuxian.csv')

In [3]:
welddb_ssl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Carbon concentration (weight%)             1652 non-null   float64
 1   Silicon concentration (weight%)            1652 non-null   float64
 2   Manganese concentration (weight%)          1652 non-null   float64
 3   Sulphur concentration (weight%)            1652 non-null   float64
 4   Phosphorus concentration (weight%)         1652 non-null   float64
 5   Nickel concentration (weight%)             1652 non-null   float64
 6   Chromium concentration (weight%)           1652 non-null   float64
 7   Molybdenum concentration (weight%)         1652 non-null   float64
 8   Vanadium concentration (weight%)           1652 non-null   float64
 9   Copper concentration (weight%)             1652 non-null   float64
 10  Cobalt concentration (we

In [4]:
welddb_ssl.shape

(1652, 44)

In [5]:
# Drop the targets more suitable for supervised learning.
# The columns are selected by position (indices 30 through 35), so this cell
# depends on the column order of the loaded CSV. Because `welddb_ssl` is
# reassigned, re-running the cell would drop six further columns.
columns_to_drop = welddb_ssl.columns[30:36]

welddb_ssl = welddb_ssl.drop(columns=columns_to_drop)

In [8]:
# Drop the 'Current (A)' and 'Voltage (V)' columns.
# NOTE(review): the columns are looked up by position (21 and 22) on the
# frame as it stands when this cell runs — confirm these positions map to
# the two columns named above.
welddb_ssl = welddb_ssl.drop(columns=welddb_ssl.columns[[21, 22]])

In [9]:
welddb_ssl.shape

(1652, 36)

In [10]:
welddb_ssl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 36 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Carbon concentration (weight%)             1652 non-null   float64
 1   Silicon concentration (weight%)            1652 non-null   float64
 2   Manganese concentration (weight%)          1652 non-null   float64
 3   Sulphur concentration (weight%)            1652 non-null   float64
 4   Phosphorus concentration (weight%)         1652 non-null   float64
 5   Nickel concentration (weight%)             1652 non-null   float64
 6   Chromium concentration (weight%)           1652 non-null   float64
 7   Molybdenum concentration (weight%)         1652 non-null   float64
 8   Vanadium concentration (weight%)           1652 non-null   float64
 9   Copper concentration (weight%)             1652 non-null   float64
 10  Cobalt concentration (we

## Hardness Prediction

In [11]:
def select_target(df, target_column, target_columns):
    """
    Return a copy of the dataframe in which only one target column remains.

    Every column listed in ``target_columns`` is removed, except the one
    named ``target_column``. All other columns are left untouched, and the
    input dataframe itself is not modified.

    Parameters:
    df (pd.DataFrame): The dataframe holding features and candidate targets.
    target_column (str): The target column to keep.
    target_columns (list): Names of all candidate target columns.

    Returns:
    pd.DataFrame: A new dataframe without the non-selected target columns.
    """
    # Collect the candidate targets that are not the one being studied.
    unwanted_targets = []
    for name in target_columns:
        if name != target_column:
            unwanted_targets.append(name)

    reduced = df.drop(columns=unwanted_targets)
    return reduced

# Every column that can serve as a prediction target. All of them except the
# selected `target_column` are removed by `select_target` below.
target_columns = [
    'Hardness (kgmm-2)', '50 % FATT', 'Primary ferrite in microstructure (%)',
    'Ferrite with second phase (%)', 'Acicular ferrite (%)',
    'Martensite (%)', 'Ferrite with carbide aggregate (%)'
]

# Target studied in this section.
target_column = 'Hardness (kgmm-2)'
welddb_ssl_selected = select_target(welddb_ssl, target_column, target_columns)


In [13]:
welddb_ssl_selected.shape

(1652, 30)

In [14]:
welddb_ssl_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 30 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Carbon concentration (weight%)             1652 non-null   float64
 1   Silicon concentration (weight%)            1652 non-null   float64
 2   Manganese concentration (weight%)          1652 non-null   float64
 3   Sulphur concentration (weight%)            1652 non-null   float64
 4   Phosphorus concentration (weight%)         1652 non-null   float64
 5   Nickel concentration (weight%)             1652 non-null   float64
 6   Chromium concentration (weight%)           1652 non-null   float64
 7   Molybdenum concentration (weight%)         1652 non-null   float64
 8   Vanadium concentration (weight%)           1652 non-null   float64
 9   Copper concentration (weight%)             1652 non-null   float64
 10  Cobalt concentration (we

### Remove the instances with missing feature values

In [18]:
def drop_missing_except_target(df, target_column):
    """
    Drop the rows that have a missing value in any non-target column.

    Missing values in the target column itself are ignored, so rows whose
    only NaN is in the target are kept. The input dataframe is not modified.

    Parameters:
    df (pd.DataFrame): The dataframe to clean.
    target_column (str): The column that is not checked for missing values.

    Returns:
    pd.DataFrame: A new dataframe holding only the rows whose non-target
    columns are all non-null. The original index labels are preserved.
    """
    # Only these columns are checked for NaN.
    checked_columns = [name for name in df.columns if name != target_column]

    return df.dropna(subset=checked_columns)

# Keep only the rows whose non-target columns are all present; rows with a
# missing target value are retained.
welddb_ssl_cleaned = drop_missing_except_target(welddb_ssl_selected, target_column)

In [20]:
welddb_ssl_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1307 entries, 0 to 1436
Data columns (total 30 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Carbon concentration (weight%)             1307 non-null   float64
 1   Silicon concentration (weight%)            1307 non-null   float64
 2   Manganese concentration (weight%)          1307 non-null   float64
 3   Sulphur concentration (weight%)            1307 non-null   float64
 4   Phosphorus concentration (weight%)         1307 non-null   float64
 5   Nickel concentration (weight%)             1307 non-null   float64
 6   Chromium concentration (weight%)           1307 non-null   float64
 7   Molybdenum concentration (weight%)         1307 non-null   float64
 8   Vanadium concentration (weight%)           1307 non-null   float64
 9   Copper concentration (weight%)             1307 non-null   float64
 10  Cobalt concentration (weight%

### Get dummy variables for the categorical features

In [21]:
import pandas as pd

def dummy_variables(df, target_column, categorical_columns):
    """
    Split a dataframe into a feature matrix X and a target vector y.

    The categorical columns are replaced by dummy (indicator) columns, built
    with ``drop_first=True``. The remaining non-target columns are cast to
    float64. The dummy columns are appended after the numeric ones.

    Parameters:
    df (pd.DataFrame): The input dataframe.
    target_column (str): The name of the target column.
    categorical_columns (list): Names of the categorical columns to encode.

    Returns:
    X (pd.DataFrame): Numeric features followed by the dummy columns.
    y (pd.Series): The target column, returned unchanged.
    """
    # Indicator columns for the categorical features.
    encoded = pd.get_dummies(df[categorical_columns], drop_first=True)

    y = df[target_column]

    # Everything that is neither categorical nor the target is a numeric feature.
    non_numeric = categorical_columns + [target_column]
    numeric = df.drop(columns=non_numeric).astype('float64')

    X = pd.concat([numeric, encoded], axis=1)
    return X, y

# Features that are passed to `dummy_variables` for dummy encoding.
categorical_columns = ['AC or DC', 'Electrode positive or negative', 'Type of weld']

# Build the feature matrix X and the target vector y from the cleaned data.
X, y = dummy_variables(welddb_ssl_cleaned, target_column, categorical_columns)


In [26]:
X.shape

(1307, 36)

In [28]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1307 entries, 0 to 1436
Data columns (total 36 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Carbon concentration (weight%)             1307 non-null   float64
 1   Silicon concentration (weight%)            1307 non-null   float64
 2   Manganese concentration (weight%)          1307 non-null   float64
 3   Sulphur concentration (weight%)            1307 non-null   float64
 4   Phosphorus concentration (weight%)         1307 non-null   float64
 5   Nickel concentration (weight%)             1307 non-null   float64
 6   Chromium concentration (weight%)           1307 non-null   float64
 7   Molybdenum concentration (weight%)         1307 non-null   float64
 8   Vanadium concentration (weight%)           1307 non-null   float64
 9   Copper concentration (weight%)             1307 non-null   float64
 10  Cobalt concentration (weight%

In [27]:
y.shape

(1307,)