In [31]:
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np


In [23]:
# Locations of the raw table and of the file holding its column names
data_file_path = 'welddb/welddb.data'
headers_file_path = 'welddb/headers.txt'

# Each line of the headers file is one column name; trim surrounding whitespace
with open(headers_file_path, 'r') as f:
    headers = list(map(str.strip, f))

# The data file has no header row and its fields are separated by whitespace
df = pd.read_csv(
    data_file_path,
    sep=r'\s+',
    header=None,
    names=headers,
)

In [4]:
# Preview the freshly loaded table
df

Unnamed: 0,Carbon concentration (weight%),Silicon concentration (weight%),Manganese concentration (weight%),Sulphur concentration (weight%),Phosphorus concentration (weight%),Nickel concentration (weight%),Chromium concentration (weight%),Molybdenum concentration (weight%),Vanadium concentration (weight%),Copper concentration (weight%),...,Charpy temperature (Â°C),Charpy impact toughness (J),Hardness (kg/mmÂ²),50% FATT,Primary ferrite (%),Ferrite with second phase (%),Acicular ferrite (%),Martensite (%),Ferrite with carbide aggregate (%),Weld ID
0,0.037,0.30,0.65,0.008,0.012,0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aaw
1,0.037,0.30,0.65,0.008,0.012,0,N,N,N,N,...,-28,100,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aawch
2,0.037,0.30,0.65,0.008,0.012,0,N,N,N,N,...,-38,100,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aht
3,0.037,0.31,1.03,0.007,0.014,0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Baw
4,0.037,0.31,1.03,0.007,0.014,0,N,N,N,N,...,-48,100,N,N,32,28,40,0,0,Evans-Ni/CMn-1990/1991-0Bawch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,0.100,0.35,0.90,0.008,0.016,0.60,8.6,0.98,0.18,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX35
1648,0.088,0.36,0.88,0.008,0.017,0.57,8.4,0.94,0.19,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX36
1649,0.090,0.34,0.89,0.008,0.016,0.17,8.2,0.94,0.02,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX37
1650,0.092,0.35,0.90,0.008,0.016,0.54,8.4,0.97,0.17,N,...,N,N,N,N,N,N,N,N,N,Birmingham-MAX38


First, we will replace the 'N' placeholder values with NaN.

In [24]:
# 'N' marks a missing value in the raw table; swap every occurrence for NaN
df = df.replace(to_replace='N', value=np.nan)

Let's look at the variable types so that we can convert each variable to the right type.

In [25]:
# List every column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Carbon concentration (weight%)              1652 non-null   float64
 1   Silicon concentration (weight%)             1652 non-null   float64
 2   Manganese concentration (weight%)           1652 non-null   float64
 3   Sulphur concentration (weight%)             1648 non-null   object 
 4   Phosphorus concentration (weight%)          1642 non-null   object 
 5   Nickel concentration (weight%)              697 non-null    object 
 6   Chromium concentration (weight%)            784 non-null    object 
 7   Molybdenum concentration (weight%)          793 non-null    object 
 8   Vanadium concentration (weight%)            928 non-null    object 
 9   Copper concentration (weight%)              578 non-null    object 
 10  Cobalt conce

We should convert all the numerical variables to a numerical type, but first let's look at this row.

In [7]:
# Inspect the row whose index label is 1197 (.loc selects by label, not by position)
df.loc[1197]

Carbon concentration (weight%)                           0.06
Silicon concentration (weight%)                          0.32
Manganese concentration (weight%)                        1.23
Sulphur concentration (weight%)                        <0.002
Phosphorus concentration (weight%)                      0.002
Nickel concentration (weight%)                           1.01
Chromium concentration (weight%)                          8.4
Molybdenum concentration (weight%)                       0.89
Vanadium concentration (weight%)                         0.22
Copper concentration (weight%)                          <0.01
Cobalt concentration (weight%)                          <0.01
Tungsten concentration (weight%)                          NaN
Oxygen concentration (ppm)                                380
Titanium concentration (ppm)                             <100
Nitrogen concentration (ppm)                               80
Aluminium concentration (ppm)                            <100
Boron co

Some columns that are expected to hold numerical values contain non-numerical entries (like the '<0.002' value of the variable 'Sulphur concentration (weight%)' in row 1197 of the dataframe).

First, we will convert only the columns that can be converted as they are; then we will treat the rest.

In [26]:
def _numeric_or_unchanged(column):
    """Return ``column`` parsed as numbers, or ``column`` itself if parsing fails.

    This reproduces what ``pd.to_numeric(column, errors='ignore')`` did; that
    option is deprecated in recent pandas releases and emits a FutureWarning.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # At least one value is not a number: leave the whole column untouched
        return column


# Convert every column that parses cleanly; the others keep their current dtype
df = df.apply(_numeric_or_unchanged)

  df = df.apply(pd.to_numeric, errors='ignore')


In [9]:
# Check the column dtypes again after the first conversion attempt
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Carbon concentration (weight%)              1652 non-null   float64
 1   Silicon concentration (weight%)             1652 non-null   float64
 2   Manganese concentration (weight%)           1652 non-null   float64
 3   Sulphur concentration (weight%)             1648 non-null   object 
 4   Phosphorus concentration (weight%)          1642 non-null   float64
 5   Nickel concentration (weight%)              697 non-null    float64
 6   Chromium concentration (weight%)            784 non-null    float64
 7   Molybdenum concentration (weight%)          793 non-null    object 
 8   Vanadium concentration (weight%)            928 non-null    object 
 9   Copper concentration (weight%)              578 non-null    object 
 10  Cobalt conce

We can see that already 23 columns have no problems.
Now, we will deal with the other numerical columns that may contain some anomalies (like the '<' in the last example)

First, we will remove the '<' symbol from the numerical variables

In [27]:
# Remove every '<' character from the text cells so that an entry such as
# '<0.002' can be parsed as the number 0.002.
df = df.replace({r'<': ''}, regex=True)

# Retry the numeric conversion on the columns that are still text. A column
# that still holds non-numeric text is deliberately left as it is, which is
# what the deprecated errors='ignore' option of pd.to_numeric used to do.
for column_name in df.select_dtypes(include='object').columns:
    try:
        df[column_name] = pd.to_numeric(df[column_name])
    except (ValueError, TypeError):
        continue


  df = df.apply(pd.to_numeric, errors='ignore')


In [11]:
# Check the column dtypes again now that the '<' characters have been removed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Carbon concentration (weight%)              1652 non-null   float64
 1   Silicon concentration (weight%)             1652 non-null   float64
 2   Manganese concentration (weight%)           1652 non-null   float64
 3   Sulphur concentration (weight%)             1648 non-null   float64
 4   Phosphorus concentration (weight%)          1642 non-null   float64
 5   Nickel concentration (weight%)              697 non-null    float64
 6   Chromium concentration (weight%)            784 non-null    float64
 7   Molybdenum concentration (weight%)          793 non-null    float64
 8   Vanadium concentration (weight%)            928 non-null    float64
 9   Copper concentration (weight%)              578 non-null    float64
 10  Cobalt conce

We still have 7 'object' type columns, let's explore them

In [28]:
# Names of the columns that are still stored with the generic object dtype
object_columns = df.select_dtypes(include='object').columns

# Show only those columns
df.loc[:, object_columns]

Unnamed: 0,Nitrogen concentration (ppm),AC or DC,Electrode polarity,Interpass temperature (Â°C),Weld type,Hardness (kg/mmÂ²),Weld ID
0,,DC,+,200,MMA,,Evans-Ni/CMn-1990/1991-0Aaw
1,,DC,+,200,MMA,,Evans-Ni/CMn-1990/1991-0Aawch
2,,DC,+,200,MMA,,Evans-Ni/CMn-1990/1991-0Aht
3,,DC,+,200,MMA,,Evans-Ni/CMn-1990/1991-0Baw
4,,DC,+,200,MMA,,Evans-Ni/CMn-1990/1991-0Bawch
...,...,...,...,...,...,...,...
1647,398,,+,200,SA,,Birmingham-MAX35
1648,394,,+,200,SA,,Birmingham-MAX36
1649,96,,+,200,SA,,Birmingham-MAX37
1650,99,,+,200,SA,,Birmingham-MAX38


We can see that the 3 columns 'Nitrogen concentration (ppm)', 'Interpass temperature (Â°C)' and 'Hardness (kg/mmÂ²)' should have numerical values.
Let's see what are the non-numerical values in these columns

In [29]:
# Take a copy of df; the cells below read from df1
df1 = df.copy()

In [43]:
# Object-typed columns that are expected to hold numerical values
list_of_numeric_columns = [
    'Nitrogen concentration (ppm)',
    'Interpass temperature (Â°C)',
    'Hardness (kg/mmÂ²)',
]


In [44]:
# Print the distinct values of each of these columns, with blank lines between columns
for column in list_of_numeric_columns:
    distinct_values = df1[column].unique()
    print(column, distinct_values, end='\n\n\n')

Nitrogen concentration (ppm) [nan '72' '54' '57' '47' '44' '46' '68' '55' '53' '50' '48' '52' '89' '70'
 '41' '38' '80' '49' '77' '94' '65' '67' '58' '60' '460' '480' '160' '155'
 '67tot33res' '66totndres' '61tot34res' '54totndres' '54tot24res'
 '52tot18res' '50tot17res' '48tot18res' '78' '88' '75' '84' '85' '79' '76'
 '83' '92' '74' '86' '90' '110' '97' '99' '91' '105' '120' '150' '81' '87'
 '93' '102' '96' '66' '73' '71' '82' '145' '148' '164' '166' '235' '226'
 '243' '239' '253' '249' '240' '100' '143' '119' '539' '515' '494' '489'
 '552' '517' '520' '544' '526' '537' '529' '509' '523' '107' '114' '106'
 '117' '125' '95' '109' '64' '36' '63' '43' '39' '34' '69' '59' '37' '51'
 '56' '140' '190' '170' '124' '133' '108' '61' '122' '121' '131' '101'
 '103' '116' '138' '139' '62' '136' '540' '180' '450' '370' '250' '400'
 '430' '420' '410' '390' '260' '340' '26' '22' '21' '35' '127' '156' '245'
 '312' '266' '123' '236' '165' '113' '269' '45' '27' '42' '98' '373' '376'
 '416' '398' '394']

Let's address the first column: 'Nitrogen concentration (ppm)'

In [49]:
# Find the non-numeric entries in 'Nitrogen concentration (ppm)'.
# The work is done on standalone Series, so df1 never receives a scratch
# column that would have to be dropped again afterwards.
nitrogen_raw = df1['Nitrogen concentration (ppm)']

# Values that cannot be parsed as numbers become NaN
nitrogen_parsed = pd.to_numeric(nitrogen_raw, errors='coerce')

# An entry is problematic when it was present in the raw column but turned
# into NaN during parsing (entries that were already missing are excluded)
is_problematic = nitrogen_parsed.isna() & nitrogen_raw.notna()
problematic_entries = nitrogen_raw[is_problematic].unique()

problematic_entries

array(['67tot33res', '66totndres', '61tot34res', '54totndres',
       '54tot24res', '52tot18res', '50tot17res', '48tot18res'],
      dtype=object)

In [47]:
# NOTE(review): this repeats the display at the end of the previous cell, and
# its execution count (In [47]) is lower than that cell's (In [49]), so the
# saved output may come from an earlier version -- re-run or remove.
problematic_entries

Unnamed: 0,Carbon concentration (weight%),Silicon concentration (weight%),Manganese concentration (weight%),Sulphur concentration (weight%),Phosphorus concentration (weight%),Nickel concentration (weight%),Chromium concentration (weight%),Molybdenum concentration (weight%),Vanadium concentration (weight%),Copper concentration (weight%),...,Charpy impact toughness (J),Hardness (kg/mmÂ²),50% FATT,Primary ferrite (%),Ferrite with second phase (%),Acicular ferrite (%),Martensite (%),Ferrite with carbide aggregate (%),Weld ID,Nitrogen concentration (ppm)_numeric
274,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,,,,,,,,,Evans-Al/CMn-1990-5aw,
275,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,28.0,,,,,,,,Evans-Al/CMn-1990-5awch1,
276,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,100.0,,,,,,,,Evans-Al/CMn-1990-5awch2,
277,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,,,,23.0,5.0,72.0,0.0,0.0,Evans-Al/CMn-1990-5aw,
278,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,,,,,,,,,Evans-Al/CMn-1990-5ht,
279,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,28.0,,,,,,,,Evans-Al/CMn-1990-5htch1,
280,0.069,0.3,1.36,0.007,0.009,,,,0.0005,,...,100.0,,,,,,,,Evans-Al/CMn-1990-5htch2,
281,0.073,0.33,1.39,0.007,0.008,,,,0.0005,,...,,,,,,,,,Evans-Al/CMn-1990-20aw,
282,0.073,0.33,1.39,0.007,0.008,,,,0.0005,,...,28.0,,,,,,,,Evans-Al/CMn-1990-20awch1,
283,0.073,0.33,1.39,0.007,0.008,,,,0.0005,,...,100.0,,,,,,,,Evans-Al/CMn-1990-20awch2,
