In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import itertools
from imblearn.over_sampling import SMOTE

In [4]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
# Path of the source CSV on the mounted Google Drive.
DATASET_PATH = '/content/drive/My Drive/Demoz-files/Copy-of-demewez-datasets-formated.csv'

# Read the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,MFoC,MT,OQ,ND,D,DT,C,Si,Mn,Cr,...,CQ,PQ,MQ,SMC,AMP,ShT,MS,Cmpt,SiS,SoD
0,Cast-Iron,G-25,4,3,1,Shrinkage,3.0,2.0,0.7,0,...,Good,Low,Low,0.5,1200,10,Good,35,1st-clammed,Moderate
1,Cast-Iron,G-25,20,15,5,Porosity,3.0,2.0,0.7,0,...,Excellent,Medium,Medium,0.3,1200,15,Good,55,2nd-clammed,Severe
2,Bronze,Bronze,40,27,13,Porosity,0.0,0.0,0.0,0,...,weak,Medium,Low,0.5,1000,120,Excellent,40,1st-clammed,Severe
3,Steel,High-Mn-Steel,8,7,1,Porosity,1.0,0.8,12.0,0,...,Excellent,Medium,Medium,0.2,1400,130,Good,36,1st-clammed,Minor
4,Steel,High-Mn-Steel,8,7,3,Other-Defects,1.0,0.8,12.0,0,...,Good,Low,Medium,0.5,1200,10,Good,37,2nd-clammed,Severe


In [10]:
# Report the (rows, columns) shape of the freshly loaded frame.
print("Total Data Size: ", data.shape)

Total Data Size:  (1001, 37)


In [13]:
def convert_non_numeric_to_numeric(dataframe):
    """Label-encode every non-numeric column of ``dataframe`` in place.

    Columns whose dtype is numeric (any integer or float width, including
    pandas nullable numeric dtypes) are left untouched. Every other column,
    including boolean columns, is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in, after encoding.
        Callers that need the raw values afterwards must pass a copy.
    """
    for column in dataframe.columns:
        values = dataframe[column]

        # Ask pandas whether the dtype is numeric instead of matching only
        # 'int64'/'float64'; otherwise int32/float32/uint8/... columns would be
        # treated as categorical and have their real values replaced by codes.
        # Booleans count as numeric for pandas but are still encoded here.
        already_numeric = (
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
        )
        if already_numeric:
            continue

        # A fresh encoder per column: each column has its own set of classes.
        dataframe[column] = LabelEncoder().fit_transform(values)

    return dataframe

In [16]:
# Label-encode every non-numeric column. The helper writes into the frame it is
# given and returns that same object, so `data_numeric` and `data` both refer
# to the one encoded DataFrame from here on.
data_numeric = convert_non_numeric_to_numeric(dataframe=data)

data_numeric.head()

Unnamed: 0,MFoC,MT,OQ,ND,D,DT,C,Si,Mn,Cr,...,CQ,PQ,MQ,SMC,AMP,ShT,MS,Cmpt,SiS,SoD
0,2,2,4,3,1,3,3.0,2.0,0.7,0,...,1,1,1,0.5,1200,10,1,35,0,1
1,2,2,20,15,5,2,3.0,2.0,0.7,0,...,0,2,2,0.3,1200,15,1,55,1,2
2,0,0,40,27,13,2,0.0,0.0,0.0,0,...,2,2,1,0.5,1000,120,0,40,0,2
3,3,5,8,7,1,2,1.0,0.8,12.0,0,...,0,2,2,0.2,1400,130,1,36,0,0
4,3,5,8,7,3,1,1.0,0.8,12.0,0,...,1,1,2,0.5,1200,10,1,37,1,2


In [17]:
def calculate_target_correlation(dataframe, target_column):
    """Return the correlation of every other column with ``target_column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are correlated via ``DataFrame.corr()``.
    target_column : str
        Label of the column to correlate the others against.

    Returns
    -------
    pandas.Series
        Indexed by column label and named ``target_column``; the entry for
        ``target_column`` itself is removed.
    """
    pairwise = dataframe.corr()

    # Keep only the target's column of the matrix, then discard the target's
    # correlation with itself.
    against_target = pairwise.loc[:, target_column]
    return against_target.drop(labels=target_column)

In [19]:
# Correlate every feature with the defect-type column and print the result.
target_column_1 = 'DT'
target_correlation = calculate_target_correlation(
    dataframe=data_numeric,
    target_column=target_column_1,
)

header_line = "Correlation with the Defect Type '{}':".format(target_column_1)
print(header_line, target_correlation, sep="\n")


Correlation with the Defect Type 'DT':
MFoC    0.013807
MT      0.049564
OQ      0.040595
ND      0.012125
D       0.203024
C      -0.026470
Si     -0.087665
Mn     -0.060362
Cr     -0.001820
P       0.002871
S      -0.029120
Cu     -0.009792
Sn     -0.010668
GS      0.020544
BC      0.030573
GP      0.049457
CaT     0.037555
CoR    -0.053519
CTe    -0.060645
PR     -0.090323
PT     -0.024303
SoT    -0.013563
SGSD    0.057927
ST      0.025012
SRR    -0.009989
SF     -0.024285
CQ     -0.063732
PQ      0.008810
MQ      0.007044
SMC     0.026717
AMP    -0.035572
ShT    -0.025408
MS     -0.036637
Cmpt   -0.004222
SiS    -0.003410
SoD     0.383058
Name: DT, dtype: float64


In [20]:
# Split the encoded frame into the target column ('DT') and everything else.
target = data_numeric['DT']
features = data_numeric.drop(columns=['DT'])

# Oversample with SMOTE; the fixed seed keeps the run repeatable.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so label-encoded categorical columns can receive in-between
# codes. SMOTENC may be more appropriate here -- confirm.
smote = SMOTE(random_state=42)
features_resampled, target_resampled = smote.fit_resample(features, target)

# Reassemble one DataFrame: resampled features first, 'DT' as the last column.
resampled_feature_frame = pd.DataFrame(features_resampled, columns=features.columns)
resampled_target_series = pd.Series(target_resampled, name='DT')
data_resampled = pd.concat([resampled_feature_frame, resampled_target_series], axis=1)


In [21]:
# (rows, columns) of the SMOTE-resampled frame.
data_resampled.shape

(1500, 37)

In [22]:
# Column labels of the resampled frame; 'DT' was concatenated last.
data_resampled.columns

Index(['MFoC', 'MT', 'OQ', 'ND', 'D', 'C', 'Si', 'Mn', 'Cr', 'P', 'S', 'Cu',
       'Sn', 'GS', 'BC', 'GP', 'CaT', 'CoR', 'CTe', 'PR', 'PT', 'SoT', 'SGSD',
       'ST', 'SRR', 'SF', 'CQ', 'PQ', 'MQ', 'SMC', 'AMP', 'ShT', 'MS', 'Cmpt',
       'SiS', 'SoD', 'DT'],
      dtype='object')

In [23]:
# Number of columns in the resampled frame (second entry of its shape).
data_resampled.shape[1]

37

In [24]:
# Features to discard. These are the columns whose correlation with 'DT' in the
# printed correlation table has an absolute value below 0.01.
columns_to_remove = [
    'Cr', 'P', 'Cu', 'SRR',
    'PQ', 'MQ', 'Cmpt', 'SiS',
]

# NOTE(review): this drops from `data` (the encoded, non-resampled frame), not
# from `data_resampled`, so the SMOTE output is not what gets saved later --
# confirm that is intended. The drop is in place, so re-running this cell
# raises KeyError because the columns are already gone.
data.drop(labels=columns_to_remove, axis=1, inplace=True)
data.shape

(1001, 29)

In [26]:
# Preview the first rows of `data` after the columns were dropped.
data.head()

Unnamed: 0,MFoC,MT,OQ,ND,D,DT,C,Si,Mn,S,...,SoT,SGSD,ST,SF,CQ,SMC,AMP,ShT,MS,SoD
0,2,2,4,3,1,3,3.0,2.0,0.7,0.15,...,20,2,40,2,1,0.5,1200,10,1,1
1,2,2,20,15,5,2,3.0,2.0,0.7,0.15,...,25,2,45,0,0,0.3,1200,15,1,2
2,0,0,40,27,13,2,0.0,0.0,0.0,0.0,...,30,2,40,2,2,0.5,1000,120,0,2
3,3,5,8,7,1,2,1.0,0.8,12.0,0.06,...,45,2,38,0,0,0.2,1400,130,1,0
4,3,5,8,7,3,1,1.0,0.8,12.0,0.06,...,36,1,40,0,1,0.5,1200,10,1,2


In [27]:
# Write the reduced frame to a CSV in the current working directory; the row
# index is left out of the file.
new_file_name = 'modified_dataset_for_shape.csv'
data.to_csv(path_or_buf=new_file_name, index=False)

In [28]:
# Colab-only step: hand the CSV written to `new_file_name` to the browser as a
# download.
from google.colab import files

files.download(new_file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>