# Data Wrangling

This project will create a model that predicts the physical strength of a steel based on its alloying elements and temperature

## 1.0 Importing libraries and loading data

In [1]:
# Importing necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the raw steel dataset (a single CSV file) into a DataFrame
# NOTE(review): absolute, machine-specific path; it will not resolve on another
# machine as-is. Consider a path relative to the project root.
path = "/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/raw/steel.csv"
steel = pd.read_csv(path)

## 2.0 Exploring data

In [3]:
# Preview the first few rows of the raw table
steel.head()

Unnamed: 0,Alloy code,C,Si,Mn,P,S,Ni,Cr,Mo,Cu,V,Al,N,Ceq,Nb + Ta,Temperature (°C),0.2% Proof Stress (MPa),Tensile Strength (MPa),Elongation (%),Reduction in Area (%)
0,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,27,342,490,30,71
1,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,100,338,454,27,72
2,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,200,337,465,23,69
3,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,300,346,495,21,70
4,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,0.0,400,316,489,26,79


In [4]:
# Inspect the raw column labels (note the leading spaces on many of them)
steel.columns

Index(['Alloy code', ' C', ' Si', ' Mn', ' P', ' S', ' Ni', ' Cr', ' Mo',
       ' Cu', 'V', ' Al', ' N', 'Ceq', 'Nb + Ta', ' Temperature (°C)',
       ' 0.2% Proof Stress (MPa)', ' Tensile Strength (MPa)',
       ' Elongation (%)', ' Reduction in Area (%)'],
      dtype='object')

In [5]:
# Check the dtype pandas inferred for each column
steel.dtypes

Alloy code                   object
 C                          float64
 Si                         float64
 Mn                         float64
 P                          float64
 S                          float64
 Ni                         float64
 Cr                         float64
 Mo                         float64
 Cu                         float64
V                           float64
 Al                         float64
 N                          float64
Ceq                         float64
Nb + Ta                     float64
 Temperature (°C)             int64
 0.2% Proof Stress (MPa)      int64
 Tensile Strength (MPa)       int64
 Elongation (%)               int64
 Reduction in Area (%)        int64
dtype: object

In [6]:
# (rows, columns) of the raw table
steel.shape

(915, 20)

In [7]:
# Summary statistics of the numeric columns, used to spot implausible values
steel.describe()

Unnamed: 0,C,Si,Mn,P,S,Ni,Cr,Mo,Cu,V,Al,N,Ceq,Nb + Ta,Temperature (°C),0.2% Proof Stress (MPa),Tensile Strength (MPa),Elongation (%),Reduction in Area (%)
count,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0,915.0
mean,0.174929,0.310918,0.812962,0.014543,0.010602,0.143016,0.427861,0.44287,0.079148,0.06019,0.012435,0.007652,0.093989,4.1e-05,351.601093,328.218579,496.248087,26.789071,70.206557
std,0.059674,0.086871,0.342775,0.005244,0.004024,0.172746,0.457568,0.394383,0.058481,0.096403,0.013067,0.002321,0.172533,0.000261,189.714773,131.653232,239.71065,8.806595,12.389289
min,0.09,0.18,0.42,0.006,0.003,0.0,0.0,0.005,0.0,0.0,0.002,0.0025,0.0,0.0,27.0,27.0,162.0,10.0,18.0
25%,0.13,0.24,0.5,0.01,0.008,0.023,0.04,0.05,0.04,0.0,0.004,0.0062,0.0,0.0,200.0,220.0,413.0,20.0,62.0
50%,0.16,0.3,0.68,0.014,0.01,0.05,0.11,0.5,0.07,0.0,0.006,0.0076,0.0,0.0,400.0,290.0,479.0,26.0,71.0
75%,0.2,0.37,1.21,0.018,0.012,0.21,1.0,0.56,0.11,0.07,0.016,0.009,0.0,0.0,500.0,432.5,575.0,31.0,80.0
max,0.34,0.52,1.48,0.03,0.022,0.6,1.31,1.35,0.25,0.3,0.05,0.015,0.437,0.0017,650.0,690.0,6661.0,78.0,94.0


There appears to be at least one observation with a tensile strength of 6661 MPa, more than ten times the 75th percentile (575 MPa). It will be dropped in the section below.

## 3.0 Data Manipulation

### 3.2 Renaming columns

For simplicity's sake, the columns will be renamed to short lowercase labels. Yield strength is another name for the 0.2% proof stress, so that column will be named `yield`.

In [8]:
# Assign short lowercase labels by position: the list must stay in the same
# order as the 20 columns of the raw table.
steel.columns = [
    'alloy_code',
    # alloying elements
    'c', 'si', 'mn', 'p', 's', 'ni', 'cr', 'mo', 'cu', 'v', 'al', 'n', 'ceq', 'nb+ta',
    # test temperature
    'temp',
    # measured properties
    'yield', 'tensile', 'elongation', 'red_area',
]
steel.columns

Index(['alloy_code', 'c', 'si', 'mn', 'p', 's', 'ni', 'cr', 'mo', 'cu', 'v',
       'al', 'n', 'ceq', 'nb+ta', 'temp', 'yield', 'tensile', 'elongation',
       'red_area'],
      dtype='object')

### 3.1 Dropping columns and observations

Columns such as alloy code aren't useful in this case. Ceq isn't useful either as all other alloying elements are known.

In [9]:
# Shape before de-duplication, for comparison with the check after it
steel.shape

(915, 20)

In [10]:
# Remove rows that are exact duplicates of an earlier row
steel = steel.drop_duplicates()

In [11]:
# Shape after de-duplication (unchanged in the recorded output, so no duplicates were found)
steel.shape

(915, 20)

In [12]:
# Drop the alloy identifier and Ceq, which are not used as predictors
steel = steel.drop(columns=['alloy_code', 'ceq'])

In [13]:
# Number of observations recorded at each test temperature
steel['temp'].value_counts()

27     95
100    95
200    95
300    95
400    95
450    95
500    95
550    95
600    76
350    45
650    30
150     2
250     2
Name: temp, dtype: int64

As can be seen, the temperature range extends up to 650 °C, which is not a practical temperature for the use of steel in typical engineering settings. Therefore, observations at 500 °C and above will be dropped.

In [14]:
# Keep only observations taken strictly below 500 degC
# (rows at exactly 500 degC are excluded as well).
steel = steel.loc[steel['temp'].lt(500)]

In [15]:
# Confirm which temperatures remain after the filter
steel['temp'].unique()

array([ 27, 100, 200, 300, 400, 450, 350, 150, 250])

#### The data seems to be free of null values; however, there is the issue of the unusually high tensile strength observation

In [16]:
# Isolate the observation(s) whose tensile strength is exactly 6661 MPa
outlier = steel.loc[steel['tensile'].eq(6661)]
outlier

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp,yield,tensile,elongation,red_area
626,0.18,0.37,0.66,0.018,0.017,0.1,1.07,1.03,0.12,0.3,0.02,0.0081,0.0,300,519,6661,11,28


In [17]:
# Remove the outlier row(s) from the table by index label
steel = steel.drop(index=outlier.index)

In [18]:
# Ensuring there are no other outliers: list the rows with the highest
# remaining tensile strengths
steel.sort_values(by='tensile', ascending=False).head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp,yield,tensile,elongation,red_area
391,0.29,0.2,0.75,0.01,0.009,0.34,1.0,1.25,0.14,0.26,0.002,0.0075,0.0,27,690,830,17,59
421,0.29,0.26,0.76,0.009,0.007,0.45,1.12,1.18,0.07,0.23,0.003,0.0103,0.0,27,685,830,17,62
431,0.29,0.26,0.77,0.009,0.007,0.46,1.12,1.2,0.08,0.23,0.004,0.0095,0.0,27,660,820,17,59
451,0.3,0.27,0.7,0.012,0.012,0.44,1.1,1.35,0.11,0.27,0.003,0.0082,0.0,27,655,810,18,62
381,0.28,0.18,0.75,0.012,0.009,0.32,1.0,1.25,0.14,0.27,0.002,0.009,0.0,27,665,810,18,63


## 4.0 Data Cleaning

In [19]:
# Count missing (NA) values in each column
steel.isna().sum()

c             0
si            0
mn            0
p             0
s             0
ni            0
cr            0
mo            0
cu            0
v             0
al            0
n             0
nb+ta         0
temp          0
yield         0
tensile       0
elongation    0
red_area      0
dtype: int64

In [20]:
# Final look at the first rows of the cleaned table
steel.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp,yield,tensile,elongation,red_area
0,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,27,342,490,30,71
1,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,100,338,454,27,72
2,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,200,337,465,23,69
3,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,300,346,495,21,70
4,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,400,316,489,26,79


In [21]:
# Final (rows, columns) of the cleaned table
steel.shape

(618, 18)

#### The columns have been renamed, outliers have been dropped and observations at more reasonable temperatures have been selected. The dataset is now ready to be used in the next stage of analysis.

In [22]:
# Save the cleaned table as CSV, without writing the index column
# NOTE(review): absolute, machine-specific path, and the cleaned file is written
# into data/raw next to the raw input. Confirm this location is intended.
steel.to_csv('/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/raw/steel_clean.csv', index=False)