# Data Wrangling

## 1.0 Importing packages and data

In [1]:
import pandas as pd
import numpy as np
import io
import os

In [None]:
np.random.seed(132)

In [2]:
# Images of each defect are stored in seperate folders
# Consolidating them into single DataFrame

# General path
path = '/Users/chinmayasukumar/Documents/Springboard/Capstone-3_Classifying_Steel_Defects/data/raw/'

# Folder of each defect type
folder = ['Crazing', 'Inclusions', 'Patches', 'Pitted', 'Rolled', 'Scratches']

# Creating interim DataFrame for images of each defect type prior to consolidation
df = pd.DataFrame()
super_df = pd.DataFrame()

# Looping through image folders and adding them to "super_df"
for i in folder:
    filenames = os.listdir(path+i)
    df = pd.DataFrame({'Filename': filenames, 'Type':i})
    super_df = pd.concat([super_df, df])

In [3]:
super_df.head(2)

Unnamed: 0,Filename,Type
0,Cr_87.bmp,Crazing
1,Cr_93.bmp,Crazing


In [4]:
# Exracting the image number
super_df['Number'] = super_df.Filename.str.extract(r'(\d+)')

In [5]:
super_df.head()

Unnamed: 0,Filename,Type,Number
0,Cr_87.bmp,Crazing,87
1,Cr_93.bmp,Crazing,93
2,Cr_78.bmp,Crazing,78
3,Cr_238.bmp,Crazing,238
4,Cr_44.bmp,Crazing,44


In [6]:
super_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1804 entries, 0 to 299
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Filename  1804 non-null   object
 1   Type      1804 non-null   object
 2   Number    1800 non-null   object
dtypes: object(3)
memory usage: 56.4+ KB


In [7]:
super_df.describe()

Unnamed: 0,Filename,Type,Number
count,1804,1804,1800
unique,1801,6,300
top,.DS_Store,Crazing,87
freq,4,301,6


In [8]:
super_df = super_df.sort_values(by=['Type','Number'])

In [9]:
# Checking for consistency within defect types
types = super_df['Type'].unique()

for i in types:
    print(i + ':', super_df[super_df['Type'] == i]['Filename'].nunique())

Crazing: 301
Inclusions: 301
Patches: 301
Pitted: 300
Rolled: 301
Scratches: 300


## 1.2 Removing irrelevant data

In [10]:
# Checking for non-image files
super_df[~super_df['Filename'].str.endswith('.bmp')]

Unnamed: 0,Filename,Type,Number
46,.DS_Store,Crazing,
50,.DS_Store,Inclusions,
45,.DS_Store,Patches,
46,.DS_Store,Rolled,


In [11]:
# Collecting all image paths only
super_df = super_df[super_df['Filename'].str.endswith('.bmp')]

In [12]:
super_df.describe()

Unnamed: 0,Filename,Type,Number
count,1800,1800,1800
unique,1800,6,300
top,Cr_1.bmp,Crazing,1
freq,1,300,6


In [13]:
super_df.isnull().sum()

Filename    0
Type        0
Number      0
dtype: int64

#### Data is clean and ready for EDA!

In [14]:
super_df.head()

Unnamed: 0,Filename,Type,Number
220,Cr_1.bmp,Crazing,1
178,Cr_10.bmp,Crazing,10
135,Cr_100.bmp,Crazing,100
138,Cr_101.bmp,Crazing,101
121,Cr_102.bmp,Crazing,102


## 1.3 Exporting

In [15]:
super_df.to_csv('/Users/chinmayasukumar/Documents/Springboard/Capstone-3_Classifying_Steel_Defects/data/interim/data.csv'\
               ,index=False)