## Faulty Steel Plate Data Load into Databricks 

Overview

In this notebook, I load the original CSV file that I had uploaded to the Databricks File System (DBFS) into a Spark DataFrame. Then, I create a queryable table inside a schema called FSP (short for faulty steel plates).

This is the preparation step that loads the data before I train the model in the next notebook. One of the main advantages of creating a table inside Databricks is that the resulting table is persistent across cluster restarts and is accessible to other users as well.

In [0]:
# Location and format of the raw file previously uploaded to DBFS
file_location = "/FileStore/tables/portfolio/faultysteelplates/faults.csv"
file_type = "csv"

# CSV reader options.
# inferSchema is enabled so that the numeric measurements and the 0/1 fault
# flags are loaded with numeric types. With it disabled, Spark reads every
# column as a string, and those strings would be persisted into the table
# that the model-training notebook consumes.
# Trade-off: schema inference needs one extra pass over the file.
infer_schema = "true"
first_row_is_header = "true"  # the first line of the file holds the column names
delimiter = ","

# Read the CSV into a Spark DataFrame using the options above
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

# Render the DataFrame in the notebook for a quick visual sanity check
display(df)

X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
42,50,270900,270944,267,17,44,24220,76,108,1687,1,0,80,0.0498,0.2415,0.1818,0.0047,0.4706,1.0,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,1,0,0,0,0,0,0
645,651,2538079,2538108,108,10,30,11397,84,123,1687,1,0,80,0.7647,0.3793,0.2069,0.0036,0.6,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
829,835,1553913,1553931,71,8,19,7972,99,125,1623,1,0,100,0.971,0.3426,0.3333,0.0037,0.75,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
853,860,369370,369415,176,13,45,18996,99,126,1353,0,1,290,0.7287,0.4413,0.1556,0.0052,0.5385,1.0,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
1289,1306,498078,498335,2409,60,260,246930,37,126,1353,0,1,185,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0,1,0,0,0,0,0,0
430,441,100250,100337,630,20,87,62357,64,127,1387,0,1,40,0.62,0.3417,0.1264,0.0079,0.55,1.0,1.0,2.7993,1.0414,1.9395,0.8736,-0.2267,0.9874,1,0,0,0,0,0,0
413,446,138468,138883,9052,230,432,1481991,23,199,1687,0,1,150,0.4896,0.339,0.0795,0.0196,0.1435,0.9607,1.0,3.9567,1.5185,2.6181,0.9205,0.2791,1.0,1,0,0,0,0,0,0
190,200,210936,210956,132,11,20,20007,124,172,1687,0,1,150,0.2253,0.34,0.5,0.0059,0.9091,1.0,1.0,2.1206,1.0,1.301,0.5,0.1841,0.3359,1,0,0,0,0,0,0
330,343,429227,429253,264,15,26,29748,53,148,1687,0,1,150,0.3912,0.2189,0.5,0.0077,0.8667,1.0,1.0,2.4216,1.1139,1.415,0.5,-0.1197,0.5593,1,0,0,0,0,0,0
74,90,779144,779308,1506,46,167,180215,53,143,1687,0,1,150,0.0877,0.4261,0.0976,0.0095,0.3478,0.982,1.0,3.1778,1.2041,2.2148,0.9024,-0.0651,1.0,1,0,0,0,0,0,0


In [0]:
%sql

-- Create the schema named FSP, which is short for faulty steel plates.
-- "if not exists" makes this cell idempotent: re-running the notebook no
-- longer fails with a "schema already exists" error.
create schema if not exists FSP

In [0]:
# Persist the DataFrame as a Parquet-backed table named "faults" in the FSP schema
permanent_table_name = "FSP.faults"

# The writer's default save mode raises an error when the table already exists,
# which breaks any re-run of this notebook. "overwrite" replaces the table with
# a fresh load of the source data, so the cell can be executed repeatedly.
df.write.format("parquet").mode("overwrite").saveAsTable(permanent_table_name)