## Brainstorming
#### Analysis Questions 
- What was the original size of the data set chosen, and was it expanded to meet the requirement of 5,000 observations (5,000 rows)? 
- What method was used to expand the data set to 5,000 rows? (AI was used to generate additional rows, based on the trends in the data set.)
- How did this expansion method affect the data set?

#### Data Quality Assessment
- **First**: Assess the missing (NaN, "Not a Number") values
    - Remove NaN values using `.dropna()`
        - When removing all the NaN values, only 44 rows remain
    - Need to find all NaN values with `.isna()`, which returns a boolean mask
- Check data types using `.info()`
    - displays: Column names, types, non-null counts, memory
- Check statistical description using `.describe()`
- Use a random function to randomly extract 7500 rows/observations from the dataset
- Use `.value_counts()` for frequency count of each category
- Convert all NaN to random number values based on the mean of each column

In [32]:
import numpy as np
import pandas as pd

# Read the raw engineering dataset from its CSV file.
Eng_raw = pd.read_csv(filepath_or_buffer="data/engineering_messy_dataset.csv")

# Disabled step: turn the '??' placeholder strings into real NaN values.
# NOTE(review): while this stays disabled, '??' cells are not treated as
# missing by dropna() below - confirm whether that is intended.
# Eng_raw = Eng_raw.replace('??', np.nan)

# Snapshot containing only the rows that have no NaN in any column. It is
# taken before any of the clean-up steps that follow are applied to Eng_raw.
Eng_clean = Eng_raw.dropna(axis=0, how="any")

# pd.to_datetime(Eng_raw['Last_Maintenance'])

# Eng_Repaired_count = Eng_raw.query('Repaired == "Y" or Repaired == "yEs" or Repaired == "No" or Repaired == "N" or Repaired == "Yes" or Repaired == "yes" or Repaired == "no"').count()
# Eng_Repaired_count = Eng_Repaired_count[['Repaired']]

def _print_unique(column):
    """Print the distinct values currently stored in ``Eng_raw[column]``."""
    print(f"\n{Eng_raw[column].unique()}\n")


# Repaired: lower-case every answer first, then map the short and long
# spellings onto the two canonical labels "Yes" and "No".
_print_unique("Repaired")
repaired_lowered = Eng_raw["Repaired"].str.lower()
Eng_raw["Repaired"] = repaired_lowered.replace(
    {"yes": "Yes", "y": "Yes", "no": "No", "n": "No"}
)
_print_unique("Repaired")

# Technician: repair the single mistyped name.
_print_unique("Technician")
technician_fixes = {"j@mes": "James"}
Eng_raw["Technician"] = Eng_raw["Technician"].replace(technician_fixes)
_print_unique("Technician")

# Status: collapse the differently-cased spellings onto lower-case labels.
_print_unique("Status")
status_fixes = {"Inactive": "inactive", "ACTIVE": "active", "Active": "active"}
Eng_raw["Status"] = Eng_raw["Status"].replace(status_fixes)
_print_unique("Status")

# Free-text / categorical columns whose distinct values we want to inspect.
text_columns = ["Name", "Material", "Notes", "Remarks", "Comments"]

# Temporarily lift pandas' column-width and row-count display limits so the
# printed lists of unique values are not truncated.
with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
    # One entry per column, each holding that column's distinct values.
    Eng_raw_unique = Eng_raw[text_columns].apply(pd.Series.unique)
    print(f"\n{Eng_raw_unique}")

# Name: use spaces instead of underscores in the two multi-word component
# names so every name follows the same spelling convention.
Eng_raw["Name"] = Eng_raw["Name"].replace(
    {"gear_shaft": "gear shaft", "pipe_section": "pipe section"}
)
print(f"\n{Eng_raw['Name'].unique()}\n")

# Material: lower-case every entry, then expand the "alum." abbreviation so
# it merges with the fully spelled "aluminium" category.
Eng_raw["Material"] = Eng_raw["Material"].str.lower().replace({"alum.": "aluminium"})
print(f"\n{Eng_raw['Material'].unique()}\n")

# The second, identical "Name" replacement that used to follow here was
# removed: the mapping has already been applied above, so repeating it
# changed nothing.

# Extracting 7500 rows from dataset #
# Eng_cleanSize = Eng_clean.sample(n=7500, random_state=1)
# print(f"\n{Eng_raw.dtypes}\n")
# print(f"\n{Eng_raw.isna()}\n")

# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print(f"...") added a stray "None" line after the report.
# Call it on its own and print the surrounding blank lines separately.
print()
Eng_raw.info()
print()

# Summary statistics, shape and total element count of the working DataFrame.
print(f"{Eng_raw.describe()}\n")
print(f"Shape/dimensions of DataFrame: {Eng_raw.shape}\n")
print(f"Number of Elements: {Eng_raw.size}\n")

# Eng_raw.head(20)
# Shape of the dropna() snapshot (Eng_clean), for comparison with Eng_raw.
print(f"Shape/dimensions of DataFrame: {Eng_clean.shape}\n")

# Bare expression: as the last statement of a notebook cell it renders the
# DataFrame as the cell's output.
Eng_raw


['Y' 'yEs' 'No' 'N' nan 'Yes' 'yes' '??' 'no']


['Yes' 'No' nan '??']


['Mike' 'Paul' 'Tomi' 'j@mes' 'Ada' 'Ola' 'John' 'Janet' 'Jane' nan
 'James']


['Mike' 'Paul' 'Tomi' 'James' 'Ada' 'Ola' 'John' 'Janet' 'Jane' nan]


['faulty' 'active' 'Inactive' 'inactive' 'ACTIVE' 'Active' nan 'repair']


['faulty' 'active' 'inactive' nan 'repair']


Name        [nozzle, piston, valve, rotor, pump, bolt, joint, filter, bearing, spring, gear_shaft, pipe_section, axle, cylinder]
Material                                               [??, Titanium, Aluminium, Brass, steel, Copper, Iron, Alum., iron, Steel]
Notes             [good cond., broken, rust forming, cleaned, slight rust, dirty, crack on edge, loose fitting, nan, minor dent]
Remarks                                                                   [fine, weak, good, bad, none, ok, replace soon, !, ??]
Comments                                                         [replaced, fine, delayed, ok, done, nan, urgent, pending, none]
dtype: ob

Unnamed: 0,Component_ID,Name,Material,Weight(kg),Length(cm),Width(cm),Temp(C),Pressure(bar),Status,Last_Maintenance,Technician,Efficiency(%),Notes,Cost(₦),Batch,Remarks,Production_Date,Fault_Code,Repaired,Comments
0,CMP00001,nozzle,??,,146,,,,faulty,10/18/2024,Mike,??,good cond.,,B3,fine,,F007,Yes,replaced
1,CMP00002,piston,titanium,??,,,xx,11.6,active,11/28/2024,Paul,??,broken,12157,B1,weak,10/13/2024,F005,Yes,fine
2,CMP00003,nozzle,aluminium,33,96,,,,inactive,11/19/2024,Tomi,,broken,59545,B6,good,,F007,No,delayed
3,CMP00004,valve,brass,,,27.0,,,inactive,,James,,rust forming,??,,bad,,F001,Yes,fine
4,CMP00005,rotor,steel,7.45,,,??,11.54,inactive,11/3/2024,James,??,cleaned,,,none,,F004,No,ok
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,CMP49996,gear shaft,??,5.17,,53.0,,,active,,Ada,85.3,dirty,37216,B1,weak,,F005,No,urgent
49996,CMP49997,spring,steel,5.49,,59.0,78.7,,active,,Paul,??,dirty,,B6,!,,F002,Yes,replaced
49997,CMP49998,bearing,copper,8.29,71,,??,9.72,inactive,2/1/2024,Tomi,,cleaned,,B2,??,,F004,??,ok
49998,CMP49999,gear shaft,??,3.81,??,,,,inactive,,Paul,??,good cond.,14258,B4,fine,,F004,No,none
