In [1]:
import urllib.request
import os

# Base URL of the statlog/vehicle directory on the UCI archive
url_link = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/vehicle/"

# The dataset is split across nine part files; they are fetched in this order
data_names = ["xaa.dat", "xab.dat", "xac.dat", "xad.dat", "xae.dat", "xaf.dat", "xag.dat", "xah.dat", "xai.dat"]

# Raw bytes of each downloaded part, in the same order as data_names
list_data = []

# Download every part file and keep its contents in memory
for file_name in data_names:
    # Full URL of this part
    file_url = url_link + file_name

    # Use the response as a context manager so the underlying HTTP connection
    # is closed as soon as the body has been read, even if read() raises.
    with urllib.request.urlopen(file_url) as response:
        data = response.read()
    list_data.append(data)

# Join the parts with a newline so the last row of one part can never run
# into the first row of the next one
combined_data = b"\n".join(list_data)

# Persist the concatenated bytes for the cells that follow
with open("combined_data.dat", "wb") as f:
    f.write(combined_data)

In [2]:
import pandas as pd

# Read the combined data file into a DataFrame. The separator is a raw string:
# in a normal string literal "\s" is an invalid escape sequence, which Python
# reports with a warning and plans to turn into an error.
df = pd.read_csv("combined_data.dat", sep=r"\s+", header=None)

# Labels whose row counts should be reported
names = ["van", "saab", "opel", "bus"]

# Frequency of every distinct value found in the last column
count_name = df.iloc[:, -1].value_counts()

# Report one line per label; a label that never occurs is reported as 0
for name in names:
    count = count_name.get(name, 0)
    print(f"{name}: {count} rows")

van: 199 rows
saab: 217 rows
opel: 212 rows
bus: 218 rows


In [3]:
# Read the combined data file into a DataFrame.
# sep=r"\s+" splits on any run of whitespace; it replaces delim_whitespace=True,
# which pandas documents as equivalent and has deprecated since 2.2.
df = pd.read_csv("combined_data.dat", sep=r"\s+", header=None)

# Name the first 19 columns (18 features followed by "class"). Any further
# columns, should the file contain them, are labelled with their 1-based
# position (20, 21, ...), so the list always matches the column count.
df.columns = ["compactness", "circularity", "distance circularity", "radius ratio", "pr.axis_aspect_ratio", "max.length_aspect_ratio", 
              "scatter ratio", "elongatedness", "pr.axis_rectangularity", "max.length_rectangularity", "scaled_variance", "scaled_variance.1", "scaled_radius_of_gyration", "scaled_radius_of_gyration.1",
             "skewness_about","skewness_about.1","skewness_about.2","hollows_ratio","class"] + list(range(20, len(df.columns)+1))

# Display the resulting DataFrame
display(df)

Unnamed: 0,compactness,circularity,distance circularity,radius ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio,class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183,64,8,169,40,20,134,200,422,149,72,7,25,188,195,saab
842,89,46,84,163,66,11,159,43,20,159,173,368,176,72,1,20,186,197,van
843,106,54,101,222,67,12,222,30,25,173,228,721,200,70,3,4,187,201,saab
844,86,36,78,146,58,7,135,50,18,124,155,270,148,66,0,25,190,195,saab


In [4]:
import numpy as np

# Per-column missing-value summary
def percent_missing(df):
    """Summarise how many values are missing in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df``, ordered by descending missing
        count, with two columns: ``Total`` (number of null entries) and
        ``Percent`` (that count as a percentage of ``len(df)``).
    """
    n_rows = len(df)
    null_counts = df.isna().sum().sort_values(ascending=False)
    null_share = null_counts.div(n_rows).mul(100)
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    return summary

# Build one table with each column's dtype next to its missing-value count and
# percentage. The pieces are given their final column labels before being
# joined side by side on the column-name index.
dtype_column = df.dtypes.rename('Data Type')
missing_columns = percent_missing(df).rename(
    columns={'Total': 'Total Missing', 'Percent': 'Percent Missing'}
)
data_info = pd.concat([dtype_column, missing_columns], axis=1, sort=False)
print(data_info)


                            Data Type  Total Missing  Percent Missing
compactness                     int64              0              0.0
circularity                     int64              0              0.0
distance circularity            int64              0              0.0
radius ratio                    int64              0              0.0
pr.axis_aspect_ratio            int64              0              0.0
max.length_aspect_ratio         int64              0              0.0
scatter ratio                   int64              0              0.0
elongatedness                   int64              0              0.0
pr.axis_rectangularity          int64              0              0.0
max.length_rectangularity       int64              0              0.0
scaled_variance                 int64              0              0.0
scaled_variance.1               int64              0              0.0
scaled_radius_of_gyration       int64              0              0.0
scaled_radius_of_gyr

In [5]:
# Labels of the first 18 columns of the DataFrame
cols = df.columns[:18]

# Label every value of one column with a noise category
def identify_noise(df, col):
    """Classify each entry of ``df[col]`` by how far it lies from the bulk of the column.

    The checks are applied in order and the first match wins:

    * ``'Missing'``  - the value is null
    * ``'Outlier'``  - outside the 1.5 * IQR fences around the quartiles
    * ``'Extreme'``  - more than 3 standard deviations from the mean
    * ``'Severe'``   - more than 2 standard deviations from the mean
    * ``'Moderate'`` - more than 1 standard deviation from the mean
    * ``'None'``     - none of the above

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : column label
        Column to classify.

    Returns
    -------
    list of str
        One label per row, in row order.
    """
    series = df[col]

    # Quartile-based fences
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5*spread
    low_fence = first_quartile - 1.5*spread

    # Mean / standard-deviation bands
    centre = series.mean()
    sigma = series.std()

    def classify(value):
        """Return the label for a single value, testing the widest band first."""
        if pd.isna(value):
            return 'Missing'
        if value > high_fence or value < low_fence:
            return 'Outlier'
        for label, width in (('Extreme', 3), ('Severe', 2), ('Moderate', 1)):
            if value > centre + width*sigma or value < centre - width*sigma:
                return label
        return 'None'

    return [classify(value) for value in series]

# Map every selected feature to its per-row list of noise labels
results_dict = {col: identify_noise(df, col) for col in cols}

# One column of labels per feature; each column is renamed "<feature> Noise"
results_df = pd.DataFrame.from_dict(results_dict)
results_df = results_df.rename(columns=lambda feature: feature + ' Noise')

# Save the labels space-separated, without the index or a header row
results_df.to_csv("noise_data.dat", sep=" ", index=False, header=False)

# Uncomment to preview the labels:
#print(results_df.head())

In [6]:
# Drop, feature by feature, every row whose value falls outside the 1.5 * IQR
# fences. df is reassigned inside the loop, so the fences for each feature are
# computed on the rows that survived the earlier features.
for col in cols:
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5*iqr
    upper_fence = q3 + 1.5*iqr

    # between() is inclusive at both ends, so values on a fence are kept
    df = df[values.between(lower_fence, upper_fence)]

# Save the remaining rows space-separated, without the index or a header row
df.to_csv("cleaned_data.dat", sep=" ", index=False, header=False)
