# Import dependencies

In [None]:
import numpy as np

# Getting data
Get dataset

In [None]:
# Location of the Iris dataset (comma-separated text) in the UCI repository
path_to_dataset = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Parse the file into a float array. Fields that cannot be converted to
# float come back as NaN rather than raising an error.
data_set = np.genfromtxt(path_to_dataset, delimiter=',', dtype='float')

In [None]:
# Summarise the loaded array: element count, dimensions and element type
print(f"Size: {data_set.size}", f"shape: {data_set.shape}", f"dtype: {data_set.dtype}", sep="; ")

# Tasks
Find target column in dataset

In [None]:
# Flag every column that contains at least one NaN; NaN marks entries
# that are not numerical values, which identifies the target column
target_columns = np.isnan(data_set).any(axis=0)

# Indices of the flagged columns, as a tuple holding one index array
# (np.nonzero gives the same result as single-argument np.where)
target_index = np.nonzero(target_columns)

Get feature set from dataset

In [None]:
# Drop the target column(s) to get the feature dataset.
# target_index is the tuple returned by np.where; a non-empty tuple is
# truthy even when it holds no indices, so test the number of indices
# instead of the tuple itself.
columns_to_drop = np.ravel(target_index)

if columns_to_drop.size:
    feature_set = np.delete(data_set, columns_to_drop, axis=1)
else:
    # No target column was found: every column is a feature
    feature_set = data_set

Check if feature set has type np.ndarray

In [None]:
# Fail fast when the feature set is not a NumPy array.
# isinstance is the idiomatic type check (it also accepts ndarray
# subclasses), and TypeError names the actual problem while still being
# caught by any handler written for Exception.
if not isinstance(feature_set, np.ndarray):
    raise TypeError("Error: feature set hasn't type np.ndarray")

print("Featureset has type np.ndarray")

Get mean value for the 1st column

In [None]:
# Arithmetic mean over all rows of column 0 of the feature set
mean_for_first_column = feature_set[:, 0].mean()

f"Mean value for the 1st column is {mean_for_first_column:.2f}"

Get median value for the 1st column

In [None]:
# Median over all rows of column 0 of the feature set
median_for_first_column = np.median(feature_set[:, 0])

f"Median value for the 1st column is {median_for_first_column:.2f}"

Get standard deviation for the 1st column

In [None]:
# Standard deviation over all rows of column 0 of the feature set
# (NumPy's default, i.e. ddof=0)
stdev_for_first_column = feature_set[:, 0].std()

f"Standard deviation for the 1st column is {stdev_for_first_column:.2f}"

Replace the values at 20 distinct random positions of the feature set with NaN

In [None]:
# Number of cells to overwrite with NaN
nan_amount = 20

# Work on a copy so that feature_set itself stays untouched
updated_feature_set = np.copy(feature_set)

rows, cols = feature_set.shape

# Draw nan_amount distinct cells in a single call. Sampling flat indices
# without replacement guarantees there are no repetitions and raises
# ValueError when the array has fewer than nan_amount cells, where a
# retry loop would never terminate.
flat_positions = np.random.choice(rows * cols, size=nan_amount, replace=False)
random_rows, random_cols = np.unravel_index(flat_positions, (rows, cols))

# Keep the chosen cells available as a set of (row, col) pairs
random_positions = set(zip(random_rows.tolist(), random_cols.tolist()))

# Update items in dataset with nan in defined random positions
updated_feature_set[random_rows, random_cols] = np.nan

# Get number of nans in updated feature_set
number_of_nans = int(np.isnan(updated_feature_set).sum())
print(f"{number_of_nans} items in feature_set substituted by NaN")

Get positions of NaNs in 1st column

In [None]:
# Row indices whose first-column entry is NaN, as a tuple holding one
# index array
nan_positions = np.nonzero(np.isnan(updated_feature_set[:, 0]))

print("Positions of nan in 1st column are:\n")

# One row index per output line
for row_index in nan_positions[0]:
    print(row_index)

Filter updated featureset by condition

In [None]:
# Row mask: 3rd column greater than 1.5 and 1st column less than 5.0.
# A comparison against NaN is False, so rows with NaN in either of the
# two tested columns are not selected.
condition = np.logical_and(
    updated_feature_set[:, 2] > 1.5,
    updated_feature_set[:, 0] < 5.0,
)

# Keep only the rows selected by the mask
filtered_feature_set = updated_feature_set[condition]

filtered_feature_set

Change all NaNs to 0 in updated dataset

In [None]:
# Replace every NaN with 0 directly inside updated_feature_set
# (copy=False requests the replacement to happen in place)
np.nan_to_num(updated_feature_set, copy=False, nan=0)

Count unique items and return them with their counts

In [None]:
# Get unique items and their counts.
# return_counts expects a boolean; the string 'True' only worked because
# any non-empty string is truthy (the string 'False' would have behaved
# the same way).
unique_items = np.unique(updated_feature_set, return_counts=True)

# Count unique items
number_of_unique = len(unique_items[0])

print(f"Number of unique items is {number_of_unique}\n")

# Report each unique item together with how often it occurs
for item, count in zip(*unique_items):
    print(f"Item {item} appears {count} times")

Split the array into 2 parts vertically (along axis 0, i.e. by rows)

In [None]:
# Number of row blocks to produce
num_splits = 2

# Cut the rows (axis 0) into num_splits consecutive blocks; the unpacking
# below relies on num_splits being 2
first_subarray, second_subarray = np.array_split(updated_feature_set, num_splits, axis=0)

print(f"First subarray is of shape: {first_subarray.shape}\nSecond subarray is of shape: {second_subarray.shape}")

Sort sub-arrays

In [None]:
# Set axis to sort along
axis_to_sort = 0

# Sort first sub-array in ascending order along the chosen axis
sorted_first_part = np.sort(first_subarray, axis=axis_to_sort)

# Sort second sub-array in descending order: sort ascending, then reverse
# along the same axis. Flipping along axis_to_sort (instead of a fixed
# [::-1], which always reverses axis 0) keeps the result descending even
# if axis_to_sort is changed.
sorted_second_part = np.flip(
    np.sort(second_subarray, axis=axis_to_sort),
    axis=axis_to_sort,
)

Concatenate sub-arrays

In [None]:
# Join the two sorted parts back into one array along axis_to_sort
concatenated_array = np.concatenate([sorted_first_part, sorted_second_part], axis=axis_to_sort)

concatenated_array.shape

Get most common item

In [None]:
# Distinct values of the array and how many times each one occurs
unique_elements, counts = np.unique(concatenated_array, return_counts=True)

# Position of the highest count (the first one when counts are tied)
max_count_index = counts.argmax()

# The value that occurs most often
item_with_max_count = unique_elements[max_count_index]

item_with_max_count

Define a function that conditionally scales the items of a given column (halved when below the column mean, quartered otherwise)

In [None]:
# Rescale one column of an array relative to that column's mean
def process_array(processed_array, processed_column):
    """Halve or quarter the entries of one column, depending on the mean.

    Entries strictly below the column mean are divided by 2; all other
    entries are divided by 4. The mean is computed before any entry is
    changed and is printed with two decimals.

    The array is modified in place: the caller's array is changed and the
    same object is returned.

    Args:
        processed_array: 2-D NumPy array to update.
        processed_column: index of the column to rescale.

    Returns:
        The same ``processed_array`` object, with the column rescaled.
    """
    column = processed_array[:, processed_column]
    mean_value = np.mean(column)

    print(f" Mean value: {mean_value:.2f}")

    # Per-entry divisor: 2 below the mean, 4 otherwise
    divisors = np.where(column < mean_value, 2, 4)
    processed_array[:, processed_column] = column / divisors

    return processed_array

Process 3rd column of array

In [None]:
# Rescale the 3rd column (index 2). process_array assigns into the array
# it receives, so concatenated_array is modified as well.
processed_array = process_array(processed_array=concatenated_array, processed_column=2)

processed_array