# BIG DATA

## Task 1

### 1.1

In [None]:
# IMPORT
import pandas as pd
import numpy as np

# LOAD DATA
# Both CSV files use ';' as the field separator; read them with the same call.
data, labels = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('iris_data.csv', 'iris_labels.csv')
)

In [None]:
# PRINT SAMPLES AND SHAPE INFORMATION
# For each loaded frame, report the explicit (rows, columns) shape and show only
# the first few rows instead of dumping the whole table.
for frame_name, frame in (("data", data), ("labels", labels)):
    print(f"{frame_name}: shape = {frame.shape}")
    print(frame.head())

### 1.2

In [None]:
# MERGE THE DATA SETS
# Inner join on the shared "id" column: only ids present in both frames survive.
data = data.merge(labels, how="inner", on="id")

### 1.3

In [None]:
# REMOVE EXAMINER COLUMN/FEATURE
# Rebind the result instead of using inplace=True, so the merged frame object is
# not mutated and the statement stays chainable.
data = data.drop(columns=['examiner'])

### 1.4

In [None]:
# SORT BY SPECIES/NAME
# ignore_index is not set, so every row keeps its original index label.
data = data.sort_values(by=['species'])

### 1.5

In [None]:
# IMPORTS
import seaborn as sns

# PAIRPLOT
# Pairwise plots of the columns in `data`, coloured by species.
sns.pairplot(data=data, hue="species")


## Task 2

### 2.1

In [None]:
# DETERMINE CLASS DISTRIBUTION
# Count how many rows belong to each species, then print the counts.
species_counts = data['species'].value_counts()
print(species_counts)

## Task 3

### 3.1

In [None]:
# FILTER OUT FLAGGED MISSING VALUES
# -9999 is the flag being filtered; a row is kept only when none of the four
# measurement columns carries it.
not_flagged = (data[["sl", "sw", "pl", "pw"]] != -9999).all(axis=1)
data = data[not_flagged]

In [None]:
# PAIR PLOT after the -9999 rows have been removed, coloured by species
sns.pairplot(data=data, hue="species")

### 3.2

In [None]:
# IMPORT
from scipy.stats import zscore

# COLUMNS HOLDING THE MEASUREMENTS TO CHECK
feature_columns = ['pl', 'pw', 'sl', 'sw']

# Z-SCORE OF EVERY VALUE, COMPUTED COLUMN BY COLUMN
z_scores = data[feature_columns].apply(zscore)

# A ROW IS AN OUTLIER AS SOON AS ONE OF ITS FEATURES HAS |z| > 3
outlier_mask = z_scores.abs().gt(3).any(axis=1)

# KEEP ONLY THE ROWS THAT ARE NOT OUTLIERS
data = data.loc[~outlier_mask]

In [None]:
# PAIR PLOT
# Same plot again, now on the data without the z-score outliers.
sns.pairplot(data=data, hue="species")


## Task 4


### 4.1


In [None]:
# IMPORT
from sklearn.preprocessing import MinMaxScaler

# MINMAX SCALE
# Fit a default MinMaxScaler on the four measurement columns and transform them.
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(data[["sl", "sw", "pl", "pw"]])

# Mean and standard deviation of the first scaled column ("sl")
first_minmax_column = minmax_scaled[:, 0]
print(first_minmax_column.mean())
print(first_minmax_column.std())

### 4.2

In [None]:
# IMPORT
from sklearn.preprocessing import StandardScaler

# Z SCALE
# Fit a default StandardScaler on the four measurement columns and transform them.
standard_scaler = StandardScaler()
sd_scaled = standard_scaler.fit_transform(data[["sl", "sw", "pl", "pw"]])

# Mean and standard deviation of the first scaled column ("sl")
first_sd_column = sd_scaled[:, 0]
print(first_sd_column.mean())
print(first_sd_column.std())

### 4.3


In [None]:
# IMPORT
from sklearn.decomposition import PCA

# PCA ON THE FOUR MEASUREMENT COLUMNS (used as they are, no scaling applied here)
measurements = data[["sl", "sw", "pl", "pw"]]
pca = PCA()
principal_components = pca.fit_transform(measurements)


In [None]:
# PRINT EXPLAINED VARIANCE FOR EACH COMPONENT
variance_ratios = pca.explained_variance_ratio_
print("Explained variance ratio for each component:")
for pc_number, share in enumerate(variance_ratios, start=1):
    print(f"PC{pc_number}: {share:.4f}")

# CUM-SUM: running total of the explained variance ratios
cumsum_variance = np.cumsum(variance_ratios)
print("\nCumulative explained variance:")
for pc_number, running_total in enumerate(cumsum_variance, start=1):
    print(f"PC{pc_number}: {running_total:.4f}")

# FIND NUMBER OF COMPONENTS NEEDED FOR AT LEAST 95% variance
# argmax of the boolean array gives the 0-based position of the first True,
# hence the +1 to turn it into a component count.
reaches_95 = cumsum_variance >= 0.95
n_components_95 = np.argmax(reaches_95) + 1
print(f"\nNumber of components needed to explain at least 95% of variance: {n_components_95}")

### 4.4

In [None]:
# Component loadings as a table: one row per principal component, one column per
# original feature (same order as the columns PCA was fitted on).
loadings = pd.DataFrame(
    pca.components_,
    index=['PC 1', 'PC 2', 'PC 3', 'PC 4'],
    columns=["Sepal L", "Sepal W", "Petal L", "Petal W"],
)

# Mean absolute loading of each feature across the four components
loadings.abs().mean(axis=0)

### 4.5

In [None]:
# Columns fed into PCA
features = ["sl", "sw", "pl", "pw"]

# Work on a copy so that `data` itself stays untouched
data_rescaled = data.copy()

# Stretch Petal Length (pl) onto the interval [0, 100]; the other columns are unchanged
scaler = MinMaxScaler(feature_range=(0, 100))
data_rescaled["pl"] = scaler.fit_transform(data_rescaled[["pl"]])

# Fit a fresh PCA on the rescaled copy
pca_rescaled = PCA().fit(data_rescaled[features])

print("Explained variance ratio for each component:")
for pc_number, share in enumerate(pca_rescaled.explained_variance_ratio_, start=1):
    print(f"PC{pc_number}: {share:.4f}")

# Loadings table: one row per component, one column per feature
contribution_rescaled = pd.DataFrame(
    data=pca_rescaled.components_,
    index=["PC 1", "PC 2", "PC 3", "PC 4"],
    columns=features,
)

# Mean absolute loading of each feature across all components
attribute_contribution_rescaled = contribution_rescaled.abs().mean(axis=0)
print(attribute_contribution_rescaled)

### 4.6

In [None]:
# Copy dataset so original isn't changed
data_outlier = data.copy()

# Add an outlier: set petal length (pl) of the first record to 5000.
# Positional indexing is needed here: `data` has been sorted and filtered, so the
# index label 0 need not be the first row and may not exist at all. In that case
# `.loc[0, "pl"] = ...` would append a new row that is NaN in every other column,
# and fitting PCA on it would fail.
data_outlier.iloc[0, data_outlier.columns.get_loc("pl")] = 5000

# Apply PCA again
features = ["sl", "sw", "pl", "pw"]
pca_outlier = PCA()
pca_outlier.fit(data_outlier[features])

print("Explained variance ratio for each component:")
for i, ratio in enumerate(pca_outlier.explained_variance_ratio_):
    print(f"PC{i + 1}: {ratio:.4f}")

# Create loadings DataFrame (one row per component, one column per feature)
contribution_outlier = pd.DataFrame(
    pca_outlier.components_,
    columns=features,
    index=["PC 1", "PC 2", "PC 3", "PC 4"]
)

# Average absolute contribution of each feature across all components
attribute_contribution_outlier = contribution_outlier.abs().mean(axis=0)
print(attribute_contribution_outlier)

## Task 5

### 5.1


In [None]:
# Simple random sample of 150 rows without replacement (each row at most once).
# The fixed random_state makes the draw reproducible on every re-run.
sample = data.sample(n=150, random_state=42)

### 5.2

In [None]:
# Random sample of 150 rows with replacement (a row may be drawn more than once).
# The fixed random_state makes the draw reproducible on every re-run.
sample = data.sample(n=150, replace=True, random_state=42)

### 5.3

In [None]:
# Stratified sample: draw 50% of the rows of every species.
# The built-in group-wise sampler replaces the per-group `apply(lambda ...)`,
# keeps the species column in the result, and takes a seed for reproducibility.
sample = data.groupby('species').sample(frac=0.5, random_state=42)

### 5.4

In [None]:
# Stratified sample with a fixed size: draw 50 rows from every species.
# The built-in group-wise sampler replaces the per-group `apply(lambda ...)`,
# keeps the species column in the result, and takes a seed for reproducibility.
# Sampling is without replacement, so each species needs at least 50 rows.
sample = data.groupby('species').sample(n=50, random_state=42)

TASK 1:

a. int64
b.