In [9]:
#Task A
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the wine dataset from the uploaded file
wine_data = pd.read_csv('Wine.csv')

# Dropping the 'Type' column as it's categorical and not used in PCA
wine_features = wine_data.drop(columns=['Type'])

# Standardizing the data
scaler = StandardScaler()
wine_features_scaled = scaler.fit_transform(wine_features)

# Performing PCA
pca = PCA()
pca.fit(wine_features_scaled)

# Extracting the explained variance, proportion of variance, and cumulative proportion
explained_variance = pca.explained_variance_
proportion_variance = pca.explained_variance_ratio_
cumulative_proportion = proportion_variance.cumsum()

# Creating a DataFrame to display the results for Task A
pca_results = pd.DataFrame({
    'Standard Deviation': pca.singular_values_,
    'Proportion of Variance': proportion_variance,
    'Cumulative Proportion': cumulative_proportion
})

# Displaying the first 10 rows of the results for Task A
pca_results.head(10)

# Displaying the results for Task A
print("Task A - PCA Results:")
print(pca_results)

# Task B: Determine the number of principal components required to explain >80% of the variance
num_components_80 = (cumulative_proportion >= 0.80).argmax() + 1
print("\nTask B - Number of components to explain >80% of variance:", num_components_80)

# Task C: Explanation of why PC1 accounts for the most variance
print("\nTask C - Explanation:")
print("The first principal component (PC1) captures the maximum variance as PCA is designed to maximize variance "
      "along the first component. It represents the dominant pattern in the data, hence accounting for the most variance.")

# Task D: Display weights of the first five principal components on original characteristics
loadings = pd.DataFrame(pca.components_[:5].T, columns=[f'PC{i+1}' for i in range(5)], index=wine_features.columns)
print("\nTask D - Weights (loadings) of the first five principal components:")
print(loadings)

# Task E: Identify high-loading and low-loading characteristics for PC1
pc1_loadings = loadings['PC1']
high_loading_characteristics = pc1_loadings[pc1_loadings > 0.3].index.tolist()
low_loading_characteristics = pc1_loadings[pc1_loadings < -0.3].index.tolist()

print("\nTask E - High-loading characteristics on PC1:", high_loading_characteristics)
print("Task E - Low-loading characteristics on PC1:", low_loading_characteristics)

Task A - PCA Results:
    Standard Deviation  Proportion of Variance  Cumulative Proportion
0            28.942034                0.361988               0.361988
1            21.082251                0.192075               0.554063
2            16.043716                0.111236               0.665300
3            12.789736                0.070690               0.735990
4            12.323742                0.065633               0.801623
5            10.687140                0.049358               0.850981
6             9.903688                0.042387               0.893368
7             7.876073                0.026807               0.920175
8             7.170818                0.022222               0.942397
9             6.682862                0.019300               0.961697
10            6.339588                0.017368               0.979066
11            5.480976                0.012982               0.992048
12            4.289670                0.007952               1.00000