In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the full Au nanoparticle dataset from the Colab working directory.
df = pd.read_csv("/content/Au_nanoparticle_dataset.csv")

print("Original dataset shape:", df.shape)
print("Original dataset columns:", list(df.columns))

# Task 2.1: keep only the four features used by the remaining Task 2 cells.
# .copy() gives new_df its own data, independent of the raw frame.
selected_columns = ["N_total", "N_bulk", "N_surface", "R_avg"]
new_df = df.loc[:, selected_columns].copy()

print("\nTask 2.1: New dataframe with filtered columns")
print("New dataframe shape:", new_df.shape)
print("New dataframe columns:", list(new_df.columns))

Original dataset shape: (4000, 185)
Original dataset columns: ['ID', 'T', 'tau', 'time', 'N_total', 'N_bulk', 'N_surface', 'Volume', 'R_min', 'R_max', 'R_diff', 'R_avg', 'R_std', 'R_skew', 'R_kurt', 'S_100', 'S_111', 'S_110', 'S_311', 'Curve_1-10', 'Curve_11-20', 'Curve_21-30', 'Curve_31-40', 'Curve_41-50', 'Curve_51-60', 'Curve_61-70', 'Curve_71-80', 'Curve_81-90', 'Curve_91-100', 'Curve_101-110', 'Curve_111-120', 'Curve_121-130', 'Curve_131-140', 'Curve_141-150', 'Curve_151-160', 'Curve_161-170', 'Curve_171-180', 'Avg_total', 'Avg_bulk', 'Avg_surf', 'TCN_0', 'TCN_1', 'TCN_2', 'TCN_3', 'TCN_4', 'TCN_5', 'TCN_6', 'TCN_7', 'TCN_8', 'TCN_9', 'TCN_10', 'TCN_11', 'TCN_12', 'TCN_13', 'TCN_14', 'TCN_15', 'TCN_16', 'TCN_17', 'TCN_18', 'TCN_19', 'TCN_20', 'BCN_0', 'BCN_1', 'BCN_2', 'BCN_3', 'BCN_4', 'BCN_5', 'BCN_6', 'BCN_7', 'BCN_8', 'BCN_9', 'BCN_10', 'BCN_11', 'BCN_12', 'BCN_13', 'BCN_14', 'BCN_15', 'BCN_16', 'BCN_17', 'BCN_18', 'BCN_19', 'BCN_20', 'SCN_0', 'SCN_1', 'SCN_2', 'SCN_3', 'SCN_4

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): this cell carries execution count 2 but sits after the cell
# numbered 3, and the dataset is read from /content/Au_nanoparticle_dataset.csv,
# which is outside this mount point — confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Task 2.2: preview the first 20 rows of the filtered frame.
print("\nTask 2.2: First 20 samples of the filtered dataframe")
first_twenty = new_df.iloc[:20]
print(first_twenty)


Task 2.2: First 20 samples of the filtered dataframe
    N_total  N_bulk  N_surface    R_avg
0      1599    1014        585  17.3706
1      1642    1034        608  17.6061
2      4637    3365       1272  25.3692
3      7189    5292       1897  29.7011
4     11004    8508       2496  34.2831
5     13375   10768       2607  36.6334
6     13795   11155       2640  37.0108
7     13947   11304       2643  37.1672
8     14020   11357       2663  37.2103
9     14056   11389       2667  37.2467
10     1534     966        568  17.1107
11     1559     974        585  17.2101
12     4356    3099       1257  24.8253
13     6550    4675       1875  29.0376
14    10175    7748       2427  33.4399
15    12393    9842       2551  35.6950
16    13207   10590       2617  36.4657
17    13543   10937       2606  36.7871
18    13713   11079       2634  36.9333
19    13791   11151       2640  37.0104


In [5]:
# Task 2.3: descriptive statistics for the four selected features.
print("\nTask 2.3: Statistical summary of the 4 features")

# Each entry pairs a heading with a zero-argument callable, so every statistic
# is computed only at the moment its section is printed.
summary_sections = [
    ("\nMean values:", new_df.mean),
    ("\nStandard deviation values:", new_df.std),
    ("\nQuartile values (25%, 50%, 75%):", lambda: new_df.quantile([0.25, 0.5, 0.75])),
    ("\nComplete statistical description:", new_df.describe),
]
for heading, compute in summary_sections:
    print(heading)
    print(compute())


Task 2.3: Statistical summary of the 4 features

Mean values:
N_total      3476.786500
N_bulk       2521.550250
N_surface     955.236250
R_avg          20.654363
dtype: float64

Standard deviation values:
N_total      3679.286769
N_bulk       2976.232459
N_surface     721.870220
R_avg           7.610716
dtype: float64

Quartile values (25%, 50%, 75%):
      N_total   N_bulk  N_surface      R_avg
0.25   1061.0   618.75     437.00  15.160725
0.50   1867.0  1199.00     666.00  18.629250
0.75   4503.0  3183.00    1301.75  25.525125

Complete statistical description:
            N_total        N_bulk   N_surface        R_avg
count   4000.000000   4000.000000  4000.00000  4000.000000
mean    3476.786500   2521.550250   955.23625    20.654363
std     3679.286769   2976.232459   721.87022     7.610716
min      236.000000     89.000000   137.00000     8.528600
25%     1061.000000    618.750000   437.00000    15.160725
50%     1867.000000   1199.000000   666.00000    18.629250
75%     4503.0000

In [7]:
# Task 2.4: one histogram per feature, laid out side by side in a 1x4 grid.
features = ["N_total", "N_bulk", "N_surface", "R_avg"]
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, feature in zip(axes, features):
    ax.hist(new_df[feature], bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'Histogram of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig("task2_histograms.png", dpi=300, bbox_inches="tight")
# Close the figure once it has been written to disk.
plt.close(fig)
print("\nTask 2.4: Histograms saved as 'task2_histograms.png'")


Task 2.4: Histograms saved as 'task2_histograms.png'


In [8]:
# Task 2.5: default seaborn pairplot of the four features.
# Keep the returned grid so the figure can be saved through it directly.
pair_grid = sns.pairplot(new_df)
pair_grid.savefig("task2_pairplot.png", dpi=300, bbox_inches="tight")
plt.close()
print("\nTask 2.5: Pairplot saved as 'task2_pairplot.png'")



Task 2.5: Pairplot saved as 'task2_pairplot.png'


In [9]:
# Task 2.6: PairGrid with a different plot type in each region.
print("\nTask 2.6: Creating modified PairGrid plot")

# Upper triangle: histograms; diagonal: histograms with a KDE overlay;
# lower triangle: KDE plots. The mapping order matches the original cell.
g = (
    sns.PairGrid(new_df)
    .map_upper(sns.histplot)
    .map_diag(sns.histplot, kde=True)
    .map_lower(sns.kdeplot)
)

g.savefig("task2_modified_pairplot.png", dpi=300, bbox_inches="tight")
plt.close()
print("Modified PairGrid plot saved as 'task2_modified_pairplot.png'")

print("\nTask 2 completed successfully!")
print("All plots have been saved as PNG files.")


Task 2.6: Creating modified PairGrid plot
Modified PairGrid plot saved as 'task2_modified_pairplot.png'

Task 2 completed successfully!
All plots have been saved as PNG files.
