In [67]:
# Exercise 2 - - NUMPY and PANDAS FOR DATA ANALYSIS
# Gabriel Kret
# Due: 10/21/2024

#This exercise is about using numpy and pandas to analyze data.

#Part I

In [68]:
import numpy as np
import pandas as pd

In [69]:
def calculate_fluid_statistics(root_dir):
    """
    Calculate statistics for fluid experiments from CSV files.

    This function loads data from fluids.csv, experiments.csv, and fluid_measurements.csv, merges the data, and calculates mean, median, and standard deviation of pressure, velocity, temperature, and flow_rate for each unique fluid.

    Parameters:
    root_dir (str): The root directory containing the CSV files.

    Returns:
    np.array: A structured NumPy array containing the calculated statistics for each fluid.
        The array has the following fields:
        - fluid_id (int): The unique identifier for each fluid.
        - fluid_name (str): The name of the fluid.
        - pressure_mean, pressure_median, pressure_std (float): Statistics for pressure.
        - velocity_mean, velocity_median, velocity_std (float): Statistics for velocity.
        - temperature_mean, temperature_median, temperature_std (float): Statistics for temperature.
        - flow_rate_mean, flow_rate_median, flow_rate_std (float): Statistics for flow rate.
    """
    #read data
    fluids = pd.read_csv(root_dir + '/fluids.csv')
    experiments = pd.read_csv(root_dir + '/experiments.csv')
    fluid_measurements = pd.read_csv(root_dir + '/fluid_measurements.csv')
    
    #merge data
    data = pd.merge(fluid_measurements, experiments, on='experiment_id', how='left')
    data = pd.merge(data, fluids, on='fluid_id', how = 'left')

    #calculate statistics
    stats = data.groupby('fluid_name').agg({
        'pressure': ['mean', 'median', 'std'],
        'velocity': ['mean', 'median', 'std'],
        'temperature': ['mean', 'median', 'std'],
        'flow_rate': ['mean', 'median', 'std'],
    })

    #create array
    result_array = np.array(np.zeros(len(stats)), dtype=[('fluid_id', 'i4'), ('fluid_name', 'U50'), ('pressure_mean', 'f4'), ('pressure_median', 'f4'), ('pressure_std', 'f4'), ('velocity_mean', 'f4'), ('velocity_median', 'f4'), ('velocity_std', 'f4'), ('temperature_mean', 'f4'), ('temperature_median', 'f4'), ('temperature_std', 'f4'), ('flow_rate_mean', 'f4'), ('flow_rate_median', 'f4'), ('flow_rate_std', 'f4')])
    result_array['fluid_id'] = data.groupby('fluid_id')['fluid_id'].first().values
    result_array['fluid_name'] = data.groupby('fluid_id')['fluid_name'].first().values
    result_array['pressure_mean'] = stats['pressure']['mean'].values
    result_array['pressure_median'] = stats['pressure']['median'].values
    result_array['pressure_std'] = stats['pressure']['std'].values
    result_array['velocity_mean'] = stats['velocity']['mean'].values
    result_array['velocity_median'] = stats['velocity']['median'].values
    result_array['velocity_std'] = stats['velocity']['std'].values
    result_array['temperature_mean'] = stats['temperature']['mean'].values
    result_array['temperature_median'] = stats['temperature']['median'].values
    result_array['temperature_std'] = stats['temperature']['std'].values
    result_array['flow_rate_mean'] = stats['flow_rate']['mean'].values
    result_array['flow_rate_median'] = stats['flow_rate']['median'].values
    result_array['flow_rate_std'] = stats['flow_rate']['std'].values

    print(result_array)
      

# Call the function and print the results
result_array = calculate_fluid_statistics(root_dir='exercise_data') # change root_dir to where your data for this exercise is
print(result_array)




[( 1, 'Water', 5.2242642e+03, 5.345930e+03, 2.5031309e+03, 2.49     , 1.99 , 1.8665386 , 49.854286, 51.72 , 23.575453 , 1.9514285e+00,  1.56 , 1.4614768e+00)
 ( 2, 'Air', 7.2733335e+00, 8.050000e+00, 1.6667433e+00, 2.4566667, 1.48 , 2.2130146 , 56.45    , 57.61 ,  5.541814 , 3.3333334e-03,  0.   , 5.7735029e-03)
 ( 3, 'Oil', 1.8758334e+03, 7.946300e+02, 2.0750293e+03, 3.1733334, 4.3  , 2.2703598 , 45.77667 , 46.17 , 25.462278 , 2.7800000e+00,  3.77 , 1.9892461e+00)
 ( 6, 'Mercury', 3.0704099e+03, 1.724200e+03, 2.3592539e+03, 3.7733333, 3.49 , 1.0151026 , 75.56333 , 89.12 , 33.34987  , 3.0366666e+00,  2.81 , 8.1402296e-01)
 ( 7, 'Acetone', 6.0995609e+04, 6.099207e+04, 1.5003267e+04, 2.3575   , 2.79 , 1.157076  , 58.9325  , 53.525, 18.587267 , 3.1934999e+01, 37.79 , 1.5672393e+01)
 ( 8, 'Benzene', 4.7392998e+03, 4.509840e+03, 1.5979779e+03, 2.745    , 2.455, 1.218043  , 47.13375 , 31.215, 32.145294 , 2.3337500e+00,  2.09 , 1.0354977e+00)
 ( 9, 'Propylene Glycol', 4.9899565e+03, 4.203800e

In [70]:
def get_experiments_and_correlation(root_dir, fluid_id):
    """
    Retrieves experiment IDs for a given fluid and calculates the correlation matrix of measurements.

    Parameters:
    root_dir (str): The root directory containing the CSV files.
    fluid_id (int): The ID of the fluid to analyze.

    Returns:
    tuple: A tuple containing two elements:
        - numpy.ndarray: An array of experiment IDs associated with the given fluid_id.
        - pandas.DataFrame: A correlation matrix of pressure, velocity, temperature, and flow_rate for the experiments associated with the given fluid_id.
    """
    #read data
    fluids = pd.read_csv(root_dir + '/fluids.csv')
    experiments = pd.read_csv(root_dir + '/experiments.csv')
    fluid_measurements = pd.read_csv(root_dir + '/fluid_measurements.csv')

    #merge data
    data = pd.merge(fluid_measurements, experiments, on='experiment_id', how='left')
    data = pd.merge(data, fluids, on='fluid_id', how = 'left')

    #get experiment ids
    experiment_ids = data[data['fluid_id'] == fluid_id]['experiment_id'].values

    #calculate correlation matrix
    correlation_matrix = data[data['fluid_id'] == fluid_id][['pressure', 'velocity', 'temperature', 'flow_rate']].corr()

    return np.array(experiment_ids), correlation_matrix
    

# Call the function
root_dir = 'exercise_data' # change root_dir to where your data for this exercise is
fluid_id = 1
experiment_ids, correlation_matrix = get_experiments_and_correlation(root_dir, fluid_id)

# Print results
print(f"Experiment IDs for fluid_id {fluid_id}:")
print(experiment_ids)
print("\nCorrelation Matrix:")
print(correlation_matrix)

Experiment IDs for fluid_id 1:
[13  8]

Correlation Matrix:
             pressure  velocity  temperature  flow_rate
pressure          1.0       1.0         -1.0        1.0
velocity          1.0       1.0         -1.0        1.0
temperature      -1.0      -1.0          1.0       -1.0
flow_rate         1.0       1.0         -1.0        1.0


In [71]:

def create_normalized_fluid_matrix(root_dir):
    """
    Create a normalized 5x3 matrix of fluid properties.

    This function reads fluid data from a CSV file, selects the first 5 fluids, and creates a matrix of their density, viscosity, and specific heat properties. The matrix is then normalized using min-max normalization.

    Parameters:
    root_dir (str): The root directory containing the fluids.csv file.

    Returns:
    numpy.ndarray: A 5x3 normalized matrix where each row represents a fluid and each column represents a normalized property (density, viscosity, specific_heat).
    """
    fluids = pd.read_csv(root_dir + '/fluids.csv')
    fluids = fluids[['density', 'viscosity', 'specific_heat']]
    fluids = fluids.head(5)
    fluids = fluids.to_numpy()

    min_values = np.min(fluids, axis=0)
    max_values = np.max(fluids, axis=0)

    normalized_fluids = (fluids - min_values) / (max_values - min_values)

    return normalized_fluids

# Call the function
root_dir = 'exercise_data' # change root_dir to where your data for this exercise is
result_matrix = create_normalized_fluid_matrix(root_dir)

# Print the result
print("Normalized 5x3 Fluid Property Matrix:")
print(result_matrix)

# Print with column names for clarity
column_names = ['density', 'viscosity', 'specific_heat']
result_df = pd.DataFrame(result_matrix, columns=column_names)
print("\nNormalized Matrix with Column Names:")
print(result_df)

Normalized 5x3 Fluid Property Matrix:
[[7.93449981e-01 6.54607899e-04 1.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.74286509e-01 6.65468030e-03 3.12991507e-01]
 [6.25826697e-01 7.87942841e-04 4.38817238e-01]
 [1.00000000e+00 1.00000000e+00 4.38817238e-01]]

Normalized Matrix with Column Names:
    density  viscosity  specific_heat
0  0.793450   0.000655       1.000000
1  0.000000   0.000000       0.000000
2  0.674287   0.006655       0.312992
3  0.625827   0.000788       0.438817
4  1.000000   1.000000       0.438817


In [72]:
import pandas as pd
import numpy as np

def analyze_fluid_properties(root_dir):
    """
    Analyze fluid properties from a CSV file.

    This function reads fluid data from a CSV file, selects the first 5 fluids,
    normalizes their properties, calculates the correlation matrix, and performs
    eigenvalue decomposition.

    Parameters:
    root_dir (str): The root directory containing the fluids.csv file.

    Returns:
    tuple: A tuple containing:
        - normalized_matrix (numpy.ndarray): A 5x3 normalized matrix of fluid properties.
        - correlation_matrix (numpy.ndarray): The correlation matrix of the normalized data.
        - eigenvalues (numpy.ndarray): The eigenvalues of the correlation matrix.
        - eigenvectors (numpy.ndarray): The eigenvectors of the correlation matrix.
    """
    #load data
    fluids = pd.read_csv(root_dir + '/fluids.csv')
    fluids = fluids[['density', 'viscosity', 'specific_heat']]
    fluids = fluids.head(5)
    fluids = fluids.to_numpy()

    #normalize data
    min_values = np.min(fluids, axis=0)
    max_values = np.max(fluids, axis=0)

    normalized_matrix = (fluids - min_values) / (max_values - min_values)

    #calculate correlation matrix 
    correlation_matrix = np.corrcoef(normalized_matrix, rowvar=False)

    #eigenvalue decomposition
    eigenvalues, eigenvectors = np.linalg.eig(correlation_matrix)

    return normalized_matrix, correlation_matrix, eigenvalues, eigenvectors


# Call the function
root_dir = 'exercise_data' # change root_dir to where your data for this exercise is
normalized_matrix, correlation_matrix, eigenvalues, eigenvectors = analyze_fluid_properties(root_dir)

# Print results
print("Normalized 5x3 Fluid Property Matrix:")
print(normalized_matrix)

print("\nCorrelation Matrix:")
print(correlation_matrix)

print("\nEigenvalues:")
print(eigenvalues)

print("\nEigenvectors:")
print(eigenvectors)

# Interpret the results
total_variance = np.sum(eigenvalues)
explained_variance_ratio = eigenvalues / total_variance

print("\nExplained Variance Ratio:")
for i, ratio in enumerate(explained_variance_ratio):
    print(f"Principal Component {i+1}: {ratio:.4f}")

# Determine which property or combination explains the most variance
properties = ['density', 'viscosity', 'specific_heat']
max_component = np.argmax(np.abs(eigenvectors[:, 0]))
max_contribution = eigenvectors[max_component, 0]

print(f"\nThe property that contributes most to the first principal component is: {properties[max_component]}")
print(f"Its contribution is: {max_contribution:.4f}")

# If the contribution is not overwhelmingly large, print the combination
if max_contribution < 0.8:
    print("\nThe combination of properties that explains the most variance is:")
    for i, prop in enumerate(properties):
        print(f"{prop}: {eigenvectors[i, 0]:.4f}")

Normalized 5x3 Fluid Property Matrix:
[[7.93449981e-01 6.54607899e-04 1.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.74286509e-01 6.65468030e-03 3.12991507e-01]
 [6.25826697e-01 7.87942841e-04 4.38817238e-01]
 [1.00000000e+00 1.00000000e+00 4.38817238e-01]]

Correlation Matrix:
[[1.00000000e+00 5.70589790e-01 6.68644167e-01]
 [5.70589790e-01 1.00000000e+00 3.52574102e-04]
 [6.68644167e-01 3.52574102e-04 1.00000000e+00]]

Eigenvalues:
[1.87918363 0.12116456 0.99965181]

Eigenvectors:
[[ 7.07036753e-01  7.07176800e-01  6.30797704e-05]
 [ 4.59082256e-01 -4.58923489e-01 -7.60679114e-01]
 [ 5.37905672e-01 -5.37857049e-01  6.49128094e-01]]

Explained Variance Ratio:
Principal Component 1: 0.6264
Principal Component 2: 0.0404
Principal Component 3: 0.3332

The property that contributes most to the first principal component is: density
Its contribution is: 0.7070

The combination of properties that explains the most variance is:
density: 0.7070
viscosity: 0.4591
specific_h

In [73]:
import pandas as pd
import numpy as np

def cosine_similarity(v1, v2):
    #Calculate cosine similarity between two vectors
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def find_most_similar_fluids(root_dir):
    """
    Find the two most similar fluids based on their properties.

    This function loads fluid data from a CSV file, computes the cosine similarity between each pair of fluids based on their density, viscosity, and specific heat,  and returns the two most similar fluids along with their similarity score.

    Parameters:
    root_dir (str): The directory path where the fluids.csv file is located.

    Returns:
    tuple: A tuple containing three elements:
        - fluid1 (pandas.Series): The first fluid of the most similar pair.
        - fluid2 (pandas.Series): The second fluid of the most similar pair.
        - max_similarity (float): The cosine similarity between the two most similar fluids.
    """
    # Load fluid data
    fluids = pd.read_csv(root_dir + '/fluids.csv')

    # Select properties for comparison
    properties = ['density', 'viscosity', 'specific_heat']
    fluid_properties = fluids[properties].to_numpy()

    # Calculate cosine similarity between each pair of fluids
    similarities = []
    for i in range(len(fluids)):
        for j in range(i+1, len(fluids)):
            similarity = cosine_similarity(fluid_properties[i], fluid_properties[j])
            similarities.append((i, j, similarity))

    # Find the two most similar fluids
    max_similarity = max(similarities, key=lambda x: x[2])
    fluid1 = fluids.iloc[max_similarity[0]]
    fluid2 = fluids.iloc[max_similarity[1]]

    return fluid1, fluid2, max_similarity[2]

# Call the function
root_dir = 'exercise_data' # change root_dir to where your data for this exercise is
fluid1, fluid2, similarity = find_most_similar_fluids(root_dir)

# Print results
print(f"The two most similar fluids are:")
print(f"1. {fluid1['fluid_name']} (ID: {fluid1['fluid_id']})")
print(f"2. {fluid2['fluid_name']} (ID: {fluid2['fluid_id']})")
print(f"Cosine similarity: {similarity:.4f}")

print("\nTheir properties are:")
print(f"{'Property':<15} {'Fluid 1':<15} {'Fluid 2':<15}")
print("-" * 45)
for prop in ['density', 'viscosity', 'specific_heat']:
    print(f"{prop:<15} {fluid1[prop]:<15.4f} {fluid2[prop]:<15.4f}")

The two most similar fluids are:
1. Oil (ID: 3)
2. Propylene Glycol (ID: 9)
Cosine similarity: 0.9998

Their properties are:
Property        Fluid 1         Fluid 2        
---------------------------------------------
density         850.0000        1030.0000      
viscosity       10.0000         60.0000        
specific_heat   2000.0000       2500.0000      
