# Project 1: 

## Assignment Scenario 

You are a data scientist at NASA. You are given a NASA data set obtained from a series of aerodynamic and acoustic tests of two and three-dimensional airfoil blade sections conducted in an anechoic wind tunnel. The dataset contains the following features/predictors and label/target.
This dataset has the following features:

Frequency, in Hertz
The angle of attack, in degrees
Chord length, in meters
Free-stream velocity, in meters per second
Suction side displacement thickness, in meters
The only target/label is:

Scaled sound pressure level, in decibels
You are asked to perform the following tasks by using the attached Jupyter Notebook, writing a script in Python, and running it on all the cells. You only need to submit a JupyterNotebook.

In [1]:
# Import necessary libraries for data handling, preprocessing, model building, and evaluation

# Data handling and preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# TensorFlow for deep learning models
import tensorflow as tf
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# PyTorch for alternative deep learning model
import torch
import torch.nn as nn
import torch.optim as optim

# Plotting and visualization
import matplotlib.pyplot as plt

# Set random seed for reproducibility
import random
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)
random.seed(42)

# Display configuration for plots
%matplotlib inline
plt.style.use('seaborn')


  plt.style.use('seaborn')


### Question 1: Load the Dataset

In [2]:
df = pd.read_csv('/Users/matthewmoore/Downloads/airfoil_self_noise.dat', sep= '\t', header= None)

df.head(5)

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [3]:
df.columns = ['Frequency', 'Attack_Angle', 'Chord_Length', 'Free_Stream_Velocity', 'SSDT', 'SSP']

df.head()

Unnamed: 0,Frequency,Attack_Angle,Chord_Length,Free_Stream_Velocity,SSDT,SSP
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


### Question 2: Clean the Data and Check for Missing Values

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Frequency             1503 non-null   int64  
 1   Attack_Angle          1503 non-null   float64
 2   Chord_Length          1503 non-null   float64
 3   Free_Stream_Velocity  1503 non-null   float64
 4   SSDT                  1503 non-null   float64
 5   SSP                   1503 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 70.6 KB


In [5]:
df.isnull().sum()

Frequency               0
Attack_Angle            0
Chord_Length            0
Free_Stream_Velocity    0
SSDT                    0
SSP                     0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,Frequency,Attack_Angle,Chord_Length,Free_Stream_Velocity,SSDT,SSP
count,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0
mean,2886.380572,6.782302,0.136548,50.860745,0.01114,124.835943
std,3152.573137,5.918128,0.093541,15.572784,0.01315,6.898657
min,200.0,0.0,0.0254,31.7,0.000401,103.38
25%,800.0,2.0,0.0508,39.6,0.002535,120.191
50%,1600.0,5.4,0.1016,39.6,0.004957,125.721
75%,4000.0,9.9,0.2286,71.3,0.015576,129.9955
max,20000.0,22.2,0.3048,71.3,0.058411,140.987


### Question 3: Split the Dataset and Build Simple LR

In [7]:
X = df.drop(columns = ['SSP'])

y = df[['SSP']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (1202, 5)
Testing set shape: (301, 5)


In [9]:
model = LinearRegression()

# Fit the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error (MSE) on the test data
mse = mean_squared_error(y_test, y_pred)

# Print the MSE to evaluate the model's performance
print(f"Mean Squared Error on Test Data: {mse}")

Mean Squared Error on Test Data: 22.128643318247285


### Question 4: Preprocessing the Data

In [10]:
# Initialize the MinMaxScaler to scale all features between [0, 1]
scaler = MinMaxScaler()

# Fit the scaler on the features (X) and transform the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled training data
print(X_train_scaled[:5])

[[0.04040404 0.37837838 0.09090909 1.         0.08437176]
 [0.09090909 0.3018018  0.27272727 1.         0.07554131]
 [0.01010101 0.55405405 0.27272727 0.6010101  0.62786123]
 [0.01010101 0.78378378 0.         1.         0.27069731]
 [0.01010101 0.42792793 0.         0.         0.07262615]]


### Question 5: Build a Deep Learning Regression Model w/ TensorFlow

In [11]:
model = Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),  # First hidden layer with 64 neurons
    layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')


In [12]:
# Train the model on the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Print the MSE
print(f"Mean Squared Error on Test Data: {mse}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error on Test Data: 697.8273903296347


In [13]:
# Print the MSE
print(f"Mean Squared Error on Test Data: {mse}")

Mean Squared Error on Test Data: 697.8273903296347


### Question 6: Can this Model be Improved?

In [14]:
model2 = Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),  # First hidden layer with 64 neurons
    layers.Dense(1)  # Output layer for regression
])

# Compile the model
model2.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

In [15]:
# Train the model on the training data
model2.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

# Make predictions on the test data
y_pred = model2.predict(X_test_scaled)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Print the MSE
print(f"Mean Squared Error on Test Data: {mse}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error on Test Data: 366.901510965841


In [16]:
print(f"Mean Squared Error on Test Data: {mse}")

Mean Squared Error on Test Data: 366.901510965841


### Question 7: Build A Deep Learning Model w/ PyTorch

In [17]:
class LinearModel(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(LinearModel, self).__init__()
        self.hidden = nn.Linear(in_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        x = torch.relu(self.hidden(x))
        out = self.linear(x)
        return out

model3 = LinearModel(in_dim = 5, hidden_dim = 32, out_dim = 1)

In [18]:
learnRate = 0.001
optimizer = torch.optim.Adam(model3.parameters(), lr=learnRate)
criterion = nn.MSELoss()

In [19]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)  # (1202, 5)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)  # (1202, 1)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

In [20]:
epochs = 1000
for epoch in range(epochs):
    epoch += 100
    inputs = X_train_tensor
    labels = y_train_tensor
    out = model3(inputs)
    optimizer.zero_grad()
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    predicted = model3.forward(X_train_tensor)
    print('Epoch [{}], Loss [{}]'.format(epoch + 1, loss.item()))

Epoch [101], Loss [13716.1787109375]
Epoch [102], Loss [10755.0556640625]
Epoch [103], Loss [8676.005859375]
Epoch [104], Loss [7451.0556640625]
Epoch [105], Loss [6988.34033203125]
Epoch [106], Loss [7103.296875]
Epoch [107], Loss [7526.44921875]
Epoch [108], Loss [7975.70068359375]
Epoch [109], Loss [8254.009765625]
Epoch [110], Loss [8291.3857421875]
Epoch [111], Loss [8117.79345703125]
Epoch [112], Loss [7813.6904296875]
Epoch [113], Loss [7470.84033203125]
Epoch [114], Loss [7168.529296875]
Epoch [115], Loss [6960.51416015625]
Epoch [116], Loss [6868.9765625]
Epoch [117], Loss [6884.5068359375]
Epoch [118], Loss [6972.52490234375]
Epoch [119], Loss [7085.5498046875]
Epoch [120], Loss [7178.10400390625]
Epoch [121], Loss [7219.185546875]
Epoch [122], Loss [7198.31591796875]
Epoch [123], Loss [7124.40673828125]
Epoch [124], Loss [7019.4931640625]
Epoch [125], Loss [6910.43603515625]
Epoch [126], Loss [6821.02734375]
Epoch [127], Loss [6766.23046875]
Epoch [128], Loss [6749.437011718

In [21]:
# Evaluate the model on test data
model3.eval()
with torch.no_grad():
    y_pred_test = model3(X_test_tensor)
mse_test = mean_squared_error(y_test, y_pred_test.numpy())
print(f"Test MSE for two-layer model: {mse_test}")

Test MSE for two-layer model: 1236.8960951191564


### Question 8: Can This Model Be Improved?

In [22]:
# Initialize the model with the correct input, hidden, and output dimensions
model4 = LinearModel(in_dim=5, hidden_dim=128, out_dim=1)

optimizer2 = torch.optim.Adam(model4.parameters(), lr=0.001)

In [23]:
epochs = 1000
for epoch in range(epochs):
    epoch += 100
    inputs = X_train_tensor
    labels = y_train_tensor
    out = model4(inputs)
    optimizer2.zero_grad()
    loss = criterion(out, labels)
    loss.backward()
    optimizer2.step()
    predicted = model4.forward(X_train_tensor)
    print('Epoch [{}], Loss [{}]'.format(epoch + 1, loss.item()))

Epoch [101], Loss [109918.2265625]
Epoch [102], Loss [70409.8125]
Epoch [103], Loss [40881.01171875]
Epoch [104], Loss [21187.294921875]
Epoch [105], Loss [10676.8154296875]
Epoch [106], Loss [7958.3828125]
Epoch [107], Loss [10789.35546875]
Epoch [108], Loss [16330.8115234375]
Epoch [109], Loss [21844.8046875]
Epoch [110], Loss [25444.875]
Epoch [111], Loss [26389.6796875]
Epoch [112], Loss [24877.26953125]
Epoch [113], Loss [21646.353515625]
Epoch [114], Loss [17624.2734375]
Epoch [115], Loss [13687.7109375]
Epoch [116], Loss [10518.013671875]
Epoch [117], Loss [8522.4248046875]
Epoch [118], Loss [7806.93115234375]
Epoch [119], Loss [8198.876953125]
Epoch [120], Loss [9319.2158203125]
Epoch [121], Loss [10693.654296875]
Epoch [122], Loss [11876.177734375]
Epoch [123], Loss [12550.2890625]
Epoch [124], Loss [12581.7890625]
Epoch [125], Loss [12016.9033203125]
Epoch [126], Loss [11037.7275390625]
Epoch [127], Loss [9895.4755859375]
Epoch [128], Loss [8841.162109375]
Epoch [129], Loss [

In [24]:
# Evaluate the model on test data
model4.eval()
with torch.no_grad():
    y_pred_test = model4(X_test_tensor)
mse_test = mean_squared_error(y_test, y_pred_test.numpy())
print(f"Test MSE for two-layer model: {mse_test}")

Test MSE for two-layer model: 1232.2823729348224
