In [1]:
import pandas
df = pandas.read_csv('AirfoilSelfNoise.csv')

In [2]:
df.head

<bound method NDFrame.head of          f  alpha       c  U_infinity     delta     SSPL
0      800    0.0  0.3048        71.3  0.002663  126.201
1     1000    0.0  0.3048        71.3  0.002663  125.201
2     1250    0.0  0.3048        71.3  0.002663  125.951
3     1600    0.0  0.3048        71.3  0.002663  127.591
4     2000    0.0  0.3048        71.3  0.002663  127.461
...    ...    ...     ...         ...       ...      ...
1498  2500   15.6  0.1016        39.6  0.052849  110.264
1499  3150   15.6  0.1016        39.6  0.052849  109.254
1500  4000   15.6  0.1016        39.6  0.052849  106.604
1501  5000   15.6  0.1016        39.6  0.052849  106.224
1502  6300   15.6  0.1016        39.6  0.052849  104.204

[1503 rows x 6 columns]>

In [None]:
''' INFORMATION ABOUT THE DATASET  '''

# aim : predict decibel value(SSPL) based on given features (to know how much noise it makes)

#                      INFO ABOUT THE FEATURES

# Feature          Unit              Technical Significance
# Frequency         Hz      "The ""pitch"" of the noise. High frequency = shrill; Low = rumble."
# Angle of Attack   Â°        How much the wing is tilted upward. More tilt = more turbulence.
# Chord Length    Meters     The width of the wing from front to back.
# Velocity          m/s     How fast the air is moving over the wing.
# Displacement    Meters    "The thickness of the ""messy"" air layer at the edge of the wing."

In [3]:
df.isna().sum()

f             0
alpha         0
c             0
U_infinity    0
delta         0
SSPL          0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

In [4]:
from sklearn.model_selection import train_test_split

X = df.drop('SSPL',axis=1)
y = df['SSPL']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
from sklearn.ensemble import RandomForestRegressor

rfr =  RandomForestRegressor(random_state=42)

rfr.fit(X_train,y_train)

y_pred = rfr.predict(X_test)

In [None]:
''' THE SCORE BEFORE OPTIMIZATION '''

In [16]:
from sklearn.metrics import r2_score

print(r2_score(y_test,y_pred))

0.9345545578196172


In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_random_search = {
    'n_estimators': randint(100, 1000), 
    'min_samples_split': randint(2, 10), 
    'max_depth': [None, 10, 20, 30, 40, 50], 
    'max_features': ['sqrt', 'log2', None] 
}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_random_search,       
    cv=10,
    n_iter=20,
    scoring='r2'   
)

random_search.fit(X_train, y_train)

print(f"Best Score: {random_search.best_score_}")
print(f"Best Settings: {random_search.best_params_}")

Best Score: 0.9056345864887095
Best Settings: {'max_depth': 30, 'max_features': None, 'min_samples_split': 3, 'n_estimators': 774}


In [None]:
''' THE SCORE AFTER OPTIMIZATION '''

In [29]:
best_model = random_search.best_estimator_

y_pred_best = best_model.predict(X_test)

print("Tuned Model R2 Score:", r2_score(y_test, y_pred_best))

Tuned Model R2 Score: 0.9097585541546571


In [20]:
import pandas as pd

importances = pd.Series(rfr.feature_importances_, index=X.columns)

importances = importances.sort_values(ascending=False)
print(importances)

f             0.416394
delta         0.406471
c             0.093132
alpha         0.042195
U_infinity    0.041809
dtype: float64


In [None]:
''' AFTER FEATURE IMPORTANCE'''

In [21]:
X = df.drop(['SSPL','U_infinity','alpha'],axis=1)
y = df['SSPL']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [22]:
rfr =  RandomForestRegressor(random_state=42)

rfr.fit(X_train,y_train)

y_pred = rfr.predict(X_test)

In [23]:
from sklearn.metrics import r2_score

print(r2_score(y_test,y_pred))

0.9135050632813029


In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_random_search = {
    'n_estimators': randint(100, 1000), 
    'min_samples_split': randint(2, 10), 
    'max_depth': [None, 10, 20, 30, 40, 50], 
    'max_features': ['sqrt', 'log2', None] 
}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_random_search,       
    cv=10,
    n_iter=40,
    scoring='r2'   
)

random_search.fit(X_train, y_train)

print(f"Best Score: {random_search.best_score_}")
print(f"Best Settings: {random_search.best_params_}")

Best Score: 0.9054526195832203
Best Settings: {'max_depth': 20, 'max_features': None, 'min_samples_split': 3, 'n_estimators': 654}


In [31]:
best_model = random_search.best_estimator_

y_pred_best = best_model.predict(X_test)

print("Tuned Model R2 Score:", r2_score(y_test, y_pred_best))

Tuned Model R2 Score: 0.9094859924723471


In [None]:
'''
AS WE CAN SEE, IT DECREASED AFTER REMOVING  'U_infinity' AND 'alpha' , SO THEY WERE INDEED IMPORTANT.
'''