# Assignment 3: Data Science and Big Data Analysis (COSC 5340)

#                           Richard Johnson L20455045 


"Airfoil Self-Noise Data Set"

Data Set Information:
    
The NASA data set comprises different size NACA 0012 airfoils at various wind tunnel speeds and angles of attack. The span of the airfoil and the observer position were the same in all of the experiments.



Attribute Information:

This problem has the following inputs:
1. Frequency, in Hertz (numeric attribute)
2. Angle of attack, in degrees (numeric attribute)
3. Chord length, in meters (numeric attribute)
4. Free-stream velocity, in meters per second (numeric attribute)
5. Suction side displacement thickness, in meters (numeric attribute)
The only output is:
6. Scaled sound pressure level, in decibels (numeric attribute)


In [1]:
#Load necessary libraries
import numpy as np
import pandas as pd

In [13]:
#Load dataset
# The file has no header row and its fields are separated by whitespace.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv("airfoil_self_noise.dat", sep=r"\s+", header=None)

In [14]:
# Preview the first five rows; the columns still carry the default integer
# labels because the file was read with header=None.
df.head()

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [15]:
# Give the columns descriptive names, listed in the same order as the
# attributes described at the top of the notebook.
df.columns = [
    'frequency',
    'aoa',
    'chord_length',
    'velocity',
    'displacement',
    'SPL',
]

In [16]:
# Summarise the frame: number of rows, per-column non-null counts, dtypes
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   frequency     1503 non-null   int64  
 1   aoa           1503 non-null   float64
 2   chord_length  1503 non-null   float64
 3   velocity      1503 non-null   float64
 4   displacement  1503 non-null   float64
 5   SPL           1503 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 70.6 KB


In [17]:
# Count the missing (null) entries in each column.
df.isnull().sum()

frequency       0
aoa             0
chord_length    0
velocity        0
displacement    0
SPL             0
dtype: int64

In [18]:
# Preview the first five rows of df.
df.head()

Unnamed: 0,frequency,aoa,chord_length,velocity,displacement,SPL
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [19]:
# Min-max scale every column: (value - column min) / (column max - column min).
# df.min() / df.max() are per-column reductions, so the arithmetic is applied
# column by column.
df_norm = (df - df.min()) / (df.max() - df.min())

In [21]:
# Preview the first five rows of the min-max scaled frame.
df_norm.head()

Unnamed: 0,frequency,aoa,chord_length,velocity,displacement,SPL
0,0.030303,0.0,1.0,1.0,0.039005,0.606829
1,0.040404,0.0,1.0,1.0,0.039005,0.580238
2,0.05303,0.0,1.0,1.0,0.039005,0.600181
3,0.070707,0.0,1.0,1.0,0.039005,0.64379
4,0.090909,0.0,1.0,1.0,0.039005,0.640333


In [22]:
# Save the scaled frame as CSV: column names are written, the row index is not.
df_norm.to_csv(path_or_buf="df_norm.csv", header=True, index=False)

# This dataset has no missing values and all attributes are numerical
No preprocessing techniques are required for this dataset.
Measures of statistical dispersion can be computed directly.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# column of df.
df.describe()

Unnamed: 0,frequency,aoa,chord_length,velocity,displacement,SPL
count,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0
mean,2886.380572,6.782302,0.136548,50.860745,0.01114,124.835943
std,3152.573137,5.918128,0.093541,15.572784,0.01315,6.898657
min,200.0,0.0,0.0254,31.7,0.000401,103.38
25%,800.0,2.0,0.0508,39.6,0.002535,120.191
50%,1600.0,5.4,0.1016,39.6,0.004957,125.721
75%,4000.0,9.9,0.2286,71.3,0.015576,129.9955
max,20000.0,22.2,0.3048,71.3,0.058411,140.987


# Count of some categorical attributes

In [16]:
# Number of rows for each distinct frequency value, ordered by count.
# NOTE(review): value_counts() orders by count, so the sort_values() before it
# looks redundant - confirm and drop.
df['frequency'].sort_values().value_counts()

2000     105
2500     104
1600     103
3150     103
4000     102
1250     100
1000      99
800       97
5000      95
6300      89
630       88
500       78
400       69
315       56
8000      52
250       42
10000     42
200       35
12500     25
16000     13
20000      6
Name: frequency, dtype: int64

In [8]:
# Number of rows for each distinct angle-of-attack value, ordered by count.
# NOTE(review): value_counts() orders by count, so the sort_values() before it
# looks redundant - confirm and drop.
df['aoa'].sort_values().value_counts()

0.0     329
4.0      93
15.4     65
7.3      64
12.3     64
9.9      64
17.4     63
3.0      59
9.5      56
2.0      56
5.4      55
4.8      50
3.3      50
8.4      43
12.7     34
7.2      33
6.7      33
12.6     32
8.9      32
1.5      31
2.7      30
5.3      30
22.2     30
15.6     30
11.2     28
19.7     28
4.2      21
Name: aoa, dtype: int64

In [18]:
# Number of rows for each distinct chord length, ordered by count.
# NOTE(review): value_counts() orders by count, so the sort_values() before it
# looks redundant - confirm and drop.
df['chord_length'].sort_values().value_counts()

0.0254    278
0.1524    271
0.2286    266
0.1016    263
0.0508    237
0.3048    188
Name: chord_length, dtype: int64

In [19]:
# Number of rows for each distinct free-stream velocity, ordered by count.
# NOTE(review): value_counts() orders by count, so the sort_values() before it
# looks redundant - confirm and drop.
df['velocity'].sort_values().value_counts()

39.6    480
71.3    465
31.7    281
55.5    277
Name: velocity, dtype: int64

In [20]:
# Number of rows for each distinct displacement thickness, ordered by count.
# NOTE(review): value_counts() orders by count, so the sort_values() before it
# looks redundant - confirm and drop.
df['displacement'].sort_values().value_counts()

0.005295    23
0.003101    19
0.004978    18
0.003313    18
0.013025    17
            ..
0.001428    10
0.005929     9
0.005781     8
0.005214     8
0.004783     8
Name: displacement, Length: 105, dtype: int64

In [21]:
# Number of rows for each distinct sound pressure level, ordered by count.
# NOTE(review): value_counts() orders by count, so the sort_values() before it
# looks redundant - confirm and drop.
df['SPL'].sort_values().value_counts()

126.540    3
129.395    3
127.315    3
125.586    2
130.777    2
          ..
129.516    1
116.560    1
122.435    1
125.194    1
122.539    1
Name: SPL, Length: 1456, dtype: int64

# Statistics on numerical attributes

In [22]:
# Columns for which the dispersion statistics are written to the report.
num_attribute = [
    'frequency',
    'aoa',
    'chord_length',
    'velocity',
    'displacement',
    'SPL',
]

In [23]:
#Using function to find mean,median and standard deviation
def statistics(x):
    """Write the mean, median and standard deviation of ``x`` to the report.

    ``x`` must provide ``mean()``, ``median()`` and ``std()`` (e.g. a pandas
    Series). Nothing is returned; the three formatted lines are written to
    the module-level file handle ``f``, which must be open for writing when
    this function is called.
    """
    # Compute all three values first, then emit them in a fixed order.
    report_lines = (
        ("\n\tMean: %f ", x.mean()),
        ("\n\tMedian: %f", x.median()),
        ("\n\tStandard Deviation: %f\n", x.std()),
    )
    for template, value in report_lines:
        f.write(template % value)

In [24]:
#Calculating the 1st quartile, 3rd quartile, range and variance
def stat(x):
    """Write the quartiles, range and variance of ``x`` to the report.

    ``x`` must provide ``quantile()``, ``max()``, ``min()`` and ``var()``
    (e.g. a pandas Series). Nothing is returned; the four formatted lines are
    written to the module-level file handle ``f``, which must be open for
    writing when this function is called.
    """
    lower_quartile = x.quantile(0.25)
    upper_quartile = x.quantile(0.75)
    value_range = x.max() - x.min()
    variance = x.var()

    report_lines = (
        ("\n\tFirst Quartile: %f ", lower_quartile),
        ("\n\tThird Quartile: %f ", upper_quartile),
        ("\n\tRange: %f ", value_range),
        ("\n\tVariance: %f\n", variance),
    )
    for template, value in report_lines:
        f.write(template % value)

In [25]:
#Calculating the AAD and MAD
def func(x, center="mean"):
    """Write the average and median absolute deviation of ``x`` to the report.

    AAD is the mean of ``|x_i - mean(x)|``. MAD is the median of the absolute
    deviations from ``center``:

    * ``center="mean"`` (default) keeps the original output, i.e.
      ``median(|x_i - mean(x)|)``;
    * ``center="median"`` gives the conventional robust statistic
      ``median(|x_i - median(x)|)``.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to summarise.
    center : {"mean", "median"}, optional
        Reference point used for the MAD deviations.

    Raises
    ------
    ValueError
        If ``center`` is neither ``"mean"`` nor ``"median"``.

    Notes
    -----
    Nothing is returned; output goes to the module-level file handle ``f``,
    which must be open for writing when this function is called.
    """
    if center not in ("mean", "median"):
        raise ValueError("center must be 'mean' or 'median', got %r" % (center,))

    # Plain 1-D array of |x_i - mean|. This replaces the former one-element
    # list that relied on NumPy coercing a list-wrapped Series.
    dev_from_mean = np.asarray(abs(x - x.mean()))
    if center == "mean":
        dev_for_mad = dev_from_mean
    else:
        dev_for_mad = np.asarray(abs(x - x.median()))

    f.write("\n\tAAD: %f "%(np.mean(dev_from_mean)))
    f.write("\n\tMAD: %f \n\n"%(np.median(dev_for_mad)))

In [26]:
# Write the dispersion report, one section per attribute, to outputfile.txt.
# `f` has to stay a module-level name because statistics(), stat() and func()
# write to it. The with-block closes the file even if one of them raises.
# Mode "w" is enough: the file is only written, never read back.
with open("outputfile.txt", "w") as f:
    for attribute in num_attribute:
        column = df[attribute]
        f.write("\nAttribute: "+attribute+"\n")
        statistics(column)
        stat(column)
        func(column)