<a href="https://colab.research.google.com/github/dhairyaostwal/NLP-Tutorial/blob/main/RegressionBasics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Basics

## Regression

In [1]:
import numpy as np
import pandas as pd

Dataset Source: https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise

In [2]:
df = pd.read_csv('../airfoil_self_noise.dat', sep='\t', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1503 non-null   int64  
 1   1       1503 non-null   float64
 2   2       1503 non-null   float64
 3   3       1503 non-null   float64
 4   4       1503 non-null   float64
 5   5       1503 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 70.6 KB


In [5]:
df.keys()

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [7]:
data = df[[0,1,2,3,4]].values
target = df[5].values

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33)

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.525752750841504
0.4921114122151647


In [12]:
predictions = model.predict(X_test)

In [13]:
predictions

array([118.55862517, 131.55764185, 132.23386131, 129.13281431,
       119.04803422, 120.27369873, 129.43601527, 120.90179408,
       123.66120083, 130.57158645, 131.95789351, 129.59906203,
       129.76767207, 126.47069859, 123.09009439, 124.63372525,
       125.51492193, 126.49999004, 125.89300234, 131.4466095 ,
       124.7290947 , 120.1701734 , 125.93852705, 121.1520236 ,
       136.93756423, 133.5384559 , 118.1483743 , 120.20303958,
       125.71268207, 120.53657134, 127.89603994, 132.01502641,
       130.54376715, 121.91717379, 126.34966253, 121.13852034,
       130.96848721, 120.12877517, 124.43423667, 129.22011271,
       129.60258368, 116.38360107, 118.44432176, 128.7356691 ,
       113.72109727, 126.22122339, 133.4309254 , 132.30657754,
       129.88175   , 123.60951923, 118.04164532, 131.75076735,
       121.93122353, 125.97200286, 131.26133689, 126.27194058,
       121.08888848, 124.83524736, 124.46154291, 126.74919073,
       132.43626157, 127.14723889, 118.31465184, 126.43

In [14]:
from sklearn.ensemble import RandomForestRegressor

model2 = RandomForestRegressor()
model2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [15]:
print(model2.score(X_train, y_train))
print(model2.score(X_test, y_test))

0.9906934037066669
0.910246606627333


In [16]:
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)
scaler2 = StandardScaler()
y_train2 = scaler2.fit_transform(np.expand_dims(y_train, -1)).ravel()
y_test2 = scaler2.fit_transform(np.expand_dims(y_test, -1)).ravel()


In [17]:
model = MLPRegressor(max_iter=500)
model.fit(X_train2, y_train2)

print(model.score(X_train2, y_train2))
print(model.score(X_test2, y_test2))

0.8515931326339969
0.7836947440888313
