In [13]:
import pandas as pd 
import numpy as np 
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

Combine original and generated data

In [2]:
imp_df = pd.read_csv('cues-hackathon-2021/data_imp.csv', usecols=[1,2,3,4,5,6,7,17,18])
gen_df = pd.read_csv('cues-hackathon-2021/generated_data.csv')
comb_df = pd.concat([imp_df, gen_df])
comb_df.columns = ['a1', 'turbine_rating', 'blade_length', 'tower_height', 
                'built_duration', 'metocean', 'water_depth', 'nac_weight', 'blade_weight']
comb_df.head()

Unnamed: 0,a1,turbine_rating,blade_length,tower_height,built_duration,metocean,water_depth,nac_weight,blade_weight
0,250.0,4000.0,69.0,207.0,36.0,1.0,2.0,90.0,22.0
1,250.0,4000.0,69.0,207.0,31.0,1.0,2.0,90.0,22.0
2,250.0,450.0,61.2,173.1,44.0,1.0,2.0,68.0,21.5
3,240.0,2300.0,70.5,201.0,40.333333,1.0,2.0,66.0,20.0
4,240.0,2450.0,100.5,270.0,42.0,1.0,2.0,105.0,17.7


In [3]:
comb_df['water_depth'] = np.round(comb_df['water_depth'])
comb_df['metocean'] = np.round(comb_df['metocean'])
comb_df.tail()

Unnamed: 0,a1,turbine_rating,blade_length,tower_height,built_duration,metocean,water_depth,nac_weight,blade_weight
95,111.123683,563.630343,21.863708,55.458928,33.525362,0.0,0.0,88.61536,10.073064
96,136.474203,2601.437891,50.914263,144.238504,37.043075,1.0,2.0,35.029473,22.126964
97,100.71041,2172.636622,60.157801,172.990414,30.298792,1.0,1.0,19.449733,17.421944
98,54.239177,1283.943303,20.566974,68.939863,42.588421,0.0,0.0,92.303003,8.497544
99,91.506799,245.705793,25.732752,73.164315,31.967963,1.0,0.0,23.543906,10.214273


In [4]:
nacelle_test = comb_df[comb_df['nac_weight'].isna()]
nacelle_train = comb_df[comb_df['nac_weight'].notnull()]

blade_test = comb_df[comb_df['blade_weight'].isna()]
blade_train = comb_df[comb_df['blade_weight'].notnull()]

In [5]:
# nacelle_test.to_csv('gen_data/nacelle_test.csv', index=False)
# nacelle_train.to_csv('gen_data/nacelle_train.csv', index=False)

# blade_test.to_csv('gen_data/blade_test.csv', index=False)
# blade_train.to_csv('gen_data/blade_train.csv', index=False)

In [6]:
blade_test

Unnamed: 0,a1,turbine_rating,blade_length,tower_height,built_duration,metocean,water_depth,nac_weight,blade_weight
5,232.0,1400.0,42.0,126.0,36.0,1.0,1.0,120.0,
6,225.0,1250.0,38.5,115.5,36.0,0.0,0.0,30.0,
21,150.0,1700.0,27.6,82.8,31.333333,1.0,2.0,,
29,140.0,1140.0,30.0,90.0,37.0,0.0,0.0,51.0,


In [27]:
# get confidence interval (should technically use the t-dist one)

def get_prediction_interval(pred_val, y, predictions, pi=.95):
    """
    Get a prediction interval for a linear regression.
    Args: 
        - Single prediction, 
        - y_test
        - All predictions,
        - Prediction interval threshold (default = .95) 
    OUTPUT: 
        - Prediction interval for single prediction
    """
    
    #get standard deviation of y
    sum_errs = np.sum((y - predictions)**2)
    stdev = np.sqrt(1 / (len(y) - 2) * sum_errs)
    #get interval from standard deviation
    one_minus_pi = 1 - pi
    ppf_lookup = 1 - (one_minus_pi / 2)
    z_score = stats.norm.ppf(ppf_lookup)
    interval = z_score * stdev
    #generate prediction interval lower and upper bound
    lower, upper = pred_val - interval, pred_val + interval
    return lower, pred_val, upper

In [32]:
nac_y_train = nacelle_train['nac_weight']
nac_X_train = nacelle_train.drop(columns=['nac_weight', 'blade_weight', 'tower_height'])

nac_y_test = nacelle_test['nac_weight']
nac_X_test = nacelle_test.drop(columns=['nac_weight', 'blade_weight', 'tower_height'])

Linear Regression model to predict nacelle weight

In [33]:
linreg = LinearRegression()
linreg.fit(nac_X_train, nac_y_train)

print("R2 score")
print(linreg.score(nac_X_train, nac_y_train))

nac_x_train_pred = linreg.predict(nac_X_train)

print("train MAE")
print(mean_absolute_error(nac_x_train_pred, nac_y_train))

print("train RMSE")
print(np.sqrt(mean_squared_error(nac_x_train_pred, nac_y_train)))

nac_x_test_pred = linreg.predict(nac_X_test)
results = []
for nac_pred in nac_x_test_pred:
    results.append(get_prediction_interval(nac_pred, nac_y_train, nac_x_train_pred))
results

R2 score
0.6159815732067264
train MAE
16.868740656981874
train RMSE
21.4769463946805


[(37.3101219441635, 79.68762813690631, 122.06513432964913),
 (21.327409791076825, 63.70491598381964, 106.08242217656246),
 (17.22978866473121, 59.607294857474024, 101.98480105021684),
 (51.23688363936584, 93.61438983210866, 135.99189602485148),
 (21.659072037953464, 64.03657823069628, 106.41408442343909),
 (42.4599822346443, 84.83748842738711, 127.21499462012993),
 (17.947429812585575, 60.32493600532839, 102.7024421980712),
 (-17.615361903990486, 24.762144288752328, 67.13965048149514),
 (-18.380775931733673, 23.99673026100914, 66.37423645375196),
 (8.258994827028658, 50.63650101977147, 93.01400721251429),
 (-15.981388183433257, 26.396118009309557, 68.77362420205237),
 (-27.698446583999463, 14.679059608743351, 57.056565801486165),
 (-18.934997765594044, 23.44250842714877, 65.82001461989158),
 (-30.82991627959611, 11.547589913146705, 53.92509610588952),
 (-36.10873336792231, 6.2687728248205055, 48.646279017563316),
 (-4.596850756226502, 37.78065543651631, 80.15816162925913)]

In [34]:
blade_y_train = blade_train['blade_weight']
blade_X_train = blade_train.drop(columns=['nac_weight', 'blade_weight', 'tower_height'])

blade_y_test = blade_test['blade_weight']
blade_X_test = blade_test.drop(columns=['nac_weight', 'blade_weight', 'tower_height'])


Linear Regression model to predict blade weight

In [35]:
linreg = LinearRegression()
linreg.fit(blade_X_train, blade_y_train)

print("R2 score")
print(linreg.score(blade_X_train, blade_y_train))

blade_x_train_pred = linreg.predict(blade_X_train)

print("train MAE")
print(mean_absolute_error(blade_x_train_pred, blade_y_train))

print("train RMSE")
print(np.sqrt(mean_squared_error(blade_x_train_pred, blade_y_train)))

blade_x_test_pred = linreg.predict(blade_X_test)
blade_x_test_pred

results = []
for blade_pred in blade_x_test_pred:
    results.append(get_prediction_interval(blade_pred, blade_y_train, blade_x_train_pred))
results

R2 score
0.6003485688857237
train MAE
3.101988394504575
train RMSE
4.358909811200159


[(10.890190916820513, 19.486726995999554, 28.083263075178593),
 (5.164531654924977, 13.761067734104017, 22.357603813283056),
 (11.262896653118359, 19.8594327322974, 28.45596881147644),
 (2.264041432417649, 10.86057751159669, 19.45711359077573)]

In [11]:
mean_blade = blade_y_train.mean()
mean_blade = [mean_blade]*len(blade_y_train)
mean_absolute_error(mean_blade, blade_x_train_pred)

4.8936937183978495

In [12]:
mean_nac = nac_y_train.mean()
mean_nac = [mean_nac]*len(nac_y_train)
mean_absolute_error(mean_nac, nac_x_train_pred)

22.3086233195985