In [1]:
import os

import mysql.connector
import numpy as np
import pandas as pd

In [2]:
def _fetch_rows_and_columns(cursor, table_name):
    """Run ``SELECT *`` on ``table_name`` and return ``(rows, column_names)``.

    Column names are read from ``cursor.description`` of the same query, so no
    separate ``DESCRIBE`` round trip is needed and names always match the rows.
    ``table_name`` is interpolated into the SQL text, so pass only trusted,
    hard-coded table names.
    """
    cursor.execute(f"SELECT * FROM {table_name}")
    column_names = [field[0] for field in cursor.description]
    return cursor.fetchall(), column_names


# Connection settings come from the environment so that no password is stored
# in the notebook. Host, user and database fall back to the previous values.
db_password = os.environ.get('MYSQL_PASSWORD')
if db_password is None:
    raise RuntimeError("Set the MYSQL_PASSWORD environment variable before running this cell.")

conn = mysql.connector.connect(
    host=os.environ.get('MYSQL_HOST', 'localhost'),
    user=os.environ.get('MYSQL_USER', 'root'),
    password=db_password,
    database=os.environ.get('MYSQL_DATABASE', 'cts')
)
# try/finally guarantees the cursor and connection are released even when a
# query fails part-way through.
try:
    cursor = conn.cursor()
    try:
        train_results, train_columns = _fetch_rows_and_columns(cursor, 'train_data')
        df_train = pd.DataFrame(train_results, columns=train_columns)

        test_results, test_columns = _fetch_rows_and_columns(cursor, 'test_data')
        df_test = pd.DataFrame(test_results, columns=test_columns)
    finally:
        cursor.close()
finally:
    conn.close()

In [3]:
# Quick sanity check after loading: (rows, columns) of each frame.
train_shape = df_train.shape
print(f"Training data shape: {train_shape}")
test_shape = df_test.shape
print(f"Test data shape: {test_shape}")

Training data shape: (8000, 65)
Test data shape: (500, 56)


In [4]:
# Total number of null cells across all columns of each frame.
train_missing = df_train.isna().sum().sum()
print("Missing values in training data:", train_missing)
test_missing = df_test.isna().sum().sum()
print("Missing values in test data:", test_missing)

Missing values in training data: 0
Missing values in test data: 0


In [7]:
def cap_outliers_iqr(df, columns=None, factor=1.5):
    """Return a copy of ``df`` with outliers capped at the IQR fences.

    For each selected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (Q1/Q3 = 25th/75th percentile, IQR = Q3 - Q1).
    Values outside the fences are replaced by the nearest fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of column labels, optional
        Columns to cap. Defaults to every numeric column of ``df``; all other
        columns are copied through unchanged.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        Capped copy of ``df`` with the same columns and index.
    """
    df_capped = df.copy()

    if columns is None:
        # Quantiles are undefined for string/object data, so by default only
        # numeric columns are processed instead of failing on the first
        # non-numeric one.
        columns = df.select_dtypes(include='number').columns

    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        spread = factor * (q3 - q1)

        # clip() applies both fences in one step and leaves NaN untouched.
        df_capped[column] = df[column].clip(lower=q1 - spread, upper=q3 + spread)

    return df_capped

# Displays the capped copy only: the return value is not assigned, and the
# function works on a copy, so df_test itself is left unchanged by this call.
cap_outliers_iqr(df_test)

Unnamed: 0,ID,Component1_fraction,Component2_fraction,Component3_fraction,Component4_fraction,Component5_fraction,Component1_Property1,Component2_Property1,Component3_Property1,Component4_Property1,...,Component1_Property9,Component2_Property9,Component3_Property9,Component4_Property9,Component5_Property9,Component1_Property10,Component2_Property10,Component3_Property10,Component4_Property10,Component5_Property10
0,1.0,0.18,0.05,0.32,0.37,0.08,-0.177804,-0.741219,0.769821,-0.877069,...,-0.265376,0.123432,0.028533,-0.173365,1.297920,0.323299,-0.315146,0.625518,-0.514342,-0.777057
1,2.0,0.00,0.50,0.00,0.37,0.13,2.501350,0.177344,-0.498739,-0.196742,...,-0.787677,-0.757905,-0.280561,-1.965970,0.543475,-0.906851,0.962341,-0.183757,0.310871,-1.329040
2,3.0,0.16,0.00,0.17,0.50,0.17,1.547320,0.891479,0.030627,-0.368678,...,-0.710026,-1.422690,0.874071,-1.016140,0.093525,1.048530,-1.321850,0.356640,-0.869543,-0.177255
3,4.0,0.50,0.00,0.17,0.16,0.17,-0.424427,1.016860,-1.182980,-0.854225,...,-0.551366,0.257105,-0.077337,-0.721031,-0.760365,-0.507690,1.346560,-0.001529,-1.008450,1.726110
4,5.0,0.00,0.00,0.50,0.50,0.00,-0.187062,-0.762173,-0.473660,2.074090,...,-1.811470,-0.181223,-0.475933,0.234775,-0.909020,1.238200,-1.805660,0.980417,-1.354930,-0.657513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496.0,0.44,0.01,0.08,0.41,0.06,1.036800,1.415670,0.793302,-0.446630,...,-2.653870,-0.231333,-1.365940,-0.853755,-1.344680,-1.947530,-1.074580,-0.421069,-0.603527,0.838356
496,497.0,0.19,0.47,0.03,0.23,0.08,-1.305140,-1.520940,-0.989537,0.903203,...,-0.458610,-1.422690,1.952540,-0.283461,0.367323,2.689270,1.698610,-0.328886,-0.879281,-1.658220
497,498.0,0.43,0.01,0.12,0.21,0.23,0.806590,0.607324,0.359058,0.283394,...,-0.437798,-0.810355,-0.926471,-1.480680,-1.613630,1.016880,0.989316,0.408454,-0.925924,-0.022020
498,499.0,0.03,0.04,0.42,0.42,0.09,-0.792140,0.674275,-1.783490,0.848296,...,-2.507530,-0.080589,0.295430,-0.278889,-1.912660,0.090336,2.285820,0.793409,0.753718,-0.775325


In [10]:
# Group the training columns by role using their naming pattern.
fraction_cols = [col for col in df_train.columns if 'fraction' in col]
# Match on '_Property' (as in 'Component1_Property1'): a bare 'Property' test
# would also match the 'BlendPropertyN' target columns and mix targets into
# the input-property list.
property_cols = [col for col in df_train.columns if '_Property' in col]
target_cols = [col for col in df_train.columns if 'BlendProperty' in col]

# Filled column by column by the feature-engineering loop that follows.
engineered_features_df = pd.DataFrame()

In [None]:

# One engineered feature per property (1..10): the sum over the five components
# of component fraction * component property value.
n_train_rows = len(df_train)
for prop_idx in range(1, 11):
    blended = pd.Series([0.0] * n_train_rows, index=df_train.index)

    for comp_idx in range(1, 6):
        frac_name = f'Component{comp_idx}_fraction'
        prop_name = f'Component{comp_idx}_Property{prop_idx}'

        # A component/property pair absent from the frame contributes nothing.
        if frac_name not in df_train.columns or prop_name not in df_train.columns:
            continue
        blended = blended + df_train[frac_name] * df_train[prop_name]

    engineered_features_df[f'WeightedAvg_Property{prop_idx}'] = blended

['BlendProperty1',
 'BlendProperty2',
 'BlendProperty3',
 'BlendProperty4',
 'BlendProperty5',
 'BlendProperty6',
 'BlendProperty7',
 'BlendProperty8',
 'BlendProperty9',
 'BlendProperty10']

In [18]:
# Inputs: the component fractions side by side with the engineered
# weighted-average features. Outputs: the blend-property target columns.
feature_parts = [df_train[fraction_cols], engineered_features_df]
X_train = pd.concat(feature_parts, axis='columns')
y_train = df_train[target_cols]

In [19]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split_frames = train_test_split(X_train, y_train, random_state=42, test_size=0.3)
X_train_split, X_val, y_train_split, y_val = split_frames

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Wrap a 50-tree random forest in the multi-output regressor and fit it on the
# training split (fit() returns the fitted estimator itself).
base_model = RandomForestRegressor(n_estimators=50, random_state=42)
model = MultiOutputRegressor(base_model).fit(X_train_split, y_train_split)

# Score the held-out validation rows.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)
rmse_val = np.sqrt(mse)

print(f"Validation Root Mean Squared Error: {rmse_val:.4f}")
print(f"Validation R² Score: {r2:.4f}")

Validation Root Mean Squared Error: 0.0982
Validation R² Score: 0.9905


In [21]:
import pandas as pd
import numpy as np

# Requires `model` (the fitted regressor) and `df_train` from earlier cells.

# One hand-entered sample of 55 feature values: 5 component fractions followed
# by 50 component property values. The values are labelled further down with
# df_train.columns[:55], so they must follow that column order.
# NOTE(review): said to be the first row of the dataset — confirm against the data.
manual_input_values = [
    0.21, 0.0, 0.42, 0.25, 0.12,
    -0.0217822396792892, 1.981250612007177, 0.0200356225325539, 0.1403151240546649, 1.0320288631056806,
    -1.2297994570374926, -0.5802742850352095, 0.1339980409881218, 0.8178351791469262, 0.2161163880813474,
    -0.3933220542636423, 0.2211892949766066, 0.6561035770666948, 0.0744610778539929, -3.082937535673653,
    -1.763084912416028, 0.9845148240213166, -1.548114528493192, -1.654289586844628, -1.4100492612241635,
    0.0517794851298262, 1.0058244730685235, -0.4308684687946812, 1.7436077217701325, 0.2967128051436503,
    -1.4833331178815432, -1.7483585370025725, 1.503443262771482, 0.0230426400472773, 1.741302611356342,
    -0.5093797968821361, -0.4687331134214367, -1.31767543026822, 0.1321549212547532, 0.2212368959470617,
    0.2938149021199543, -0.1157528896007363, -0.9179915882721276, -0.274703522878247, 0.5133255022814384,
    0.4803682412455972, 1.04496703449017, -0.4509558120709413, 0.6745717934448435, -0.636394315804448,
    -1.244962923296729, -1.355050374548414, -0.3144230915121407, 0.9935934282505182, -2.7289283361006493
]

# Wrap the single sample in a one-row frame labelled with the first 55
# training columns.
manual_input_df = pd.DataFrame([manual_input_values], columns=df_train.columns[:55])

# Rebuild the engineered features the model was trained on.
input_columns = manual_input_df.columns
fraction_cols = [name for name in input_columns if 'fraction' in name]
property_cols = [name for name in input_columns if 'Property' in name]
engineered_features_input = pd.DataFrame()

n_samples = len(manual_input_df)
for prop_idx in range(1, 11):
    blended = pd.Series([0.0] * n_samples, index=manual_input_df.index)

    for comp_idx in range(1, 6):
        frac_name = f'Component{comp_idx}_fraction'
        prop_name = f'Component{comp_idx}_Property{prop_idx}'

        # A component/property pair absent from the frame contributes nothing.
        if frac_name not in input_columns or prop_name not in input_columns:
            continue
        blended = blended + manual_input_df[frac_name] * manual_input_df[prop_name]

    engineered_features_input[f'WeightedAvg_Property{prop_idx}'] = blended

# Same layout as the training matrix: fractions first, then engineered features.
manual_parts = [manual_input_df[fraction_cols], engineered_features_input]
X_manual = pd.concat(manual_parts, axis='columns')

# Predict the blend properties for the sample and show them.
prediction = model.predict(X_manual)

print("\nPredicted Blend Properties:")
print(prediction)


Predicted Blend Properties:
[[ 0.489143  0.607589  0.32167  -1.23605   1.60113   1.38466   0.30585
   0.19346   0.580374 -0.762738]]


In [None]:
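# Reference values kept from a scratch cell; they match the prediction printed
# above to the displayed precision (commented out so the cell is valid Python):
# 0.489143253	0.607588533	 0.321670368	 -1.236054697	1.601132097	1.384662362	0.305849575	0.19345994	0.580374249	-0.762737568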
  

In [22]:
import joblib
# Persist the fitted multi-output model next to the notebook.
joblib.dump(value=model, filename="random_forest_multioutput.pkl")

['random_forest_multioutput.pkl']

In [None]:
import joblib
import pandas as pd
import numpy as np


# Load the persisted model. joblib.load unpickles the file, which can execute
# arbitrary code, so only load artefacts written by a trusted source.
try:
    model = joblib.load('random_forest_multioutput.pkl')
    print("Model loaded successfully.")
except FileNotFoundError:
    print("Error: The file 'random_forest_multioutput.pkl' was not found.")
    # Re-raise instead of calling exit(): this stops the cell with a traceback
    # rather than terminating the interpreter session.
    raise

try:
    df_test = pd.read_csv('test.csv')
    print("Test data loaded successfully.")
except FileNotFoundError:
    print("Error: The file 'test.csv' was not found.")
    raise


# Keep the IDs aside for the output file; they are not model inputs.
if 'ID' in df_test.columns:
    test_ids = df_test['ID']
    X_test = df_test.drop('ID', axis=1)
else:
    X_test = df_test
    test_ids = None

fraction_cols = [col for col in X_test.columns if 'fraction' in col]
# '_Property' selects only ComponentN_PropertyM inputs; a bare 'Property'
# test would also match any BlendPropertyN column present in the file.
property_cols = [col for col in X_test.columns if '_Property' in col]

engineered_features_test_df = pd.DataFrame()

# One engineered feature per property (1..10): the sum over the five
# components of component fraction * component property value.
for i in range(1, 11):
    prop_number = str(i)
    weighted_avg = pd.Series([0.0] * len(X_test), index=X_test.index)

    for j in range(1, 6):
        comp_number = str(j)
        fraction_col = f'Component{comp_number}_fraction'
        property_col = f'Component{comp_number}_Property{prop_number}'

        # A component/property pair absent from the file contributes nothing.
        if fraction_col in X_test.columns and property_col in X_test.columns:
            weighted_avg += X_test[fraction_col] * X_test[property_col]

    new_feature_name = f'WeightedAvg_Property{prop_number}'
    engineered_features_test_df[new_feature_name] = weighted_avg

# Same layout as the training matrix: fractions first, then engineered features.
X_test_engineered = pd.concat([X_test[fraction_cols], engineered_features_test_df], axis=1)

print("\nFeature engineering on test data complete.")
print("The new features are:", list(engineered_features_test_df.columns))


# Predict with the loaded model.
y_test_pred = model.predict(X_test_engineered)
print("\nPredictions generated successfully.")

# Save the predictions to CSV. Target names are rebuilt here rather than read
# from the training frame, so this cell does not depend on y_train.
target_columns = [f'BlendProperty{i}' for i in range(1, 11)]
predictions_df = pd.DataFrame(y_test_pred, columns=target_columns)

if test_ids is not None:
    predictions_df.insert(0, 'ID', test_ids)

predictions_df.to_csv('fuel_blend_predictions.csv', index=False)

print("\nPredictions saved to 'fuel_blend_predictions.csv'.")
print("First 5 predictions:")
print(predictions_df.head())

Model loaded successfully.
Test data loaded successfully.

Feature engineering on test data complete.
The new features are: ['WeightedAvg_Property1', 'WeightedAvg_Property2', 'WeightedAvg_Property3', 'WeightedAvg_Property4', 'WeightedAvg_Property5', 'WeightedAvg_Property6', 'WeightedAvg_Property7', 'WeightedAvg_Property8', 'WeightedAvg_Property9', 'WeightedAvg_Property10']

Predictions generated successfully.

Predictions saved to 'fuel_blend_predictions.csv'.
First 5 predictions:
   ID  BlendProperty1  BlendProperty2  BlendProperty3  BlendProperty4  \
0   1       -0.302458        0.259428        0.504410        0.708767   
1   2       -0.617372       -0.328065       -1.058266       -0.059498   
2   3        1.507346        0.906459        0.843167        0.737496   
3   4       -0.501316        0.456163        0.384292       -0.473012   
4   5        0.505661       -1.088959        1.204349        0.586745   

   BlendProperty5  BlendProperty6  BlendProperty7  BlendProperty8  \
0     

In [24]:
import joblib
import pandas as pd

# Reload the training set from disk.
df_train = pd.read_csv('train.csv')

# --- Per-feature means ---
# The first 55 columns are treated as the features.
feature_cols = df_train.columns[:55]
feature_frame = df_train[feature_cols]
imputation_values = feature_frame.mean().to_dict()
joblib.dump(imputation_values, 'imputation_means.pkl')
print("Imputation means saved to 'imputation_means.pkl'")

# --- Per-feature IQR fences ---
# For each feature: lower = Q1 - 1.5 * IQR, upper = Q3 + 1.5 * IQR.
outlier_bounds = {}
for col in feature_cols:
    q25, q75 = df_train[col].quantile([0.25, 0.75])
    whisker = 1.5 * (q75 - q25)
    outlier_bounds[col] = {'lower': q25 - whisker, 'upper': q75 + whisker}
joblib.dump(outlier_bounds, 'outlier_bounds.pkl')
print("Outlier bounds saved to 'outlier_bounds.pkl'")

Imputation means saved to 'imputation_means.pkl'
Outlier bounds saved to 'outlier_bounds.pkl'
