In [None]:
# !pip install pandas
# !pip install scikit-learn
# !pip install numpy
# !pip install tensorflow

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [2]:
boxmod_data = pd.read_csv('./data/BoxMod.csv', header=None)
juul_data = pd.read_csv('./data/JUUL.csv', header=None)

In [3]:
boxmod_data.head()
# boxmod_data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,3510.0,1400.0,1090.0,2330.0,1920.0,1180.0,1100.0,1860.0,2170.0,1900.0,...,915.0,2050.0,2420.0,2080.0,1930.0,2310.0,1700.0,1710.0,2150.0,1990.0
1,3330.0,1380.0,1000.0,2060.0,1980.0,1090.0,1050.0,1760.0,2240.0,1830.0,...,845.0,1860.0,2280.0,2020.0,1820.0,2130.0,1610.0,1470.0,2020.0,1800.0
2,3140.0,1300.0,1130.0,1910.0,1850.0,988.0,1000.0,1540.0,2030.0,1610.0,...,801.0,1910.0,1940.0,1920.0,1650.0,1890.0,1430.0,1370.0,1890.0,1710.0
3,3030.0,1140.0,757.0,1790.0,1760.0,935.0,949.0,1660.0,1810.0,1750.0,...,771.0,1710.0,1800.0,1840.0,1570.0,1780.0,1390.0,1250.0,1650.0,1620.0
4,2790.0,1240.0,673.0,1830.0,1530.0,980.0,922.0,1450.0,1710.0,1570.0,...,655.0,1640.0,1820.0,1620.0,1600.0,1780.0,1380.0,1350.0,1640.0,1630.0


In [4]:
juul_data
# juul_data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,102.0,37.0,4.12,27.5,30.5,38.0,29.3,29.80,35.3,40.9,...,53.8,46.8,37.8,31.6,24.1,31.4,46.2,35.3,33.1,33.4
1,102.0,21.1,23.10,26.4,28.9,37.0,28.0,6.47,32.1,40.9,...,37.2,41.2,33.6,29.5,21.2,24.4,29.4,43.1,31.7,25.3
2,78.1,36.1,21.10,24.5,26.2,33.6,27.0,27.60,29.7,39.6,...,35.4,35.6,30.2,27.4,30.6,23.8,43.6,41.5,30.6,24.3
3,62.4,35.5,20.30,25.6,24.8,31.2,23.8,24.40,31.1,38.1,...,33.4,34.9,31.1,24.6,20.3,23.2,41.5,40.6,21.6,23.5
4,55.9,23.7,-2.50,22.9,23.2,32.1,24.6,23.30,30.3,37.0,...,47.6,27.6,29.1,23.5,19.6,25.7,39.1,29.1,28.4,22.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,447.0,78.4,159.00,135.0,272.0,199.0,157.0,158.00,160.0,189.0,...,174.0,234.0,270.0,263.0,160.0,201.0,278.0,311.0,307.0,259.0
1313,332.0,70.8,196.00,102.0,164.0,111.0,100.0,109.00,95.8,116.0,...,109.0,129.0,207.0,195.0,91.8,219.0,209.0,280.0,179.0,193.0
1314,239.0,54.0,125.00,93.4,157.0,96.1,102.0,120.00,76.1,93.4,...,84.1,121.0,167.0,146.0,102.0,116.0,157.0,155.0,164.0,147.0
1315,184.0,38.8,74.30,57.1,126.0,81.4,78.1,69.70,72.7,91.1,...,96.1,94.9,118.0,109.0,65.7,85.4,130.0,118.0,133.0,112.0


In [5]:
X_boxmod = boxmod_data.iloc[:, 1:]
y_boxmod = boxmod_data.iloc[:, 0] 

In [6]:
# X_boxmod.head()

In [7]:
X_juul = juul_data.iloc[:, 1:]
y_juul = juul_data.iloc[:, 0]

In [8]:
X_boxmod_train, X_boxmod_test, y_boxmod_train, y_boxmod_test = train_test_split(X_boxmod, y_boxmod, test_size=0.3, random_state=42)

In [9]:
X_juul_train, X_juul_test, y_juul_train, y_juul_test = train_test_split(X_juul, y_juul, test_size=0.3, random_state=42)

In [10]:
scaler = StandardScaler()
X_boxmod_train_scaled = scaler.fit_transform(X_boxmod_train)
X_boxmod_test_scaled = scaler.transform(X_boxmod_test)
X_juul_train_scaled = scaler.transform(X_juul_train)
X_juul_test_scaled = scaler.transform(X_juul_test)

In [11]:
def evaluation(model, X_test_scaled, y_test, dataset_name):
    predictions = model.predict(X_test_scaled)
    
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    
    print(f"{dataset_name} - R2 Score:", r2)
    print(f"{dataset_name} - RMSE:", rmse)

def evaluation_cross(model, X_test_scaled, y_test, dataset_name):
    cv_scores = cross_val_score(model, X_test_scaled, y_test, cv=5, scoring='r2')

    # Calculate RMSE for each fold
    rmse_scores = np.sqrt(-cross_val_score(model, X_test_scaled, y_test, cv=5, scoring='neg_mean_squared_error'))
    
    print(f"Average R2 Score for {dataset_name}:", np.mean(cv_scores))
    print(f"Average RMSE for {dataset_name} data:", np.mean(rmse_scores))

In [12]:
model = RandomForestRegressor()

In [13]:
# Train the model on BoxMod training data
model.fit(X_boxmod_train_scaled, y_boxmod_train)

In [14]:
evaluation(model, X_boxmod_test_scaled, y_boxmod_test, "BoxMod")

BoxMod - R2 Score: 0.9665683652583067
BoxMod - RMSE: 398.8169325379025


In [15]:
evaluation_cross(model, X_boxmod_test_scaled, y_boxmod_test, "BoxMod")

Average R2 Score for BoxMod: 0.94554365441811
Average RMSE for BoxMod data: 502.05634650414675


In [16]:
evaluation(model, X_juul_test_scaled, y_juul_test, "JUUL")

JUUL - R2 Score: 0.6956784770862996
JUUL - RMSE: 174.6010915924428


In [17]:
# evaluation_cross(model, X_juul_test_scaled, y_juul_test, "JUUL")

In [18]:
# Fine-tuning on JUUL training data
model.fit(X_juul_train_scaled, y_juul_train)

In [19]:
evaluation(model, X_juul_test_scaled, y_juul_test, "JUUL")

JUUL - R2 Score: 0.9796580912554063
JUUL - RMSE: 45.14155041384553


In [20]:
evaluation_cross(model, X_juul_test_scaled, y_juul_test, "JUUL")

Average R2 Score for JUUL: 0.9683487281995712
Average RMSE for JUUL data: 55.61974430680273


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.model_selection import KFold

In [22]:
scaler = StandardScaler()
X_boxmod_scaled = scaler.fit_transform(X_boxmod)
X_juul_scaled = scaler.fit_transform(X_juul)

In [23]:
def perform_kfold_cv(X, y):
    kfold = KFold(5, shuffle=True)
    mse_scores = []
    r2_scores = []
    
    for train, test in kfold.split(X, y):
        # Create the model
        model = Sequential()
        model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1, activation='linear'))  # Linear activation for regression
    
        # Compile the model
        model.compile(optimizer='adam', loss='mean_squared_error')
    
        # Fit the model
        model.fit(X[train], y[train], epochs=100, batch_size=32, verbose=0)
    
        # Evaluate the model
        y_pred = model.predict(X[test]).flatten()
        mse_scores.append(mean_squared_error(y[test], y_pred))
        r2_scores.append(r2_score(y[test], y_pred))

    
    # Calculate average and standard deviation of MSE
    average_mse = np.mean(mse_scores)
    std_dev_mse = np.std(mse_scores)
    average_r2 = np.mean(r2_scores)
    std_dev_r2 = np.std(r2_scores)

    
    print("Average MSE:", average_mse)
    print("Standard Deviation of MSE:", std_dev_mse)
    print("Average R2:", average_r2)
    print("Standard Deviation of R2:", std_dev_r2)


In [24]:
perform_kfold_cv(X_boxmod_scaled, y_boxmod)

Average MSE: 280730.4156375802
Standard Deviation of MSE: 59092.42862527706
Average R2: 0.9469836801846471
Standard Deviation of R2: 0.0074044386681163315


In [25]:
perform_kfold_cv(X_juul_scaled, y_juul)

Average MSE: 2507.426806499519
Standard Deviation of MSE: 508.4259370874592
Average R2: 0.975035603032584
Standard Deviation of R2: 0.004710411355313271


In [26]:
def train_base_model(X, y):
    model = Sequential()
    model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X, y, epochs=100, batch_size=32, verbose=0)
    return model

In [27]:
def perform_transfer_learning_kfold_cv(X_juul, y_juul, base_model):
    kfold = KFold(5, shuffle=True)
    mse_scores = []
    r2_scores = []

    for train, test in kfold.split(X_juul, y_juul):
        # Clone the base model structure and weights
        model = Sequential(base_model.layers[:-1])  # Exclude the last layer
        model.add(Dense(1, activation='linear'))  # Add new output layer
        model.compile(optimizer='adam', loss='mean_squared_error')

        # Fine-tune the model on JUUL data
        model.fit(X_juul[train], y_juul[train], epochs=50, batch_size=32, verbose=0)  # Fewer epochs for fine-tuning

        # Evaluate the model
        y_pred = model.predict(X_juul[test]).flatten()
        mse_scores.append(mean_squared_error(y_juul[test], y_pred))
        r2_scores.append(r2_score(y_juul[test], y_pred))

    average_mse = np.mean(mse_scores)
    std_dev_mse = np.std(mse_scores)
    average_r2 = np.mean(r2_scores)
    std_dev_r2 = np.std(r2_scores)

    print("Average MSE:", average_mse)
    print("Standard Deviation of MSE:", std_dev_mse)
    print("Average R2:", average_r2)
    print("Standard Deviation of R2:", std_dev_r2)

In [28]:
base_model = train_base_model(X_boxmod_scaled, y_boxmod)

In [29]:
perform_transfer_learning_kfold_cv(X_juul_scaled, y_juul, base_model)

Average MSE: 2615.7219698864737
Standard Deviation of MSE: 423.4040866428747
Average R2: 0.9736367332480758
Standard Deviation of R2: 0.004919168558411501
