<a href="https://colab.research.google.com/github/damolathegreat/DamolaAgboola_Day8/blob/master/Linear_Regression_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Create the dataset
data = {
    'Co_solvent': ['MTBE', 'DIIPE', 'ACETONE', 'HEXANE', 'THF', 'DEE', 'DME', 'DIBE'],
    'pi_star': [0.30, 0.27, 0.71, -0.08, 0.58, 0.27, 0.12, 0.24],
    'alpha': [0.00, 0.00, 0.08, 0.00, 0.00, 0.00, 0.18, 0.00],
    'beta': [0.41, 0.49, 0.55, 0.00, 0.55, 0.47, 0.20, 0.41],
    'epsilon_r': [4.60, 4.3, 7.58, 1.89, 7.58, 4.33, 6.3, 4.20],
    'n': [1.369, 1.404, 1.36, 1.37, 1.41, 1.36, 1.31, 1.411],
    'ET30': [42.30, 34.1, 42.20, 31.00, 37.40, 34.50, 32.4, 33.0],
    'ETN': [0.284, 0.105, 0.35, 0.009, 0.207, 0.1170, 0.221, 0.071],
    'Yield': [97.7, 87.3, 97.91, 94.14, 96, 98.35, 99, 74.3]
}

df = pd.DataFrame(data)

def perform_linear_regression(X, y, feature_name):
    model = LinearRegression()
    X = X.reshape(-1, 1)
    model.fit(X, y)
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    equation = f"Yield = {model.coef_[0]:.4f}*{feature_name} + {model.intercept_:.4f}"
    return equation, r2

# Analyze each independent variable
independent_vars = ['pi_star', 'alpha', 'beta', 'epsilon_r', 'n', 'ET30', 'ETN']
results = []

for var in independent_vars:
    equation, r2 = perform_linear_regression(
        df[var].values,
        df['Yield'].values,
        var
    )
    results.append({
        'Variable': var,
        'Equation': equation,
        'R²': r2
    })

# Create results dataframe
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R²', ascending=False)

print("Linear Regression Analysis Results:")
print("\nEquations sorted by R² value (best fit first):")
for _, row in results_df.iterrows():
    print(f"\n{row['Variable']}:")
    print(f"  {row['Equation']}")
    print(f"  R² = {row['R²']:.4f}")

Linear Regression Analysis Results:

Equations sorted by R² value (best fit first):

n:
  Yield = -161.9540*n + 315.6528
  R² = 0.4218

ETN:
  Yield = 38.9813*ETN + 86.4412
  R² = 0.2782

ET30:
  Yield = 0.7228*ET30 + 67.1674
  R² = 0.1378

alpha:
  Yield = 47.7776*alpha + 91.5347
  R² = 0.1377

epsilon_r:
  Yield = 1.4458*epsilon_r + 85.7174
  R² = 0.1094

pi_star:
  Yield = 5.7653*pi_star + 91.3507
  R² = 0.0283

beta:
  Yield = -2.8764*beta + 94.1949
  R² = 0.0042


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy import stats
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Create the dataset
data = {
    'pi_star': [0.30, 0.27, 0.71, -0.08, 0.58, 0.27, 0.12, 0.24],
    'alpha': [0.00, 0.00, 0.08, 0.00, 0.00, 0.00, 0.18, 0.00],
    'beta': [0.41, 0.49, 0.55, 0.00, 0.55, 0.47, 0.20, 0.41],
    'epsilon_r': [4.60, 4.3, 7.58, 1.89, 7.58, 4.33, 6.3, 4.20],
    'n': [1.369, 1.404, 1.36, 1.37, 1.41, 1.36, 1.31, 1.411],
    'ET30': [42.30, 34.1, 42.20, 31.00, 37.40, 34.50, 32.4, 33.0],
    'ETN': [0.284, 0.105, 0.35, 0.009, 0.207, 0.117, 0.221, 0.071],
    'Yield': [97.7, 87.3, 97.91, 94.14, 96, 98.35, 99, 74.3]
}

df = pd.DataFrame(data)

# 1. Correlation Analysis
correlation_matrix = df.corr()
print("1. CORRELATION ANALYSIS")
print("\nCorrelation with Yield:")
correlations = correlation_matrix['Yield'].sort_values(ascending=False)
for var, corr in correlations.items():
    if var != 'Yield':
        print(f"{var}: {corr:.4f}")

# 2. Multiple Linear Regression (All Variables)
X = df.drop('Yield', axis=1)
y = df['Yield']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Fit multiple linear regression
multi_model = LinearRegression()
multi_model.fit(X_scaled, y)

# Get predictions and R²
y_pred = multi_model.predict(X_scaled)
multi_r2 = r2_score(y, y_pred)
multi_rmse = np.sqrt(mean_squared_error(y, y_pred))

print("\n2. MULTIPLE LINEAR REGRESSION (ALL VARIABLES)")
print(f"\nOverall Model R²: {multi_r2:.4f}")
print(f"Root Mean Square Error: {multi_rmse:.4f}")
print("\nStandardized Coefficients:")
for var, coef in zip(X.columns, multi_model.coef_):
    print(f"{var}: {coef:.4f}")

# 3. Statistical Significance Tests
print("\n3. STATISTICAL SIGNIFICANCE TESTS")
print("\nT-tests for each variable vs Yield:")
for var in X.columns:
    t_stat, p_value = stats.pearsonr(df[var], df['Yield'])
    print(f"{var}:")
    print(f"  Correlation coefficient: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")

# 4. Variable Importance Analysis
print("\n4. VARIABLE IMPORTANCE ANALYSIS")
importance = pd.DataFrame({
    'Variable': X.columns,
    'Absolute_Coefficient': np.abs(multi_model.coef_),
    'Original_Coefficient': multi_model.coef_
})
importance = importance.sort_values('Absolute_Coefficient', ascending=False)
print("\nVariables ranked by importance (absolute standardized coefficients):")
for _, row in importance.iterrows():
    print(f"{row['Variable']}: {row['Absolute_Coefficient']:.4f} (coefficient: {row['Original_Coefficient']:.4f})")

# 5. Model Summary
explained_variance = multi_r2 * 100
unexplained_variance = (1 - multi_r2) * 100

print("\n5. MODEL SUMMARY")
print(f"Total explained variance by all variables: {explained_variance:.1f}%")
print(f"Unexplained variance: {unexplained_variance:.1f}%")

1. CORRELATION ANALYSIS

Correlation with Yield:
ETN: 0.5274
ET30: 0.3712
alpha: 0.3710
epsilon_r: 0.3308
pi_star: 0.1683
beta: -0.0649
n: -0.6495

2. MULTIPLE LINEAR REGRESSION (ALL VARIABLES)

Overall Model R²: 1.0000
Root Mean Square Error: 0.0000

Standardized Coefficients:
pi_star: 39.1683
alpha: -71.0660
beta: -16.9316
epsilon_r: -49.9022
n: -3.0518
ET30: -151.9387
ETN: 184.9363

3. STATISTICAL SIGNIFICANCE TESTS

T-tests for each variable vs Yield:
pi_star:
  Correlation coefficient: 0.1683
  p-value: 0.6904
alpha:
  Correlation coefficient: 0.3710
  p-value: 0.3655
beta:
  Correlation coefficient: -0.0649
  p-value: 0.8786
epsilon_r:
  Correlation coefficient: 0.3308
  p-value: 0.4236
n:
  Correlation coefficient: -0.6495
  p-value: 0.0813
ET30:
  Correlation coefficient: 0.3712
  p-value: 0.3653
ETN:
  Correlation coefficient: 0.5274
  p-value: 0.1792

4. VARIABLE IMPORTANCE ANALYSIS

Variables ranked by importance (absolute standardized coefficients):
ETN: 184.9363 (coefficie

In [3]:
import pandas as pd
import statsmodels.api as sm

# Define the data
data = {
    "Co-solvent": ["MTBE", "DIIPE", "ACETONE", "HEXANE", "THF", "DEE", "DME", "DIBE"],
    "π*": [0.3, 0.27, 0.71, -0.08, 0.58, 0.27, 0.12, 0.24],
    "α": [0, 0, 0.08, 0, 0, 0, 0.18, 0],
    "β": [0.41, 0.49, 0.55, 0, 0.55, 0.47, 0.2, 0.41],
    "ϵr": [4.6, 4.3, 7.58, 1.89, 7.58, 4.33, 6.3, 4.2],
    "n": [1.369, 1.404, 1.36, 1.37, 1.41, 1.36, 1.31, 1.411],
    "ET(30)": [42.3, 34.1, 42.2, 31.0, 37.4, 34.5, 32.4, 33.0],
    "ETN": [0.284, 0.105, 0.35, 0.009, 0.207, 0.117, 0.221, 0.071],
    "Yield": [97.7, 87.3, 97.91, 94.14, 96, 98.35, 99, 74.3]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Define independent and dependent variables
X = df[["π*", "α", "β", "ϵr", "n", "ET(30)", "ETN"]]  # Features
y = df["Yield"]  # Target

# Add intercept
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Display results
print(model.summary())



                            OLS Regression Results                            
Dep. Variable:                  Yield   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Mon, 04 Nov 2024   Prob (F-statistic):                nan
Time:                        19:30:28   Log-Likelihood:                 203.51
No. Observations:                   8   AIC:                            -391.0
Df Residuals:                       0   BIC:                            -390.4
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1431.8201        inf          0        n

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Step 1: Create the dataset
data = {
    'pi_star': [0.30, 0.27, 0.71, -0.08, 0.58, 0.27, 0.12, 0.24],
    'alpha': [0.00, 0.00, 0.08, 0.00, 0.00, 0.00, 0.18, 0.00],
    'beta': [0.41, 0.49, 0.55, 0.00, 0.55, 0.47, 0.20, 0.41],
    'epsilon_r': [4.60, 4.3, 7.58, 1.89, 7.58, 4.33, 6.3, 4.20],
    'n': [1.369, 1.404, 1.36, 1.37, 1.41, 1.36, 1.31, 1.411],
    'ET30': [42.30, 34.1, 42.20, 31.00, 37.40, 34.50, 32.4, 33.0],
    'ETN': [0.284, 0.105, 0.35, 0.009, 0.207, 0.117, 0.221, 0.071],
    'Yield': [97.7, 87.3, 97.91, 94.14, 96, 98.35, 99, 74.3]
}

df = pd.DataFrame(data)

# Step 2: Separate features (X) and target variable (y)
X = df.drop('Yield', axis=1)
y = df['Yield']

# Step 3: Create and fit the multiple linear regression model
model = LinearRegression()
model.fit(X, y)

# Step 4: Get coefficients and intercept
coefficients = model.coef_
intercept = model.intercept_

# Step 5: Create equation string
variable_names = ['π*', 'α', 'β', 'ϵr', 'n', 'ET(30)', 'ETN']
equation_terms = []
equation_terms.append(f"{intercept:.4f}")

for name, coef in zip(variable_names, coefficients):
    if coef >= 0:
        equation_terms.append(f"+ {coef:.4f}*{name}")
    else:
        equation_terms.append(f"- {abs(coef):.4f}*{name}")

full_equation = " ".join(equation_terms)

# Print results
print("MULTIPLE LINEAR REGRESSION ANALYSIS FOR BIODIESEL YIELD\n")
print("Coefficients for each variable:")
for name, coef in zip(X.columns, coefficients):
    print(f"{name}: {coef:.4f}")

print("\nIntercept (b₀):", f"{intercept:.4f}")

print("\nComplete Equation:")
print("Yield =", full_equation)

# Calculate R-squared
r2 = model.score(X, y)
print(f"\nR² value: {r2:.4f}")

# Calculate predicted values
y_pred = model.predict(X)

# Calculate prediction accuracy
print("\nPrediction Analysis:")
print("\nActual vs Predicted Values:")
for actual, pred in zip(y, y_pred):
    print(f"Actual: {actual:.2f}, Predicted: {pred:.2f}, Difference: {(actual-pred):.2f}")

# Calculate Mean Absolute Error
mae = np.mean(np.abs(y - y_pred))
print(f"\nMean Absolute Error: {mae:.4f}")

MULTIPLE LINEAR REGRESSION ANALYSIS FOR BIODIESEL YIELD

Coefficients for each variable:
pi_star: 169.2036
alpha: -1153.7922
beta: -94.5769
epsilon_r: -27.5027
n: -95.9460
ET30: -37.2977
ETN: 1723.4161

Intercept (b₀): 1431.8201

Complete Equation:
Yield = 1431.8201 + 169.2036*π* - 1153.7922*α - 94.5769*β - 27.5027*ϵr - 95.9460*n - 37.2977*ET(30) + 1723.4161*ETN

R² value: 1.0000

Prediction Analysis:

Actual vs Predicted Values:
Actual: 97.70, Predicted: 97.70, Difference: -0.00
Actual: 87.30, Predicted: 87.30, Difference: 0.00
Actual: 97.91, Predicted: 97.91, Difference: -0.00
Actual: 94.14, Predicted: 94.14, Difference: 0.00
Actual: 96.00, Predicted: 96.00, Difference: -0.00
Actual: 98.35, Predicted: 98.35, Difference: 0.00
Actual: 99.00, Predicted: 99.00, Difference: 0.00
Actual: 74.30, Predicted: 74.30, Difference: 0.00

Mean Absolute Error: 0.0000


In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Step 1: Let's organize our data neatly
print("STEP 1: Looking at our data")
data = {
    'pi_star': [0.30, 0.27, 0.71, -0.08, 0.58, 0.27, 0.12, 0.24],
    'alpha': [0.00, 0.00, 0.08, 0.00, 0.00, 0.00, 0.18, 0.00],
    'beta': [0.41, 0.49, 0.55, 0.00, 0.55, 0.47, 0.20, 0.41],
    'epsilon_r': [4.60, 4.3, 7.58, 1.89, 7.58, 4.33, 6.3, 4.20],
    'n': [1.369, 1.404, 1.36, 1.37, 1.41, 1.36, 1.31, 1.411],
    'ET30': [42.30, 34.1, 42.20, 31.00, 37.40, 34.50, 32.4, 33.0],
    'ETN': [0.284, 0.105, 0.35, 0.009, 0.207, 0.117, 0.221, 0.071],
    'Yield': [97.7, 87.3, 97.91, 94.14, 96, 98.35, 99, 74.3]
}

df = pd.DataFrame(data)
print("\nFirst few rows of our data:")
print(df.head(3))

# Step 2: Separate our input parameters (X) from what we want to predict (y)
print("\nSTEP 2: Separating parameters and yield")
X = df.drop('Yield', axis=1)  # All parameters except Yield
y = df['Yield']               # Only Yield

# Step 3: Create and train our model
print("\nSTEP 3: Creating our equation")
model = LinearRegression()
model.fit(X, y)

# Step 4: Get the coefficients and intercept
print("\nSTEP 4: Getting the coefficients")
coefficients = dict(zip(X.columns, model.coef_))
print("\nHow much each parameter affects yield:")
for param, coef in coefficients.items():
    print(f"{param}: {coef:.4f}")

print(f"\nStarting point (intercept): {model.intercept_:.4f}")

# Step 5: Write out the full equation
print("\nSTEP 5: Final Equation")
equation_parts = [f"{model.intercept_:.4f}"]
for param, coef in coefficients.items():
    if coef >= 0:
        equation_parts.append(f"+ {coef:.4f}×{param}")
    else:
        equation_parts.append(f"- {abs(coef):.4f}×{param}")

equation = " ".join(equation_parts)
print("\nYield =", equation)

# Step 6: Let's test our equation with the first data point
print("\nSTEP 6: Testing our equation")
first_data = df.iloc[0]  # Get first row of data
print("\nLet's predict the yield for MTBE (first data point):")
print("\nUsing these values:")
for param in X.columns:
    print(f"{param}: {first_data[param]}")

predicted_yield = model.predict([first_data[X.columns]])[0]
actual_yield = first_data['Yield']
print(f"\nPredicted Yield: {predicted_yield:.2f}%")
print(f"Actual Yield: {actual_yield}%")
print(f"Difference: {abs(actual_yield - predicted_yield):.2f}%")

# Step 7: Check how good our equation is
print("\nSTEP 7: Checking equation accuracy")
r2 = model.score(X, y)
print(f"\nAccuracy (R² value): {r2:.4f}")
print("This means our equation explains {:.1f}% of the variation in yield".format(r2*100))

STEP 1: Looking at our data

First few rows of our data:
   pi_star  alpha  beta  epsilon_r      n  ET30    ETN  Yield
0     0.30   0.00  0.41       4.60  1.369  42.3  0.284  97.70
1     0.27   0.00  0.49       4.30  1.404  34.1  0.105  87.30
2     0.71   0.08  0.55       7.58  1.360  42.2  0.350  97.91

STEP 2: Separating parameters and yield

STEP 3: Creating our equation

STEP 4: Getting the coefficients

How much each parameter affects yield:
pi_star: 169.2036
alpha: -1153.7922
beta: -94.5769
epsilon_r: -27.5027
n: -95.9460
ET30: -37.2977
ETN: 1723.4161

Starting point (intercept): 1431.8201

STEP 5: Final Equation

Yield = 1431.8201 + 169.2036×pi_star - 1153.7922×alpha - 94.5769×beta - 27.5027×epsilon_r - 95.9460×n - 37.2977×ET30 + 1723.4161×ETN

STEP 6: Testing our equation

Let's predict the yield for MTBE (first data point):

Using these values:
pi_star: 0.3
alpha: 0.0
beta: 0.41
epsilon_r: 4.6
n: 1.369
ET30: 42.3
ETN: 0.284

Predicted Yield: 97.70%
Actual Yield: 97.7%
Differen