In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import curve_fit
import seaborn as sns
from groq import Groq
import fpdf
from fpdf import FPDF
import os
from dotenv import load_dotenv

In [None]:
# Load the dataset.
# The path is relative, so it resolves against the kernel's working directory.
# Later cells read the 'x' and 'y' columns of `data`.
file_path = './scr-dataset.csv'
data = pd.read_csv(file_path)

In [None]:
# Quick look at the data: its dimensions first, then the leading rows.
print("Dataset shape:", data.shape)
print("\nFirst few rows of the dataset:", data.head(), sep="\n")

In [None]:
# Summary statistics of the dataset, printed under a heading.
print("\nBasic statistics:", data.describe(), sep="\n")

In [None]:
# Number of missing entries in each column.
print("\nMissing values:", data.isnull().sum(), sep="\n")

In [None]:
# Exploratory Data Analysis: scatter of the raw (x, y) points, written to
# 'scatter_plot.png' and then closed.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(data['x'], data['y'], alpha=0.6)
ax.set_title('Scatter Plot of x vs y')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.grid(True, alpha=0.3)
fig.savefig('scatter_plot.png')
plt.close(fig)

In [None]:
# The data appears to have a sinusoidal pattern, so the first candidate model
# is a plain sine wave.
def sine_function(x, amplitude, frequency, phase, offset):
    """Evaluate ``amplitude * sin(frequency * x + phase) + offset``.

    ``x`` may be a scalar or anything NumPy can broadcast; the result has the
    matching shape. The parameter order after ``x`` is the order curve_fit
    uses for its fitted parameter vector.
    """
    angle = frequency * x + phase
    return amplitude * np.sin(angle) + offset

In [None]:
# Initial parameter guess, passed to curve_fit as its starting point.
# The order must match sine_function's parameters after x.
p0 = [1.0, 1.0, 0.0, 0.0]  # [amplitude, frequency, phase, offset]

In [None]:
# Fit the sine function.
# The whole try/except lives in one cell: a cell holding only the `try` half
# (or only the `except` half) is a SyntaxError and can never run.
try:
    params, params_covariance = curve_fit(sine_function, data['x'], data['y'], p0=p0)
    print("\nSine function parameters:")
    print(f"Amplitude: {params[0]:.4f}")
    print(f"Frequency: {params[1]:.4f}")
    print(f"Phase: {params[2]:.4f}")
    print(f"Offset: {params[3]:.4f}")

    # Generate predictions using the fitted sine function
    y_pred_sine = sine_function(data['x'], *params)

    # Calculate metrics for sine function (scored on the same rows it was fitted on)
    mse_sine = mean_squared_error(data['y'], y_pred_sine)
    r2_sine = r2_score(data['y'], y_pred_sine)

    print(f"\nSine function - Mean Squared Error: {mse_sine:.4f}")
    print(f"Sine function - R² Score: {r2_sine:.4f}")

    # Plot the sine function fit
    plt.figure(figsize=(12, 6))
    plt.scatter(data['x'], data['y'], alpha=0.6, label='Data')

    # Generate a smooth curve for the fitted function over the observed x range
    x_smooth = np.linspace(data['x'].min(), data['x'].max(), 1000)
    y_smooth = sine_function(x_smooth, *params)
    plt.plot(x_smooth, y_smooth, 'r-', label='Fitted Sine Function')

    plt.title('Sine Function Fit')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('sine_fit.png')
    plt.close()
except RuntimeError as e:
    # curve_fit raises RuntimeError when the least-squares minimisation fails.
    # `params` and `r2_sine` then stay undefined; later cells detect that by
    # catching NameError.
    print(f"Error fitting sine function: {e}")

In [None]:
# Let's also try polynomial regression with different degrees.
#
# Figure creation, the per-degree subplots and the save all happen in this one
# cell, and drawing goes through an explicit figure handle. pyplot works on an
# implicit "current figure", and the notebook's default inline backend closes
# figures when a cell finishes, so spreading these steps over several cells
# would save a different figure than the one created up front.
degrees = [3, 5, 7, 9, 11]
poly_models = {}  # degree -> (fitted LinearRegression, fitted PolynomialFeatures)

# Fit on a plain (n_samples, 1) array rather than a DataFrame: the transformers
# are later applied to plain arrays/lists, and a transformer fitted with column
# names warns about missing feature names on every such call.
x_column = data[['x']].to_numpy()

# Order of the points along x, so each fitted curve is drawn left to right.
# It does not depend on the degree, hence computed once.
sort_idx = np.argsort(x_column[:, 0])

fig = plt.figure(figsize=(14, 10))

for i, degree in enumerate(degrees, 1):
    # Create polynomial features
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(x_column)

    # Fit polynomial regression
    poly_model = LinearRegression()
    poly_model.fit(X_poly, data['y'])

    # Store the model and features for later prediction
    poly_models[degree] = (poly_model, poly_features)

    # Generate predictions (on the same rows used for fitting)
    y_pred_poly = poly_model.predict(X_poly)

    # Calculate metrics
    mse_poly = mean_squared_error(data['y'], y_pred_poly)
    r2_poly = r2_score(data['y'], y_pred_poly)

    print(f"\nPolynomial (degree {degree}) - Mean Squared Error: {mse_poly:.4f}")
    print(f"Polynomial (degree {degree}) - R² Score: {r2_poly:.4f}")

    # Plot polynomial fit in its own row of the shared figure
    ax = fig.add_subplot(len(degrees), 1, i)
    ax.scatter(data['x'], data['y'], alpha=0.4, label='Data')
    ax.plot(x_column[sort_idx, 0], y_pred_poly[sort_idx], 'r-',
            label=f'Polynomial (degree {degree})')

    ax.set_title(f'Polynomial Regression (degree {degree})')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('polynomial_fits.png')
plt.close(fig)

In [None]:
# Let's try a more complex model: a sine wave whose amplitude is scaled by an
# exponential in x.
def damped_sine(x, amplitude, frequency, phase, offset, decay):
    """Evaluate ``amplitude * sin(frequency * x + phase) * exp(-decay * x) + offset``.

    ``x`` may be a scalar or anything NumPy can broadcast. The parameter order
    after ``x`` is the order curve_fit uses for its fitted parameter vector.
    """
    oscillation = amplitude * np.sin(frequency * x + phase)
    envelope = np.exp(-decay * x)
    return oscillation * envelope + offset

In [None]:
# Initial parameter guess for damped sine, passed to curve_fit as its starting point.
# The order must match damped_sine's parameters after x.
p0_damped = [1.0, 1.0, 0.0, 0.0, 0.1]  # [amplitude, frequency, phase, offset, decay]

In [None]:
# Fit the damped sine function.
# The whole try/except lives in one cell: a cell holding only the `try` half
# (or only the `except` half) is a SyntaxError and can never run.
try:
    params_damped, _ = curve_fit(damped_sine, data['x'], data['y'], p0=p0_damped)
    print("\nDamped sine function parameters:")
    print(f"Amplitude: {params_damped[0]:.4f}")
    print(f"Frequency: {params_damped[1]:.4f}")
    print(f"Phase: {params_damped[2]:.4f}")
    print(f"Offset: {params_damped[3]:.4f}")
    print(f"Decay: {params_damped[4]:.4f}")

    # Generate predictions using the fitted damped sine function
    y_pred_damped = damped_sine(data['x'], *params_damped)

    # Calculate metrics for damped sine function (scored on the fitting rows)
    mse_damped = mean_squared_error(data['y'], y_pred_damped)
    r2_damped = r2_score(data['y'], y_pred_damped)

    print(f"\nDamped sine function - Mean Squared Error: {mse_damped:.4f}")
    print(f"Damped sine function - R² Score: {r2_damped:.4f}")

    # Plot the damped sine function fit
    plt.figure(figsize=(12, 6))
    plt.scatter(data['x'], data['y'], alpha=0.6, label='Data')

    # Generate a smooth curve for the fitted function over the observed x range
    x_smooth = np.linspace(data['x'].min(), data['x'].max(), 1000)
    y_smooth = damped_sine(x_smooth, *params_damped)
    plt.plot(x_smooth, y_smooth, 'r-', label='Fitted Damped Sine Function')

    plt.title('Damped Sine Function Fit')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('damped_sine_fit.png')
    plt.close()
except RuntimeError as e:
    # curve_fit raises RuntimeError when the least-squares minimisation fails.
    # `params_damped` and `r2_damped` then stay undefined; later cells detect
    # that by catching NameError.
    print(f"Error fitting damped sine function: {e}")

In [None]:
# Compare all models in one plot.
#
# Everything that draws on the figure, and the final save, is kept in this one
# cell and goes through an explicit figure/axes pair. The notebook's default
# inline backend closes figures when a cell finishes, so creating the figure in
# one cell and saving it from another would write a different (empty) figure.
fig, ax = plt.subplots(figsize=(14, 8))
ax.scatter(data['x'], data['y'], alpha=0.6, label='Data')

# Sort data for smooth plotting
sort_idx = np.argsort(data['x'])
x_sorted = data['x'].iloc[sort_idx].values
x_smooth = np.linspace(data['x'].min(), data['x'].max(), 1000)

# Plot sine function
try:
    y_smooth_sine = sine_function(x_smooth, *params)
    ax.plot(x_smooth, y_smooth_sine, 'r-', label='Sine Function')
except NameError:
    # `params` is only defined when the sine fit succeeded; skip the curve otherwise.
    pass

# Plot damped sine function
try:
    y_smooth_damped = damped_sine(x_smooth, *params_damped)
    ax.plot(x_smooth, y_smooth_damped, 'g-', label='Damped Sine Function')
except NameError:
    # `params_damped` is only defined when the damped fit succeeded.
    pass

# Plot one polynomial fit: the highest degree that was fitted. This is a fixed
# choice, not a selection by score.
try:
    best_degree = degrees[-1]
    poly_model, poly_features = poly_models[best_degree]

    X_smooth_poly = poly_features.transform(x_smooth.reshape(-1, 1))
    y_smooth_poly = poly_model.predict(X_smooth_poly)

    ax.plot(x_smooth, y_smooth_poly, 'b-', label=f'Polynomial (degree {best_degree})')
except Exception as e:
    print(f"Error plotting polynomial: {e}")

ax.set_title('Comparison of Different Models')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend()
ax.grid(True, alpha=0.3)
fig.savefig('model_comparison.png')
plt.close(fig)

In [None]:
# Start the list of (model name, R² score) pairs used to pick the best model.
# This cell only adds the sine model; the following cells add the others.
models = []
try:
    models.append(('Sine Function', r2_sine))
except NameError:
    # r2_sine is only defined when the sine fit succeeded; leave the model out otherwise.
    pass

In [None]:
# Add the damped sine model's R² score to the comparison list, if it was fitted.
try:
    models.append(('Damped Sine Function', r2_damped))
except NameError:
    # r2_damped is only defined when the damped sine fit succeeded.
    pass

In [None]:
# Add one (name, R² score) entry per stored polynomial model.
# The scores are recomputed here from the stored (model, features) pairs,
# using the same rows the models were fitted on.
for degree in degrees:
    poly_model, poly_features = poly_models[degree]
    X_poly = poly_features.transform(data[['x']].values)
    y_pred_poly = poly_model.predict(X_poly)
    r2_poly = r2_score(data['y'], y_pred_poly)
    models.append((f'Polynomial (degree {degree})', r2_poly))

In [None]:
# Pick the entry with the highest R² score and report it.
# `best_model` is a (name, score) tuple; it is left undefined if `models` is empty.
if models:
    best_model = max(models, key=lambda x: x[1])
    print(f"\nBest model: {best_model[0]} with R² Score: {best_model[1]:.4f}")

In [None]:
# Function to predict y for any x value using all models
def predict_for_x(x_value):
    """Print and return every available model's prediction for one x value.

    Reads the notebook globals ``params``, ``params_damped``, ``degrees``,
    ``poly_models`` and ``best_model``. The two curve-fit models and
    ``best_model`` are optional: if the corresponding global was never
    defined, that part is skipped with a message instead of raising.

    Args:
        x_value: A single numeric x (the results are formatted as scalars).

    Returns:
        dict mapping model name to its predicted y. Models that were not
        available, or whose prediction raised, are absent.
    """
    print(f"\n--- Prediction for x = {x_value} ---")

    predictions = {}

    # Prediction using sine function
    try:
        y_pred_sine = sine_function(x_value, *params)
        print(f"Sine function prediction: y = {y_pred_sine:.4f}")
        predictions['Sine Function'] = y_pred_sine
    except NameError:
        print("Sine function model not available")

    # Prediction using damped sine function
    try:
        y_pred_damped = damped_sine(x_value, *params_damped)
        print(f"Damped sine function prediction: y = {y_pred_damped:.4f}")
        predictions['Damped Sine Function'] = y_pred_damped
    except NameError:
        print("Damped sine function model not available")

    # Prediction using polynomial models
    for degree in degrees:
        try:
            poly_model, poly_features = poly_models[degree]
            X_poly = poly_features.transform([[x_value]])
            y_pred_poly = poly_model.predict(X_poly)[0]
            print(f"Polynomial (degree {degree}) prediction: y = {y_pred_poly:.4f}")
            predictions[f'Polynomial (degree {degree})'] = y_pred_poly
        except Exception as e:
            print(f"Error predicting with polynomial (degree {degree}): {e}")

    # Identify the best model's prediction. `best_model` is only assigned when
    # at least one model was scored, so it gets the same NameError guard as the
    # other optional globals instead of aborting after the work is done.
    try:
        best_name = best_model[0]
    except NameError:
        best_name = None
    if best_name is not None and best_name in predictions:
        print(f"\nBest model ({best_name}) prediction: y = {predictions[best_name]:.4f}")

    return predictions

In [None]:
# Allow user to input x values for prediction
def interactive_prediction():
    """Prompt for x values until the user quits; predict and plot each one.

    For every valid number entered, calls ``predict_for_x`` and saves a figure
    named ``prediction_x_<value>.png`` showing the data, the available model
    curves over a range extended to include the value, and a marker at each
    model's prediction. Entering ``q`` ends the loop.

    The loop also ends if the prompt itself fails (for example when the
    notebook is executed without an interactive stdin). Retrying a failed
    prompt would fail the same way every time and never terminate.

    Returns:
        None
    """
    while True:
        try:
            user_input = input("\nEnter an x value to predict y (or 'q' to quit): ")
        except Exception as e:
            print(f"Input is not available, stopping interactive prediction: {e}")
            break

        if user_input.strip().lower() == 'q':
            break

        # Only the conversion is treated as "bad input"; errors raised later by
        # prediction or plotting are reported as what they are.
        try:
            x_value = float(user_input)
        except ValueError:
            print("Please enter a valid number or 'q' to quit.")
            continue

        fig = None
        try:
            predict_for_x(x_value)

            # Visualize this prediction
            fig, ax = plt.subplots(figsize=(14, 8))
            ax.scatter(data['x'], data['y'], alpha=0.6, label='Data')

            # Extended range for visualization: the data range, widened so the
            # requested x sits at least 5 units inside the plot
            min_x = min(data['x'].min(), x_value - 5)
            max_x = max(data['x'].max(), x_value + 5)
            x_extended = np.linspace(min_x, max_x, 1000)

            # Plot sine function with extended range
            try:
                y_extended_sine = sine_function(x_extended, *params)
                ax.plot(x_extended, y_extended_sine, 'r-', label='Sine Function')
                ax.scatter(x_value, sine_function(x_value, *params), color='red', s=100,
                           marker='x')
            except NameError:
                # sine fit not available
                pass

            # Plot damped sine function with extended range
            try:
                y_extended_damped = damped_sine(x_extended, *params_damped)
                ax.plot(x_extended, y_extended_damped, 'g-', label='Damped Sine Function')
                ax.scatter(x_value, damped_sine(x_value, *params_damped), color='green', s=100,
                           marker='x')
            except NameError:
                # damped sine fit not available
                pass

            # Plot the highest-degree polynomial with extended range
            try:
                best_degree = degrees[-1]
                poly_model, poly_features = poly_models[best_degree]

                X_extended_poly = poly_features.transform(x_extended.reshape(-1, 1))
                y_extended_poly = poly_model.predict(X_extended_poly)

                ax.plot(x_extended, y_extended_poly, 'b-', label=f'Polynomial (degree {best_degree})')

                X_poly_point = poly_features.transform([[x_value]])
                y_poly_point = poly_model.predict(X_poly_point)[0]
                ax.scatter(x_value, y_poly_point, color='blue', s=100, marker='x')
            except Exception as e:
                print(f"Error plotting extended polynomial: {e}")

            ax.axvline(x=x_value, color='gray', linestyle='--', alpha=0.5)
            ax.set_title(f'Model Predictions with Highlight at x = {x_value}')
            ax.set_xlabel('x')
            ax.set_ylabel('y')
            ax.legend()
            ax.grid(True, alpha=0.3)
            fig.savefig(f'prediction_x_{x_value}.png')
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            # Close the figure even when plotting failed part-way, so repeated
            # prompts do not accumulate open figures.
            if fig is not None:
                plt.close(fig)

In [None]:
# First, let's predict for x = 50 as an example.
# The call prints each model's prediction; as the last expression of the cell,
# the returned dict is also displayed.
predict_for_x(50)

In [None]:
# Then, let's allow interactive predictions.
# NOTE(review): this cell waits for keyboard input, so an unattended
# "Run All" stops here until someone answers the prompt.
print("\n\nYou can now predict y for any x value interactively.")
interactive_prediction()

In [None]:
# Save the models for later use
import pickle

In [None]:
# Save the models to files
try:
    # Serialise before opening the file: if `params` was never defined, the
    # NameError is raised here and no empty 'sine_model.pkl' is left on disk.
    sine_payload = pickle.dumps(params)
    # Save sine function parameters
    with open('sine_model.pkl', 'wb') as f:
        f.write(sine_payload)
    print("Sine function model saved to 'sine_model.pkl'")
except NameError:
    print("Sine function model not available for saving")

In [None]:
try:
    # Serialise before opening the file: if `params_damped` was never defined,
    # the NameError is raised here and no empty 'damped_sine_model.pkl' is left
    # on disk.
    damped_payload = pickle.dumps(params_damped)
    # Save damped sine function parameters
    with open('damped_sine_model.pkl', 'wb') as f:
        f.write(damped_payload)
    print("Damped sine function model saved to 'damped_sine_model.pkl'")
except NameError:
    print("Damped sine function model not available for saving")

In [None]:
# Save polynomial models: one file per degree, each holding the
# (LinearRegression, PolynomialFeatures) pair stored in `poly_models`.
for degree in degrees:
    try:
        # Look up and serialise first, so a missing or unpicklable entry
        # cannot leave a truncated file behind.
        poly_payload = pickle.dumps(poly_models[degree])
        with open(f'poly_model_degree_{degree}.pkl', 'wb') as f:
            f.write(poly_payload)
        print(f"Polynomial model (degree {degree}) saved to 'poly_model_degree_{degree}.pkl'")
    except Exception as e:
        print(f"Error saving polynomial model (degree {degree}): {e}")

In [None]:
# Closing message for the save step.
# NOTE(review): printed unconditionally, including when a save above reported
# that a model was unavailable or failed to write.
print("\nAll models have been saved. You can load them later for predictions without retraining.")

In [None]:
# Function to generate a summary of results using Groq API
def generate_analysis_with_groq(model_results, predictions, model="llama3-70b-8192"):
    """Ask a Groq-hosted chat model to write up the modelling results.

    Builds a prompt from the notebook's global ``data`` (shape and x/y ranges)
    plus the two text blocks passed in, sends it as a single user message and
    returns the reply text.

    Args:
        model_results: Text block describing model performance.
        predictions: Text block listing the predictions for x = 50.
        model: Groq model identifier to query.
            NOTE(review): confirm the default id is still served by Groq;
            hosted model ids are retired over time.

    Returns:
        The generated analysis as a string, or an empty string if the response
        carries no text content.

    Raises:
        RuntimeError: if GROQ_API_KEY is not set after loading the .env file.
    """
    load_dotenv()  # Load API key from .env file

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        # Fail before any network setup, with a message that says what to fix.
        raise RuntimeError(
            "GROQ_API_KEY is not set; define it in the environment or in a .env file."
        )

    # Initialize Groq client
    client = Groq(api_key=api_key)

    # Create a prompt with the model results
    prompt = f"""
    I have fitted several models to a dataset with x and y values. Here are the results:
    
    Dataset Information:
    - Shape: {data.shape}
    - X range: {data['x'].min()} to {data['x'].max()}
    - Y range: {data['y'].min()} to {data['y'].max()}
    
    Model Performance:
    {model_results}
    
    Predictions for x = 50:
    {predictions}
    
    Please analyze these results and explain:
    1. Which model performed best and why?
    2. What pattern does the data follow?
    3. Why do some models perform poorly for extrapolation?
    4. What would be your recommendation for making predictions with this data?
    
    Format your response in a clear, professional way suitable for a technical report.
    """

    # Call Groq API
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
    )

    # Return the generated analysis; normalise a missing content field to ""
    # so callers can always treat the result as text.
    content = chat_completion.choices[0].message.content
    return content if content is not None else ""

In [None]:
# Function to create a PDF report
def _latin1_safe(text):
    """Return ``text`` as a string with non-Latin-1 characters replaced by '?'."""
    return str(text).encode("latin-1", "replace").decode("latin-1")


def create_pdf_report(analysis, model_results, predictions, best_model_name):
    """Write 'model_analysis_report.pdf' summarising the modelling run.

    The report contains, in order: a title and timestamp, dataset information
    read from the notebook's global ``data``, the model performance text, the
    best model's name, the predictions text, one page per figure file that
    exists on disk, and finally the analysis text split into paragraphs.

    All caller-supplied text is passed through ``_latin1_safe`` first. The
    report uses a built-in font, which can only encode Latin-1; generated
    analysis text routinely contains characters outside it (bullets, dashes,
    curly quotes) and would otherwise abort PDF generation.

    Args:
        analysis: Free-form analysis text; paragraphs are separated by blank
            lines. ``None`` is treated as empty.
        model_results: Text block describing model performance.
        predictions: Text block listing predictions for x = 50.
        best_model_name: Name of the best-scoring model.

    Returns:
        Path of the written PDF file.
    """
    from datetime import datetime

    pdf = FPDF()
    pdf.add_page()

    # Title
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "Data Analysis and Model Prediction Report", ln=True, align="C")
    pdf.ln(5)

    # Add date
    pdf.set_font("Arial", "I", 10)
    pdf.cell(0, 10, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=True)
    pdf.ln(5)

    # Dataset information
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, "Dataset Information:", ln=True)
    pdf.set_font("Arial", "", 10)
    pdf.multi_cell(0, 5, f"Shape: {data.shape}\nX range: {data['x'].min()} to {data['x'].max()}\nY range: {data['y'].min():.4f} to {data['y'].max():.4f}")
    pdf.ln(5)

    # Model performance
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, "Model Performance:", ln=True)
    pdf.set_font("Arial", "", 10)
    pdf.multi_cell(0, 5, _latin1_safe(model_results))
    pdf.ln(5)

    # Best model
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, _latin1_safe(f"Best Model: {best_model_name}"), ln=True)
    pdf.ln(5)

    # Predictions
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, "Predictions for x = 50:", ln=True)
    pdf.set_font("Arial", "", 10)
    pdf.multi_cell(0, 5, _latin1_safe(predictions))
    pdf.ln(5)

    # Add images: each figure that was actually produced gets its own page
    for img in ['scatter_plot.png', 'sine_fit.png', 'damped_sine_fit.png', 'model_comparison.png']:
        if os.path.exists(img):
            pdf.add_page()
            pdf.set_font("Arial", "B", 12)
            pdf.cell(0, 10, f"Figure: {img.replace('_', ' ').replace('.png', '')}", ln=True)
            pdf.image(img, x=10, y=30, w=180)

    # Analysis from Groq
    pdf.add_page()
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, "AI Analysis of Results:", ln=True)
    pdf.ln(5)
    pdf.set_font("Arial", "", 10)

    # Split the analysis into paragraphs and add to PDF, skipping empty ones
    paragraphs = _latin1_safe(analysis if analysis is not None else "").split('\n\n')
    for paragraph in paragraphs:
        if paragraph.strip():
            pdf.multi_cell(0, 5, paragraph.strip())
            pdf.ln(3)

    # Save the PDF
    pdf_path = "model_analysis_report.pdf"
    pdf.output(pdf_path)
    print(f"\nReport saved as {pdf_path}")
    return pdf_path

In [None]:
# Report step: turn the scores and the x = 50 predictions into text, request a
# written analysis from Groq, and assemble everything into the PDF report.
if models:
    # One "name: R² Score = value" line per scored model
    model_results_text = "\n".join(
        f"{name}: R² Score = {score:.4f}" for name, score in models
    )

    # One line per model that can produce a prediction for x = 50; models whose
    # fit is unavailable are left out
    prediction_lines = []
    try:
        prediction_lines.append(f"Sine Function: {sine_function(50, *params):.4f}\n")
    except NameError:
        pass

    try:
        prediction_lines.append(f"Damped Sine Function: {damped_sine(50, *params_damped):.4f}\n")
    except NameError:
        pass

    for degree in degrees:
        try:
            poly_model, poly_features = poly_models[degree]
            X_poly = poly_features.transform([[50]])
            y_pred = poly_model.predict(X_poly)[0]
            prediction_lines.append(f"Polynomial (degree {degree}): {y_pred:.4f}\n")
        except Exception:
            pass

    predictions_text = "".join(prediction_lines)

    # Generate analysis with Groq
    print("\nGenerating analysis with Groq API...")
    analysis = generate_analysis_with_groq(model_results_text, predictions_text)

    # Create PDF report
    print("Creating PDF report...")
    pdf_path = create_pdf_report(analysis, model_results_text, predictions_text, best_model[0])

    print(f"Analysis complete! PDF report saved as {pdf_path}")