In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, r2_score

# Sample dataset: 20 sales records. Each categorical column is a short cycle
# repeated to fill 20 rows (regions rotate every 4 rows; reps, customers and
# products rotate every 5), so they are built by repetition rather than
# spelled out in full.
_regions = ['North', 'South', 'East', 'West']
_sales_reps = ['Alice', 'Bob', 'Carol', 'David', 'Eve']
_customers = ['ExcelIsFun Corp', 'Tech Supplies Inc.', 'OfficeMart', 'AllThingsShop', 'BizWorks']
_products = ['Widget A', 'Widget B', 'Widget C', 'Widget D', 'Widget E']

# Numeric columns, laid out five records per line.
_cogs = [
    500, 300, 400, 250, 350,
    600, 400, 300, 200, 350,
    550, 450, 500, 400, 300,
    700, 300, 350, 400, 250,
]
_sales = [
    700, 450, 600, 350, 500,
    800, 700, 450, 300, 500,
    750, 600, 700, 550, 400,
    950, 500, 600, 700, 400,
]

data = pd.DataFrame({
    'Region': _regions * 5,
    'Sales Rep': _sales_reps * 4,
    'Customer': _customers * 4,
    'Product': _products * 4,
    'COGS': _cogs,
    'Sales': _sales,
})

# Feature preparation and train/test split.
#
# The split is decided BEFORE the encoder is fitted, and the encoder is fitted
# on the training rows only. Fitting it on the full dataset lets preprocessing
# see the held-out rows (data leakage) and hides the failure that occurs when
# a category never seen during fitting shows up later.
categorical_cols = ['Region', 'Sales Rep', 'Customer', 'Product']

# Work on a 0..n-1 index so the label-based selection below is unambiguous.
rows = data.reset_index(drop=True)

# Train-test split (decides which rows are held out for evaluation)
train_rows, test_rows = train_test_split(rows, test_size=0.2, random_state=42)

# One-hot encode categorical variables.
# drop='first' removes one level per column to avoid the dummy-variable trap.
# handle_unknown='ignore' encodes a category absent from the training rows as
# all zeros (the same encoding as the dropped level; scikit-learn warns when
# this happens) instead of raising at transform time.
# NOTE: in the sample dataset in this file, 'Sales Rep', 'Customer' and
# 'Product' all repeat with the same period of 5 rows, so their encoded
# columns carry the same information and the individual coefficients should
# not be interpreted.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoder.fit(train_rows[categorical_cols])
encoded_features = encoder.transform(rows[categorical_cols])
encoded_columns = encoder.get_feature_names_out(categorical_cols)
encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns, index=rows.index)

# Combine encoded features with numerical data
final_data = pd.concat([encoded_df, rows[['COGS', 'Sales']]], axis=1)

# Define features (X) and target (y)
X = final_data.drop(columns=['Sales'])
y = final_data['Sales']

# Materialise the split chosen above; row order matches train_test_split's.
X_train, X_test = X.loc[train_rows.index], X.loc[test_rows.index]
y_train, y_test = y.loc[train_rows.index], y.loc[test_rows.index]

# Fit an ordinary least-squares regression on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Evaluation metrics: goodness of fit and average absolute error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report the metrics
print("Mean Absolute Error (MAE):", mae)
print("R^2 Score:", r2)

# Optional: side-by-side table of actual vs. predicted sales, indexed by the
# original row labels carried on y_test
predictions = y_test.to_frame(name='Actual Sales')
predictions['Predicted Sales'] = y_pred
print(predictions)


Mean Absolute Error (MAE): 55.97058823529102
R^2 Score: 0.889988013318543
    Actual Sales  Predicted Sales
0            700       656.058824
17           600       507.764706
15           950       894.058824
1            450       481.764706
