In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_path = '/content/drive/MyDrive/sales_data_with_issues.csv'
data = pd.read_csv(data_path)
data

Unnamed: 0,Category_Technology,Category_Office Supplies,Category_Furniture,City,Sales
0,0,0,1,washington,832.698188
1,1,0,0,houston,832.584084
2,0,0,1,los angeles,793.030324
3,0,0,1,phoenix,876.387212
4,0,1,0,new york,847.019964
...,...,...,...,...,...
5005,0,1,0,phoenix,926.859340
5006,1,0,0,san jose,981.280420
5007,0,1,0,san francisco,752.365337
5008,1,0,0,denver,894.706730


In [None]:
unique_cities = data['City'].unique()

# Display the unique cities
print("Unique cities in the dataset:")
for city in unique_cities:
    print(city)


Unique cities in the dataset:
washington
houston
los angeles
phoenix
new york
philadelphia
san antonio
san diego
charlotte
fort worth
seattle
san jose
jacksonville
nan
chicago
denver
dallas
columbus
san francisco
austin
indianapolis
CHARLOTTE
PHILADELPHIA
JACKSONVILLE
FORT WORTH
AUSTIN
SAN JOSE
PHOENIX
SEATTLE
DENVER
DALLAS
INDIANAPOLIS
HOUSTON


In [None]:


# Data Cleaning
data['Sales'].fillna(data['Sales'].median(), inplace=True)  # Replace missing sales with median
data['City'].fillna('Unknown', inplace=True)  # Replace missing cities with 'Unknown'
data.drop_duplicates(inplace=True)  # Remove duplicates
data['City'] = data['City'].str.lower().str.strip()  # Standardize text formatting

# Handle outliers in 'Sales' column
q1 = data['Sales'].quantile(0.25)
q3 = data['Sales'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
data['Sales'] = np.where(data['Sales'] > upper_bound, upper_bound, data['Sales'])
data['Sales'] = np.where(data['Sales'] < lower_bound, lower_bound, data['Sales'])

print("Data cleaning complete.")
data.head()


Data cleaning complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Sales'].fillna(data['Sales'].median(), inplace=True)  # Replace missing sales with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['City'].fillna('Unknown', inplace=True)  # Replace missing cities with 'Unknown'


Unnamed: 0,Category_Technology,Category_Office Supplies,Category_Furniture,City,Sales
0,0,0,1,washington,832.698188
1,1,0,0,houston,832.584084
2,0,0,1,los angeles,793.030324
3,0,0,1,phoenix,876.387212
4,0,1,0,new york,847.019964


In [None]:

# Feature Engineering
data['Total_Categories'] = data[['Category_Technology', 'Category_Office Supplies', 'Category_Furniture']].sum(axis=1)
data = pd.get_dummies(data, columns=['City'], drop_first=True)
print("Feature engineering complete.")
data.head()


Feature engineering complete.


Unnamed: 0,Category_Technology,Category_Office Supplies,Category_Furniture,Sales,Total_Categories,City_charlotte,City_chicago,City_columbus,City_dallas,City_denver,...,City_new york,City_philadelphia,City_phoenix,City_san antonio,City_san diego,City_san francisco,City_san jose,City_seattle,City_unknown,City_washington
0,0,0,1,832.698188,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,1,0,0,832.584084,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,0,1,793.030324,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,0,1,876.387212,1,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,0,1,0,847.019964,1,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False




In [None]:

# Splitting data into features and target
X = data.drop(columns=['Sales'])
y = data['Sales']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

# Training and evaluation
results = {}
predictions = {}
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    predictions[model_name] = y_pred
    results[model_name] = {
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R2 Score": r2_score(y_test, y_pred)
    }

print("Model Evaluation Metrics:")
for model_name, metrics in results.items():
    print(f"{model_name}: RMSE={metrics['RMSE']}, MAE={metrics['MAE']}, R2 Score={metrics['R2 Score']}")


Model Evaluation Metrics:
Linear Regression: RMSE=52.13041181030683, MAE=42.38686537252008, R2 Score=0.5456098038556614
Random Forest: RMSE=52.43452737566741, MAE=42.56413891526253, R2 Score=0.5402927460817719
Gradient Boosting: RMSE=53.14132057415051, MAE=43.02424038346773, R2 Score=0.5278159355253216
XGBoost: RMSE=52.54837419174375, MAE=42.6826985130215, R2 Score=0.5382943291424471


In [None]:

# Ensemble Models

# Averaging ensemble
linear_pred = predictions["Linear Regression"]
random_forest_pred = predictions["Random Forest"]
gradient_boosting_pred = predictions["Gradient Boosting"]
xgboost_pred = predictions["XGBoost"]
ensemble_pred_avg = (linear_pred + random_forest_pred + gradient_boosting_pred + xgboost_pred) / 4

# Evaluate averaging ensemble
rmse_avg = np.sqrt(mean_squared_error(y_test, ensemble_pred_avg))
mae_avg = mean_absolute_error(y_test, ensemble_pred_avg)
r2_avg = r2_score(y_test, ensemble_pred_avg)
print(f"Averaging Ensemble -> RMSE: {rmse_avg}, MAE: {mae_avg}, R²: {r2_avg}")

# Stacking ensemble
stacked_features = np.column_stack((linear_pred, random_forest_pred, gradient_boosting_pred, xgboost_pred))
meta_model = LinearRegression()
meta_model.fit(stacked_features, y_test)
ensemble_pred_stacked = meta_model.predict(stacked_features)

# Evaluate stacking ensemble
rmse_stacked = np.sqrt(mean_squared_error(y_test, ensemble_pred_stacked))
mae_stacked = mean_absolute_error(y_test, ensemble_pred_stacked)
r2_stacked = r2_score(y_test, ensemble_pred_stacked)
print(f"Stacked Ensemble -> RMSE: {rmse_stacked}, MAE: {mae_stacked}, R²: {r2_stacked}")


Averaging Ensemble -> RMSE: 52.31232290316404, MAE: 42.50454230036928, R²: 0.5424330463924762
Stacked Ensemble -> RMSE: 52.074255532425546, MAE: 42.4025180757458, R²: 0.5465882391877996


In [None]:
!pip install streamlit pyngrok




In [None]:
import joblib
joblib.dump(meta_model, "meta_model.pkl")

['meta_model.pkl']

In [None]:
!pip install streamlit pyngrok




In [None]:
with open("app.py", "w") as f:
    f.write("""
import streamlit as st
import numpy as np
import joblib

# Load the meta-model and scaler
meta_model = joblib.load("meta_model.pkl")
scaler = joblib.load("scaler.pkl")

# Feature input options
cities = [
    "charlotte", "chicago", "columbus", "dallas", "denver", "fort worth",
    "houston", "indianapolis", "jacksonville", "los angeles", "new york",
    "philadelphia", "phoenix", "san antonio", "san diego"
]

categories = [0, 1]  # Options for binary values

# Streamlit app
st.title("Sales Forecasting App")
st.header("Enter the Features")

# User input fields
Category_Technology = st.selectbox("Category: Technology (1 for Yes, 0 for No)", categories)
Category_Office_Supplies = st.selectbox("Category: Office Supplies (1 for Yes, 0 for No)", categories)
Category_Furniture = st.selectbox("Category: Furniture (1 for Yes, 0 for No)", categories)
Total_Categories = st.number_input("Total Categories (e.g., 1, 2, 3)", min_value=1, step=1)
City = st.selectbox("City", cities)

# Predict button
if st.button("Predict Sales"):
    # Prepare input features
    city_features = [1 if c == City else 0 for c in cities]
    input_features = np.array([[Category_Technology, Category_Office_Supplies, Category_Furniture, Total_Categories] + city_features])

    # Debugging: Print input feature shape
    st.write(f"Input features shape: {input_features.shape}")

    # Scale the features
    scaled_features = scaler.transform(input_features)

    # Make prediction using the meta-model
    predicted_sales = meta_model.predict(scaled_features)[0]

    # Display the result
    st.success(f"Predicted Sales: ${round(predicted_sales, 2)}")
""")


In [None]:
!ngrok authtoken 2pe98ltrGddUEwciPJGU6STpflZ_7wzs5F9LMLgckAWvWwFvE

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
!streamlit run app.py &>/dev/null&


In [None]:
from pyngrok import ngrok

# Expose Streamlit port 8501
public_url = ngrok.connect(addr="8501", proto="http")
print(f"Streamlit app is running at {public_url}")



PyngrokNgrokHTTPError: ngrok client exception, API returned 502: {"error_code":103,"status_code":502,"msg":"failed to start tunnel","details":{"err":"failed to start tunnel: The authtoken credential '2pe4gQI0X6QxY2yKWkKRHuS0ptT' has been revoked\nand is no longer valid.\r\n\r\nERR_NGROK_300\r\n"}}


In [None]:
print(f"Input feature shape: {input_features.shape}")


NameError: name 'input_features' is not defined