# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from flask import Flask, request, jsonify
import joblib
import logging

# Set up logging: INFO and above goes to sales_forecast.log
logging.basicConfig(filename='sales_forecast.log', level=logging.INFO)

# Load the dataset
# NOTE(review): StateHoliday presumably holds codes such as '0' next to letter
# codes -- confirm against the CSVs. Later code compares the column against
# the *string* '0' and one-hot encodes it, so it is pinned to str here;
# otherwise pandas may infer a mix of int and str values for the column.
train_data = pd.read_csv("train.csv", dtype={'StateHoliday': str})
test_data = pd.read_csv("test.csv", dtype={'StateHoliday': str})

# Task 1 - Exploratory Data Analysis (EDA)
def exploratory_data_analysis(train_data):
    """Show three exploratory plots for the training data.

    The figures are displayed one after another with ``plt.show()``:
    a count plot of ``Promo``, a scatter plot of ``Sales`` against
    ``Customers``, and a box plot of ``Sales`` per ``StateHoliday``
    category restricted to rows that are holidays.

    Args:
        train_data: DataFrame providing the columns ``Promo``,
            ``Customers``, ``Sales`` and ``StateHoliday``.

    Returns:
        None.

    Note:
        Missing values are replaced with 0 *in place*, so the frame
        passed by the caller is modified as a side effect.
    """
    # Handling missing values (in place: the caller's frame is zero-filled)
    train_data.fillna(0, inplace=True)

    # Check distribution of promotions
    plt.figure(figsize=(10, 6))
    sns.countplot(x='Promo', data=train_data)
    plt.title('Distribution of Promotions in Train Data')
    plt.show()

    # Correlation between sales and customers
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='Customers', y='Sales', data=train_data)
    plt.title('Sales vs Customers')
    plt.show()

    # Sales during holidays
    # Compare as strings: the fillna(0) above (or mixed-type CSV parsing) can
    # leave the integer 0 in this column, and 0 != '0' is True, which would
    # wrongly classify those non-holiday rows as holidays.
    is_holiday = train_data['StateHoliday'].astype(str) != '0'
    holidays = train_data[is_holiday]
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='StateHoliday', y='Sales', data=holidays)
    plt.title('Sales During Holidays')
    plt.show()

# Run the EDA. Side effect: the call zero-fills missing values in train_data
# in place, so the preprocessing further down operates on the filled frame.
exploratory_data_analysis(train_data)

# Task 2 - Data Preprocessing
def preprocess_data(df):
    """Return a feature-engineered copy of ``df``.

    Steps performed:
      * parse ``Date`` and derive ``Year``, ``Month``, ``Day`` and
        ``WeekOfYear`` (ISO week number),
      * one-hot encode ``StoreType``, ``Assortment`` and ``StateHoliday``
        (first level of each dropped),
      * replace missing ``CompetitionDistance`` values with the column mean.

    Args:
        df: DataFrame containing the columns ``Date``, ``StoreType``,
            ``Assortment``, ``StateHoliday`` and ``CompetitionDistance``.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    # while the encoded result is returned as a different object.
    df = df.copy()

    # Feature engineering from date
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['WeekOfYear'] = df['Date'].dt.isocalendar().week

    # Convert categorical columns to numeric using one-hot encoding
    df = pd.get_dummies(df, columns=['StoreType', 'Assortment', 'StateHoliday'], drop_first=True)

    # Fill missing competition data. Assign the result back instead of
    # calling fillna(inplace=True) on the selected column: the chained
    # in-place form is deprecated and has no effect under Copy-on-Write.
    mean_distance = df['CompetitionDistance'].mean()
    df['CompetitionDistance'] = df['CompetitionDistance'].fillna(mean_distance)

    return df

# Preprocess the data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Task 2.1 - Feature matrix and target
# Every column except the target and the raw timestamp is used as a feature.
X = train_data.drop(columns=['Sales', 'Date'])
y = train_data['Sales']

# Task 2.2 - Build RandomForest Model
# Split BEFORE fitting the scaler: fitting on the full matrix would let the
# hold-out rows contribute to the mean/variance used to scale the training
# rows (data leakage). The split itself is unchanged (same size and seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled view of the full feature matrix, kept under its original name for
# any code that still refers to it.
X_scaled = scaler.transform(X)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Task 2.3 - Model Evaluation on the hold-out partition
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Random Forest MSE: {mse}")

# Task 2.6 - Build Deep Learning Model (LSTM)
def create_lstm_model(input_shape):
    """Build and compile a stacked two-layer LSTM with a single output unit.

    Args:
        input_shape: shape passed as ``input_shape`` to the first LSTM layer.

    Returns:
        A compiled ``tf.keras`` Sequential model using the Adam optimizer
        and mean-squared-error loss.
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential([
        # First LSTM returns the full sequence so the second can consume it
        layers.LSTM(50, return_sequences=True, input_shape=input_shape),
        layers.LSTM(50),
        layers.Dense(1),
    ])
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

# Preparing data for LSTM
X_lstm = train_data[['Year', 'Month', 'Day', 'Promo', 'Customers']].values
y_lstm = train_data['Sales'].values.reshape(-1, 1)

# Split first so the scaler statistics come from the training rows only.
X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)

# Use a dedicated scaler. Refitting the shared `scaler` here would overwrite
# the statistics learned for the RandomForest features, and the /predict
# endpoint relies on that object to scale RandomForest inputs.
lstm_scaler = StandardScaler()
X_lstm_train = lstm_scaler.fit_transform(X_lstm_train)
X_lstm_test = lstm_scaler.transform(X_lstm_test)

# Scaled view of the full LSTM feature matrix, kept under its original name.
X_lstm_scaled = lstm_scaler.transform(X_lstm)

# Keras LSTM layers take 3-D input; each row becomes a sequence of length 1:
# (samples, 1, features).
X_lstm_train = np.reshape(X_lstm_train, (X_lstm_train.shape[0], 1, X_lstm_train.shape[1]))
X_lstm_test = np.reshape(X_lstm_test, (X_lstm_test.shape[0], 1, X_lstm_test.shape[1]))

lstm_model = create_lstm_model((X_lstm_train.shape[1], X_lstm_train.shape[2]))
lstm_model.fit(X_lstm_train, y_lstm_train, epochs=10, batch_size=64, validation_data=(X_lstm_test, y_lstm_test))

# Save the LSTM model
lstm_model.save('lstm_model.h5')

# Task 3 - Create REST API using Flask
app = Flask(__name__)

# Lazily populated cache: the pickled forest is read from disk once per
# process instead of on every request.
_rf_model = None


def _get_rf_model():
    """Return the persisted RandomForest model, loading it on first use."""
    global _rf_model
    if _rf_model is None:
        _rf_model = joblib.load('random_forest_model.pkl')
    return _rf_model


@app.route('/predict', methods=['POST'])
def predict():
    """Predict sales for one observation posted as a JSON object.

    The request body must be a JSON object mapping feature names to values.
    When the module-level ``scaler`` exposes ``feature_names_in_``, the
    payload is checked for those features and reordered to match them
    (extra keys are ignored); otherwise the payload is used as sent.

    Returns:
        ``{"prediction": <float>}`` on success, or ``{"error": <message>}``
        with HTTP status 400 when the body is missing, is not a JSON object,
        lacks required features, or cannot be scaled/predicted.
    """
    # silent=True yields None instead of raising on a missing/invalid body,
    # so a clear 400 can be returned below.
    input_data = request.get_json(silent=True)
    if not isinstance(input_data, dict) or not input_data:
        return jsonify({'error': 'Request body must be a non-empty JSON object'}), 400

    # Convert data to a single-row pandas DataFrame
    input_df = pd.DataFrame([input_data])

    # Align columns with the order the scaler was fitted on; JSON key order
    # is controlled by the client and must not influence the feature order.
    expected = getattr(scaler, 'feature_names_in_', None)
    if expected is not None:
        missing = [name for name in expected if name not in input_df.columns]
        if missing:
            return jsonify({'error': f'Missing features: {missing}'}), 400
        input_df = input_df[list(expected)]

    # Make prediction
    try:
        input_scaled = scaler.transform(input_df)
        prediction = _get_rf_model().predict(input_scaled)
    except (ValueError, TypeError) as exc:
        logging.exception('Prediction failed')
        return jsonify({'error': str(exc)}), 400

    # Cast the numpy scalar to a built-in float for JSON serialisation
    return jsonify({'prediction': float(prediction[0])})

# Run the Flask app
if __name__ == '__main__':
    import os

    # Debug mode turns on the interactive Werkzeug debugger, which allows
    # arbitrary code execution for anyone who can reach the server, so it is
    # opt-in through the FLASK_DEBUG=1 environment variable.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'

    # The reloader re-executes this script in a child process, which would
    # repeat all of the model training above; keep it disabled.
    app.run(debug=debug_enabled, use_reloader=False)
