In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime

In [None]:
# Shared settings for the train/test split performed later in the notebook.
RANDOM_STATE = 42  # seed handed to train_test_split
TEST_SIZE = 0.3    # fraction of rows handed to train_test_split as the test share

In [None]:
# Load data
def load_data(file_path, encoding='latin-1'):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        encoding: Text encoding used to decode the file. Defaults to
            ``'latin-1'``, the encoding this notebook uses for its inputs.

    Returns:
        pandas.DataFrame holding the parsed file contents.
    """
    return pd.read_csv(file_path, encoding=encoding)

# Read both input CSVs through the shared loader, in (train, test) order.
train_data, test_data = [load_data(csv_name) for csv_name in ("train2.csv", "test2.csv")]

In [None]:
# Stack the training rows on top of the test rows (row-wise concatenation).
combined_data = pd.concat(objs=[train_data, test_data], axis=0)

In [None]:
# Replace the row labels with a fresh 0..n-1 range, discarding the old ones.
# Rebinding the name (rather than inplace=True) keeps this cell idempotent
# and avoids mutating a frame that other cells may already have displayed.
combined_data = combined_data.reset_index(drop=True)

In [None]:
import pandas as pd

# Load your datasets
train_data = pd.read_csv("train2.csv", encoding='latin-1')
test_data = pd.read_csv("test2.csv", encoding='latin-1')

# Combine the datasets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# this cell replaces the re-indexed `combined_data` built earlier with a frame
# whose row labels repeat (each source file is numbered from 0).
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Display the columns in the dataset
print("Columns in the dataset:")
print(combined_data.columns)

# Identify unique entities based on the columns
entities = set(combined_data.columns)  # Assuming each column represents an entity

# Display the identified entities in column order. Iterating the set itself
# would print them in an arbitrary order that can change from run to run;
# dict.fromkeys keeps first-seen order while still dropping duplicates.
print("\nIdentified entities in the dataset:")
for entity in dict.fromkeys(combined_data.columns):
    print(entity)


In [None]:
# Preview the first five rows of the combined frame.
print(combined_data.head(n=5))

In [None]:
# Descriptive statistics of the combined frame, computed first and then printed.
summary_stats = combined_data.describe()
print(summary_stats)

In [None]:
# Count the missing entries in every column.
print(combined_data.isna().sum())

In [None]:
# Clean data
def clean_data(data):
    """Return a new frame holding only the rows of ``data`` with no missing value."""
    return data.dropna(axis=0, how='any')

# Keep the raw frame untouched; the cleaned rows get their own name.
combined_data_cleaned = clean_data(data=combined_data)

In [None]:
# Show the cleaned frame under a heading (heading and frame on separate lines).
print("Cleaned dataset:", combined_data_cleaned, sep="\n")

In [None]:
# Analyze data
def analyze_data(data):
    """Print a structural overview of ``data``.

    Emits, in order: the ``DataFrame.info()`` report, the column index, and
    the per-column dtypes.

    Args:
        data: pandas DataFrame to describe.

    Returns:
        None. Everything is written to standard output.
    """
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line after the report.
    data.info()
    print(data.columns)
    print(data.dtypes)

# Inspect the structure of the cleaned frame.
analyze_data(data=combined_data_cleaned)

In [None]:
# Preprocess data
def preprocess_data(data, target='Total'):
    """Split ``data`` into a numeric feature frame and the target column.

    Args:
        data: pandas DataFrame that contains the target column.
        target: Name of the column to predict. Defaults to ``'Total'``.

    Returns:
        Tuple ``(x, y)``. ``x`` holds the int/float columns of ``data`` with
        ``target`` removed; ``y`` is ``data[target]``.

    Raises:
        KeyError: If ``target`` is not among the selected numeric columns.
    """
    numeric_columns = data.select_dtypes(include=['int', 'float']).columns
    x = data[numeric_columns].drop(columns=[target])
    y = data[target]
    return x, y

# Build the feature frame and target series from the cleaned rows.
x, y = preprocess_data(data=combined_data_cleaned)

In [None]:
# Partition features and target into train and test parts using the shared
# TEST_SIZE / RANDOM_STATE settings.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [None]:
# Report the dimensions of every split: x_train, y_train, x_test, y_test.
for split_name, split_values in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_values.shape)

In [None]:
# Train models
def train_model(x_train, y_train, model):
    """Fit ``model`` on the given features and target and hand it back.

    The object passed in is the one returned; whatever ``fit`` itself
    returns is ignored.
    """
    estimator = model
    estimator.fit(x_train, y_train)
    return estimator

linear_model = train_model(x_train, y_train, LinearRegression())
# random_state pins the forest's internal randomness so the reported metrics
# are reproducible between runs, in line with the seeded train/test split.
random_forest_model = train_model(
    x_train,
    y_train,
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE),
)


In [None]:
# Evaluate models
def evaluate_model(model, x_test, y_test):
    """Score ``model`` on held-out data.

    Returns a ``(mse, r2)`` pair: the mean squared error and the R^2 score
    of the model's predictions for ``x_test`` measured against ``y_test``.
    """
    predictions = model.predict(x_test)
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

# Score both fitted models on the same held-out split (linear model first).
mse_linear, r2_linear = evaluate_model(model=linear_model, x_test=x_test, y_test=y_test)
mse_rf, r2_rf = evaluate_model(model=random_forest_model, x_test=x_test, y_test=y_test)


In [None]:
# RMSE is the square root of the stored MSE; the value labelled
# "Variance score" is the r2_score result returned by evaluate_model.
print("Linear Regression:")
for metric_label, metric_value in (
    ("RMSE:", np.sqrt(mse_linear)),
    ("Variance score:", r2_linear),
):
    print(metric_label, metric_value)

In [None]:
# RMSE is the square root of the stored MSE; the value labelled
# "Variance score" is the r2_score result returned by evaluate_model.
print("Random Forest Regression:")
for metric_label, metric_value in (
    ("RMSE:", np.sqrt(mse_rf)),
    ("Variance score:", r2_rf),
):
    print(metric_label, metric_value)

In [None]:
# Add graph visualization of products
def visualize_products(data):
    """Show a count plot of the 'Product Line' column of ``data``."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='Product Line', data=data, ax=ax)
    ax.set_title('Product Distribution')
    ax.set_xlabel('Product')
    ax.set_ylabel('Count')
    fig.tight_layout()  # keep titles and labels inside the figure area
    plt.show()

# Plot the product-line counts for the cleaned rows.
visualize_products(data=combined_data_cleaned)

In [None]:
# Add scatter plot of Total vs other features
def visualize_total_vs_features(data):
    """Show one scatter plot of 'Total' against each other int/float column.

    A separate figure is created and shown per column; the 'Total' column
    itself is skipped.
    """
    feature_names = [
        name
        for name in data.select_dtypes(include=['int', 'float']).columns
        if name != 'Total'
    ]
    for name in feature_names:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(data[name], data['Total'])
        ax.set_title(f'Total vs {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Total')
        fig.tight_layout()
        plt.show()

# Scatter every numeric feature of the cleaned rows against 'Total'.
visualize_total_vs_features(data=combined_data_cleaned)

In [None]:
# Calculate total sales
def calculate_top_20_products(data):
    """Sum 'Total' per 'Product Line' and return the 20 largest sums.

    The result is a Series indexed by 'Product Line', sorted from the
    highest sum downwards; it has fewer than 20 entries when ``data``
    contains fewer distinct product lines.
    """
    return (
        data.groupby('Product Line')['Total']
        .sum()
        .sort_values(ascending=False)
        .iloc[:20]
    )

# Rank the product lines of the cleaned rows and print them under a heading.
top_20_products = calculate_top_20_products(data=combined_data_cleaned)
print("Top 20 Selling Products:", top_20_products, sep="\n")

In [None]:
# Visualize top 20 selling products
def visualize_top_20_products(top_20_products):
    """Show ``top_20_products`` (a Series of totals per product line) as a bar chart."""
    fig, ax = plt.subplots(figsize=(12, 8))
    top_20_products.plot.bar(ax=ax)
    ax.set_title('Top 20 Selling Products')
    ax.set_xlabel('Product Line')
    ax.set_ylabel('Total Sales')
    plt.xticks(rotation=45, ha='right')  # slanted tick labels stay readable
    fig.tight_layout()
    plt.show()

# Chart the ranking computed above.
visualize_top_20_products(top_20_products=top_20_products)

# new dataset for prediction


In [None]:
import pandas as pd
import numpy as np

# Define the number of samples
num_samples = 200

# Seeded generator: the synthetic dataset is identical on every run.
rng = np.random.default_rng(RANDOM_STATE)

# Evenly spaced timestamps covering the range (these are not random)
start_date = '2024-01-01'
end_date = '2024-12-31'
dates = pd.date_range(start=start_date, end=end_date, periods=num_samples)

# Generate random data for other columns
invoice_ids = rng.integers(1000, 9999, size=num_samples)
product_lines = rng.choice(['Product A', 'Product B', 'Product C'], size=num_samples)
item_types = rng.choice(['Type X', 'Type Y', 'Type Z'], size=num_samples)
base_units = rng.choice(['Unit A', 'Unit B', 'Unit C'], size=num_samples)
quantities = rng.integers(1, 100, size=num_samples)
unit_prices = rng.uniform(10, 100, size=num_samples)
tax_rates = 0.05  # Assuming a constant tax rate of 5%
cogs = quantities * unit_prices
taxes = cogs * tax_rates
totals = cogs + taxes
divisions = rng.choice(['Division 1', 'Division 2', 'Division 3'], size=num_samples)

# Create DataFrame
prediction_data = pd.DataFrame({
    'Date': dates,
    'Invoice ID': invoice_ids,
    'Product Line': product_lines,
    'Item_Type': item_types,
    'BASEUNIT': base_units,
    'Quantity': quantities,
    'Unit price': unit_prices,
    'Tax 5%': taxes,
    'cogs': cogs,
    'Total': totals,
    'DIVISION': divisions
})

# Persist the frame: the prediction cell reads "prediction_data.csv" from
# disk, so without this write a fresh Restart & Run All fails on a missing
# file. latin-1 matches the encoding the notebook's CSV loader uses.
prediction_data.to_csv("prediction_data.csv", index=False, encoding='latin-1')

# Print the first few rows of the dataframe
print(prediction_data.head())


In [None]:
# Load and preprocess new prediction data
def load_and_preprocess_new_data(file_path):
    """Load a CSV of new observations and derive the model inputs from it.

    Returns a 2-tuple: the feature frame produced by ``preprocess_data``
    (its target output is discarded) and the 'Date' column of the loaded
    frame.
    """
    raw_frame = load_data(file_path)
    features = preprocess_data(raw_frame)[0]
    return features, raw_frame['Date']

# Define file path for new prediction data
new_data_file_path = "prediction_data.csv"

# Load and preprocess new data
x_new, dates = load_and_preprocess_new_data(new_data_file_path)

# Present exactly the feature columns the models were fitted on, in the same
# order. A missing training column raises a KeyError here instead of failing
# later inside predict(); extra or reordered columns no longer break it.
x_new = x_new.loc[:, x_train.columns]

# Make predictions using trained models
linear_predictions = linear_model.predict(x_new)
rf_predictions = random_forest_model.predict(x_new)

# Print predictions with dates and expected amount of sales
print("Predictions for Total Sales:")
print("Date\t\t\tLinear Regression\tRandom Forest")
for date, linear_pred, rf_pred in zip(dates, linear_predictions, rf_predictions):
    print(f"{date}\t${linear_pred:.2f}\t\t\t${rf_pred:.2f}")
