In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# **Import Libraries**

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that
# location further down in this notebook.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')


# **Explore The Dataset**

In [None]:
# Read the sales CSV from the mounted Drive folder (file is latin-1 encoded).
df = pd.read_csv(
    '/content/drive/MyDrive/sales_data.csv',
    encoding='latin-1',
)

# Preview the first few rows to confirm the load worked.
df.head()


In [None]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Print a per-column overview of the frame (dtypes and non-null counts).
df.info()


In [None]:
# Descriptive statistics of the frame's columns (pandas defaults).
df.describe()

# **Data Cleaning**

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Count the rows that are exact duplicates of an earlier row
# (duplicated() flags them, sum() counts the True flags).
df.duplicated().sum()

In [None]:
# Remove duplicate rows.
# drop_duplicates() returns a NEW frame and leaves the original untouched, so
# the result has to be assigned back to df; calling it without assignment
# only displays the deduplicated frame and keeps the duplicates in df.
# reset_index(drop=True) gives the cleaned frame a fresh, contiguous index
# and a standalone object for the column assignments made later on.
df = df.drop_duplicates().reset_index(drop=True)

# Show the resulting dimensions instead of dumping the whole frame.
df.shape

In [None]:
# Count missing values per column (this cell only inspects; nothing is
# filled or dropped here).
df.isnull().sum()

In [None]:
# Impute gaps in the numeric columns with each column's mean.
# The means Series is indexed by column name, so fillna only touches the
# columns that appear in it; df is modified in place.
numeric_means = df.select_dtypes(include=np.number).mean()
df.fillna(numeric_means, inplace=True)

In [None]:
# Fill whatever is still missing, in any column (not just numeric ones),
# with that column's most frequent value. mode() can return several rows
# when there are ties; iloc[0] keeps the first mode of each column.
# df is modified in place.
column_modes = df.mode().iloc[0]
df.fillna(column_modes, inplace=True)

In [None]:
# Parse the Sale_Date column into pandas datetimes and store it back
# under the same column name.
sale_dates = pd.to_datetime(df['Sale_Date'])
df['Sale_Date'] = sale_dates

In [None]:
# Line chart of total Sales_Amount per Sale_Date.
daily_sales = df.groupby('Sale_Date')['Sales_Amount'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
daily_sales.plot(kind='line', color='blue', ax=ax)

# Labels are set after plotting so they replace the ones pandas derives
# from the index name.
ax.set_title('Sales trend over time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales')
ax.grid(True)
plt.show()


In [None]:
# Derive a Profit column: per-unit margin (price minus cost) times the
# quantity sold.
# NOTE(review): Discount is not part of this formula - confirm that is intended.
unit_margin = df['Unit_Price'] - df['Unit_Cost']
df['Profit'] = unit_margin * df['Quantity_Sold']

In [None]:
# Scatter plot of Profit against Discount, one point per row of df.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='Discount', y='Profit', color='orange', ax=ax)

ax.set_title('Profit vs Discount')
ax.set_xlabel('Discount')
ax.set_ylabel('Profit')
ax.grid(True)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
numerical_df = df.select_dtypes(include=np.number)
correlations = numerical_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()


In [None]:
# First pass at features/target: Profit and Discount as predictors of
# Sales_Amount.
# NOTE: X and y are reassigned with a different feature set further down,
# before any model is trained, so these values are not the ones the model uses.
X = df.loc[:, ['Profit', 'Discount']]
y = df.loc[:, 'Sales_Amount']

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [None]:
# Feature matrix and target used for the regression below.
feature_columns = ['Unit_Cost', 'Unit_Price', 'Discount', 'Quantity_Sold']
target_column = 'Sales_Amount'

X = df[feature_columns]
y = df[target_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an ordinary linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test features; y_pred is compared
# against y_test in the evaluation cell.
y_pred = model.predict(X_test)

In [None]:
# Report test-set error (MSE) and goodness of fit (R^2), each rounded to
# two decimals, one metric per line.
print(
    "\nModel Evaluation:",
    f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}",
    f"R-squared Score: {r2_score(y_test, y_pred):.2f}",
    sep="\n",
)

In [None]:
# Pair every feature name with its fitted coefficient and print the table.
coefficients = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': model.coef_,
    }
)
print("\nModel Coefficients:", coefficients, sep="\n")