# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset (path is relative to the current working directory).
df = pd.read_csv('advertising_and_sales_clean.csv')

# Display basic info.
print('Dataset Info:')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print('\nFirst 5 rows:')
print(df.head())

# Descriptive statistics
print('\nDescriptive Statistics:')
print(df.describe())

# Check for missing values: count of nulls per column.
print('\nMissing Values:')
print(df.isnull().sum())

# Correlation matrix, restricted to numeric columns so that any non-numeric
# column in the frame does not raise.
corr = df.corr(numeric_only=True)
print('\nCorrelation Matrix:')
print(corr)

# Heatmap of the same matrix, with each cell annotated with its value.
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Distribution plots: a histogram with a KDE overlay for each listed column,
# shown one at a time.
numerical_cols = ['tv', 'radio', 'social_media', 'sales']
for col in numerical_cols:
    # histplot returns the Axes it drew on; label that Axes directly.
    ax = sns.histplot(df[col], kde=True)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

# Boxplot of sales grouped by the values of the influencer column.
ax = sns.boxplot(data=df, x='influencer', y='sales')
ax.set_title('Sales Distribution by Influencer Type')
plt.show()

# Average sales per influencer category (mean of 'sales' within each distinct
# value of the 'influencer' column).
print('\nAverage Sales by Influencer Type:')
print(df.groupby('influencer')['sales'].mean())

# Linear Regression Model: predict 'sales' from the tv, radio and
# social_media columns.
feature_cols = ['tv', 'radio', 'social_media']
X = df[feature_cols]
y = df['sales']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can chain.
model = LinearRegression().fit(X_train, y_train)

# Predictions on the held-out rows.
y_pred = model.predict(X_test)

# Evaluation: compare the predictions with the true test-split targets.
print('\nLinear Regression Evaluation:')
print('R2 Score:', r2_score(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))

# Coefficients: pair each feature column name with its fitted coefficient.
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print('\nFeature Coefficients:')
print(coefficients)