In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


file_path = "Day_14_Pharma_data.csv"
df = pd.read_csv(file_path)

# 1. Data Cleaning
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values, "\n")

# Check for duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}\n")

# Drop rows with missing values or handle them as necessary
df = df.dropna()

# Check the column names
print("Column names in the dataset:")
print(df.columns)

# Display the first 5 rows to understand the data structure
print("First 5 rows of the dataset:")
print(df.head())

# 2. Create Visualizations

# (a) Bar plot comparing the average Effectiveness for each drug across different regions
# Replace 'Product' with the correct column name if necessary (e.g., 'Drug')
average_effectiveness_per_region = df.groupby(["Region", "Product"])["Effectiveness"].mean().unstack()
average_effectiveness_per_region.plot(kind="bar", figsize=(10, 6), color=["lightblue", "lightgreen", "lightcoral", "orange"])
plt.title("Average Effectiveness of Drugs Across Different Regions")
plt.xlabel("Region")
plt.ylabel("Average Effectiveness")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# (b) Violin plot showing the distribution of Effectiveness and Side_Effects for each product
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x="Product", y="Effectiveness", hue="Side_Effects", split=True, palette="Set2")
plt.title("Effectiveness and Side Effects Distribution by Product")
plt.xlabel("Product")
plt.ylabel("Effectiveness")
plt.tight_layout()
plt.show()

# (c) Pairplot to explore relationships between Effectiveness, Side_Effects, and Marketing_Spend
sns.pairplot(df[["Effectiveness", "Side_Effects", "Marketing_Spend"]], hue="Product", palette="Set1")
plt.suptitle("Pairplot: Effectiveness, Side Effects, and Marketing Spend", y=1.02)
plt.tight_layout()
plt.show()

# (d) Boxplot comparing Effectiveness for different trial periods
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x="Trial_Period", y="Effectiveness", palette="Set3")
plt.title("Effectiveness Comparison Across Different Trial Periods")
plt.xlabel("Trial Period")
plt.ylabel("Effectiveness")
plt.tight_layout()
plt.show()

# (e) Regression plot to analyze how Marketing_Spend affects drug Effectiveness
plt.figure(figsize=(8, 6))
sns.regplot(data=df, x="Marketing_Spend", y="Effectiveness", scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
plt.title("Marketing Spend vs Drug Effectiveness")
plt.xlabel("Marketing Spend")
plt.ylabel("Effectiveness")
plt.tight_layout()
plt.show()


Missing values per column:
Product_ID         0
Product_Name       0
Region             0
Marketing_Spend    0
Sales              0
Effectiveness      0
Side_Effects       0
Age_Group          0
Trial_Period       0
dtype: int64 

Number of duplicate rows: 0

Column names in the dataset:
Index(['Product_ID', 'Product_Name', 'Region', 'Marketing_Spend', 'Sales',
       'Effectiveness', 'Side_Effects', 'Age_Group', 'Trial_Period'],
      dtype='object')
First 5 rows of the dataset:
   Product_ID Product_Name         Region  Marketing_Spend   Sales  \
0         101   PainRelief  North America            50000  100000   
1         102     ColdCure         Europe            30000   60000   
2         103  HeadacheFix           Asia            45000   80000   
3         104   PainRelief  North America            52000  110000   
4         105     ColdCure         Europe            28000   55000   

   Effectiveness  Side_Effects Age_Group Trial_Period  
0             85             5     20-

KeyError: 'Product'