In [2]:
import pandas as pd
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [3]:
# Step 1: Load the dataset
data = pd.read_csv('Sales_Data.csv')

In [4]:
# Step 2: Display first few rows and column names to understand the structure
print("Columns in the dataset:",data.columns)
print(data.head())request

Columns in the dataset: Index(['Product_Code', 'W0', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6', 'W7', 'W8',
       ...
       'Normalized 42', 'Normalized 43', 'Normalized 44', 'Normalized 45',
       'Normalized 46', 'Normalized 47', 'Normalized 48', 'Normalized 49',
       'Normalized 50', 'Normalized 51'],
      dtype='object', length=107)
  Product_Code  W0  W1  W2  W3  W4  W5  W6  W7  W8  ...  Normalized 42  \
0           P1  11  12  10   8  13  12  14  21   6  ...           0.06   
1           P2   7   6   3   2   7   1   6   3   3  ...           0.20   
2           P3   7  11   8   9  10   8   7  13  12  ...           0.27   
3           P4  12   8  13   5   9   6   9  13  13  ...           0.41   
4           P5   8   5  13  11   6   7   9  14   9  ...           0.27   

   Normalized 43  Normalized 44  Normalized 45  Normalized 46  Normalized 47  \
0           0.22           0.28           0.39           0.50           0.00   
1           0.40           0.50           0.10           

In [5]:
# Step 3: Strip spaces in column names to avoid any errors
data.columns = data.columns.str.strip()

In [6]:
# Step 4: Handle missing values by replacing with the mean of numeric columns
data.fillna(data.select_dtypes(include=['number']).mean(), inplace=True)

In [7]:
# Step 5: Inspect the dataset columns for normalized data
print("Columns in the dataset:", data.columns)

Columns in the dataset: Index(['Product_Code', 'W0', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6', 'W7', 'W8',
       ...
       'Normalized 42', 'Normalized 43', 'Normalized 44', 'Normalized 45',
       'Normalized 46', 'Normalized 47', 'Normalized 48', 'Normalized 49',
       'Normalized 50', 'Normalized 51'],
      dtype='object', length=107)


In [8]:
# Step 6: Check for relevant columns for creating 'High Sales' label
high_sales_created = False

In [9]:
# Check for one of the 'Normalized' columns (e.g., 'Normalized 1', 'Normalized 2', etc.)
if 'Normalized 1' in data.columns: # Replace this with the actual relevant column you choose
    threshold = data['Normalized 1'].mean()
    data['High_Sales'] = (data['Normalized 1']> threshold).astype(int)
    high_sales_created = True
elif 'Normalized 2' in data.columns:
    threshold = data['Normalized 2'].mean()
    data['High_Sales'] = (data['Normalized 2']> threshold).astype(int)
    high_sales_created = True
else:
    print("No normalized columns found for defining high sales.")
    exit()

In [10]:
# Step 7: Encode categorical variables only if they exist
if 'Product_Code' in data.columns:
    data['Product_Code'] = data['Product_Code'].astype('category').cat.codes

In [11]:
# Step 8: Define the features (X) and target (y) only if 'High Sales' was created
if high_sales_created:
    X = data.drop(columns=['High_Sales']) # Features
    y = data['High_Sales'] # Target

In [12]:
# Step 9: Split the data into training and testing sets (80% train, 20% test)
X_train,X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [13]:
# Step 10: Initialize and train the RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [14]:
# Step 11: Evaluate the model
y_pred = model.predict(X_test)

In [15]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100: 2f}%")

Accuracy:  100.000000%
