In [40]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.3/150.0 MB 8.4 MB/s eta 0:00:18
    --------------------------------------- 3.1/150.0 MB 8.4 MB/s eta 0:00:18
   - -------------------------------------- 6.6/150.0 MB 11.2 MB/s eta 0:00:13
   -- ------------------------------------- 8.7/150.0 MB 11.0 MB/s eta 0:00:13
   --- ------------------------------------ 11.3/150.0 MB 11.2 MB/s eta 0:00:13
   --- ------------------------------------ 13.6/150.0 MB 11.3 MB/s eta 0:00:13
   ---- ----------------------------------- 16.3/150.0 MB 11.4 MB/s eta 0:00:12
   ----- ---------------------------------- 18.9/150.0 MB 11.5 MB/s eta 0:00:12
   ----- ---------------------------------- 21.2/150.0 MB 11.5 MB/s eta 0:00:12
   ------ --------------------------------- 23.9/150.0 MB 11.5 MB/s

In [50]:
print(train.columns.tolist())


['Medicine_ID', 'Counterfeit_Weight', 'DistArea_ID', 'Active_Since', 'Medicine_MRP', 'Medicine_Type', 'SidEffect_Level', 'Availability_rating', 'Area_Type', 'Area_City_Type', 'Area_dist_level', 'Counterfeit_Sales']


In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# Step 1: Load data
train = pd.read_csv("counterfeit_train (1).csv")
test = pd.read_csv("counterfeit_test.csv")

# Step 2: Fill missing values
train['Counterfeit_Weight'] = train['Counterfeit_Weight'].fillna(train['Counterfeit_Weight'].mean())
test['Counterfeit_Weight'] = test['Counterfeit_Weight'].fillna(test['Counterfeit_Weight'].mean())

# Step 3: Feature engineering
def engineer_features(df):
    df['Years_Active'] = 2025 - df['Active_Since']
    df['Availability_rating'] = pd.to_numeric(df['Availability_rating'], errors='coerce')
    df['SidEffect_Level'] = pd.to_numeric(df['SidEffect_Level'], errors='coerce')
    df['MRP_Availability_Index'] = df['Medicine_MRP'] * df['Availability_rating']
    df['MRP_SideEffect_Index'] = df['Medicine_MRP'] * df['SidEffect_Level']
    df['DistArea_Category_Code'] = df['DistArea_ID'].str.extract(r'(\d+)', expand=False).fillna(0).astype(int)
    return df

train = engineer_features(train)
test = engineer_features(test)

# Step 4: Encode categorical variables
cat_cols = [col for col in ['Area_Type', 'Area_City_Type', 'Medicine_Type'] if col in train.columns]
train_encoded = pd.get_dummies(train.drop(['Medicine_ID'], axis=1), columns=cat_cols)
test_encoded = pd.get_dummies(test.drop(['Medicine_ID'], axis=1), columns=cat_cols)

# Step 5: Align features
X = train_encoded.drop('Counterfeit_Sales', axis=1)
y = np.log1p(train_encoded['Counterfeit_Sales'])

X, X_test = X.align(test_encoded, join='left', axis=1, fill_value=0)
X = X.select_dtypes(include='number')
X_test = X_test.select_dtypes(include='number')
X = X.fillna(X.mean())
X_test = X_test.fillna(X.mean())

# Step 6: Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train model
xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate
val_preds_log = xgb_model.predict(X_val)
val_preds = np.expm1(val_preds_log)
true_vals = np.expm1(y_val)

mae = mean_absolute_error(true_vals, val_preds)
score = 1 - (mae / 1660)

print("Final MAE:", mae)
print("Final Score (1 - MAE/1660):", score)

# Step 9: Train on full data
xgb_model.fit(X, y)
test_preds = np.expm1(xgb_model.predict(X_test))

# Step 10: Create submission file
submission = pd.DataFrame({
    'Medicine_ID': test['Medicine_ID'],
    'Counterfeit_Sales': test_preds
})
submission.to_csv("Md_Al_Emran_Attempt1.csv", index=False)
print("Submission file 'Md_Al_Emran_Attempt1.csv' created successfully.")


Final MAE: 780.6362558182971
Final Score (1 - MAE/1660): 0.5297371952901825
Submission file 'Md_Al_Emran_Attempt1.csv' created successfully.


In [67]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.
