In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the training and test tables from the local dataset/ folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)


In [2]:
# Function to downcast data types
def downcast(df):
    """Shrink 64-bit numeric columns of ``df`` to 32-bit where that is safe.

    ``float64`` columns are converted to ``float32`` (trading precision for
    memory). ``int64`` columns are converted to ``int32`` only when every
    value fits in the int32 range; otherwise the column is left as ``int64``,
    because a plain ``astype(np.int32)`` can silently wrap out-of-range
    values (for example large identifiers).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so callers can reassign it.
    """
    int32_limits = np.iinfo(np.int32)

    # Snapshot the column labels: columns are reassigned while looping.
    for col in list(df.columns):
        column = df[col]
        if column.dtype == "float64":
            df[col] = column.astype(np.float32)
        elif column.dtype == "int64":
            # An empty column has no min/max to inspect and nothing to overflow.
            fits_int32 = column.empty or (
                column.min() >= int32_limits.min
                and column.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = column.astype(np.int32)
    return df

In [6]:
# Preprocessing
# Blank out missing values only in the columns that are consumed as strings
# below (LabelEncoder and TfidfVectorizer). Filling the whole frame with ''
# would turn any numeric column containing NaN into a mixed object column.
text_cols = ['disease_type', 'medicine_review']
train_df[text_cols] = train_df[text_cols].fillna('')
test_df[text_cols] = test_df[text_cols].fillna('')

# Fit the encoder ONCE so a given disease_type maps to the same integer in
# both frames. Re-fitting on the test frame would build an independent
# mapping whenever the two frames contain different category sets.
# Fitting on the union of both frames also means transform() cannot fail on
# a category that appears only in the test data.
label_encoder = LabelEncoder()
label_encoder.fit(
    pd.concat([train_df['disease_type'], test_df['disease_type']], ignore_index=True)
)
train_df['disease_type'] = label_encoder.transform(train_df['disease_type'])
test_df['disease_type'] = label_encoder.transform(test_df['disease_type'])

# Expand launch_date into numeric year / month / day features.
# NOTE(review): the recorded cell output shows a warning pointing at the
# to_datetime calls; passing an explicit format= may silence it once the
# date layout of the CSVs is confirmed -- TODO confirm.
for frame in (train_df, test_df):
    launch = pd.to_datetime(frame['launch_date'])
    frame['launch_year'] = launch.dt.year
    frame['launch_month'] = launch.dt.month
    frame['launch_day'] = launch.dt.day

# The raw date column is no longer needed once its parts are extracted.
train_df = train_df.drop(columns=['launch_date'])
test_df = test_df.drop(columns=['launch_date'])

train_df = downcast(train_df)
test_df = downcast(test_df)


  train_df['launch_date'] = pd.to_datetime(train_df['launch_date'])
  test_df['launch_date'] = pd.to_datetime(test_df['launch_date'])


In [7]:
# TF-IDF Vectorization
# Learn the vocabulary (capped at 500 terms) on the training reviews only,
# then apply the same vocabulary to the test reviews.
tfidf = TfidfVectorizer(max_features=500)
tfidf_train = tfidf.fit_transform(train_df['medicine_review'])
tfidf_test = tfidf.transform(test_df['medicine_review'])

# Prefix every term so a vocabulary token (e.g. "score") can never collide
# with an existing column name; a collision would create duplicate columns
# and corrupt the later drop(...) / train_df['score'] selections.
tfidf_columns = [f'tfidf_{term}' for term in tfidf.get_feature_names_out()]

tfidf_train_df = pd.DataFrame.sparse.from_spmatrix(tfidf_train, columns=tfidf_columns)
tfidf_test_df = pd.DataFrame.sparse.from_spmatrix(tfidf_test, columns=tfidf_columns)

# Reset both indexes so the column-wise concat aligns rows by position.
train_df = pd.concat([train_df.reset_index(drop=True), tfidf_train_df.reset_index(drop=True)], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), tfidf_test_df.reset_index(drop=True)], axis=1)

# The raw text is replaced by its TF-IDF features.
train_df = train_df.drop(columns=['medicine_review'])
test_df = test_df.drop(columns=['medicine_review'])


In [8]:
# Model training: hold out 20% of the rows for validation and fit a
# random forest on the remainder.
y = train_df['score']
X = train_df.drop(columns=['score', 'medicine_no'])

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
model.fit(X_train, y_train)





Validation RMSE: 29.28999769239076, Score: 70.71000230760924




In [9]:
# Validation metric: RMSE on the hold-out split, reported alongside
# 100 - RMSE floored at 0.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
score = max(0, 100 - rmse)
print(f'Validation RMSE: {rmse}, Score: {score}')

Validation RMSE: 29.28999769239076, Score: 70.71000230760924




In [10]:
# Score the test set with the fitted model and write the submission file.
X_test = test_df.drop(columns=['medicine_no'])
test_preds = model.predict(X_test)

submission_df = pd.DataFrame(
    {
        'medicine_no': test_df['medicine_no'],
        'score': test_preds,
    }
)
submission_df.to_csv('submission.csv', index=False)


