<a href="https://colab.research.google.com/github/des-afari/Price-Optimization-Model/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import pandas as pd
import spacy

In [64]:
# Inputs for the notebook: the raw listings file and the spaCy pipeline name.
DATASET_PATH = 'dataset.csv'
SPACY_MODEL_NAME = 'en_core_web_sm'

df = pd.read_csv(DATASET_PATH)
nlp = spacy.load(SPACY_MODEL_NAME)

In [103]:
# spaCy is used for tokenization and lemmatization.
# Requirements (install once, before running the import cell):
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [66]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,Brand,Title,Selling Price,Price,Discount
0,HRX by Hrithik Roshan,Printed Round Neck T-Shirt,664,699,5%
1,HERE&NOW,Solid Polo T-shirt,399,799,50%
2,Moda Rapido,Colourblocked Round Neck T-shirt,454,699,35%
3,Roadster,Camouflage Round Neck T-shirt,454,699,35%
4,DILLINGER,Colourblocked Round Neck Navy Blue T-shirt,449,899,50%


# Brand

In [67]:
# Lower-case brand names so that differently-cased spellings compare equal.
df = df.assign(Brand=df['Brand'].str.lower())

# Selling Price and Original Price

In [68]:
# Drop malformed rows: a price whose text is longer than MAX_PRICE_CHARS
# characters is treated as unwanted data.
MAX_PRICE_CHARS = 6

df['Selling Price'] = df['Selling Price'].astype('str')
# NOTE(review): 'Price' is not cast to str here, so the .str accessor relies on
# the column having been read as strings (object dtype) — confirm against the CSV.
valid_price_text = (
  (df['Selling Price'].str.len() <= MAX_PRICE_CHARS)
  & (df['Price'].str.len() <= MAX_PRICE_CHARS)
)
# .copy() detaches the filtered frame from the original, so later column
# assignments on df do not raise SettingWithCopyWarning.
df = df[valid_price_text].copy()

In [69]:
# Parse both price columns as floats in one cast.
df = df.astype({'Selling Price': 'float', 'Price': 'float'})

In [70]:
# Rupee-to-cedi conversion rate used for every price in this notebook.
RUPEE_TO_CEDI_RATE = 0.1372

def price_conversion(x, rate=RUPEE_TO_CEDI_RATE):
  """Convert a price from rupees to cedis.

  Args:
    x: Price in rupees (any numeric value).
    rate: Multiplier applied to `x`; defaults to RUPEE_TO_CEDI_RATE.

  Returns:
    `x * rate` rounded to 2 decimal places.
  """
  return round(x * rate, 2)

In [71]:
# Convert both price columns from rupees to cedis, one value at a time.
for price_column in ('Selling Price', 'Price'):
  df[price_column] = df[price_column].apply(price_conversion)

In [72]:
# Remove the product-type word from every title; both spellings are handled.
for product_word in ('T-shirt', 'T-Shirt'):
  df['Title'] = df['Title'].str.replace(product_word, '')

# Discount

In [73]:
# Turn percentage strings such as '35%' into fractions such as 0.35.
discount_percent = df['Discount'].str.replace('%', '').astype('float')
df['Discount'] = discount_percent / 100

# Description

In [74]:
def preprocess_text(text):
  """Return the lemmas of the non-stop-word tokens in `text`, joined by spaces.

  The text is parsed with the notebook-level spaCy pipeline `nlp`.
  """
  lemmas = []
  for token in nlp(text):
    if token.is_stop:
      continue
    lemmas.append(token.lemma_)
  return " ".join(lemmas)

In [76]:
# Run every title through preprocess_text and store the result in a new column.
titles = df['Title']
df['preprocessed_description'] = titles.map(preprocess_text)

# Renaming all columns

In [77]:
# Final column names: prices carry their currency; the cleaned text becomes 'Description'.
COLUMN_RENAMES = {
  'Selling Price': 'Selling_Price(GHS)',
  'Price': 'Original_Price(GHS)',
  'preprocessed_description': 'Description',
}
df = df.rename(columns=COLUMN_RENAMES)

# Dropping Title column

In [78]:
# The raw title is no longer needed once 'Description' exists.
df = df.drop(columns=['Title'])

# Final Table

In [79]:
# Preview the first five rows of the cleaned table.
df.head(n=5)

Unnamed: 0,Brand,Selling_Price(GHS),Original_Price(GHS),Discount,Description
0,hrx by hrithik roshan,91.1,95.9,0.05,Printed Round Neck
1,here&now,54.74,109.62,0.5,solid Polo
2,moda rapido,62.29,95.9,0.35,Colourblocked Round Neck
3,roadster,62.29,95.9,0.35,Camouflage Round Neck
4,dillinger,61.6,123.34,0.5,Colourblocked Round Neck Navy Blue


# Random Forest

In [115]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Convert String Columns to Numbers

In [108]:
# Label-encode the text columns: each distinct string is replaced by its integer category code.
for text_column in ('Brand', 'Description'):
  df[text_column] = df[text_column].astype('category').cat.codes

In [109]:
# Target is the selling price; every other column is used as a feature.
# NOTE(review): X keeps 'Original_Price(GHS)' and 'Discount', from which the
# selling price can presumably be computed directly — check for target leakage
# before trusting the scores below.
TARGET_COLUMN = 'Selling_Price(GHS)'
y = df[TARGET_COLUMN]
X = df.drop(columns=TARGET_COLUMN)

# Training and Test split

In [110]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Random Forest Regressor Model

In [111]:
# Forest of 100 trees with a fixed seed for reproducible fits.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

In [112]:
# Fit the forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [113]:
# Predict selling prices for the held-out rows.
y_pred = rf_model.predict(X=X_test)

# Evaluating Model Performance

### mean absolute error

In [116]:
# Mean absolute error on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.15620109817984448

### mean squared error

In [117]:
# Mean squared error on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

12.237112394541432

### r2 score

In [119]:
# Coefficient of determination (R^2) on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9987197576981851