# Modeling

### Purpose: Build, train, and tune machine learning models. 
### Apply Feature engineering (PCA, preprocessing/scaling, categorical encoding, binning/clustering)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the clean data
df_model = pd.read_csv('..//data/pricingdata_category.csv')

In [3]:
#identify categorical columns
categorical_columns = df_model.select_dtypes(include=['object']).columns
categorical_columns

Index(['id', 'prices.availability', 'prices.condition', 'prices.currency',
       'prices.dateSeen', 'prices.merchant', 'prices.shipping', 'brand',
       'categories', 'dateAdded', 'dateUpdated', 'name', 'primaryCategories'],
      dtype='object')

In [4]:
df_model.head(5)

Unnamed: 0,id,prices.availability,prices.condition,prices.currency,prices.dateSeen,prices.isSale,prices.merchant,prices.shipping,brand,categories,...,DVDBurners,Ipods&Mp3Players,SeemoreSamsungUN24H4500A24\\720pHDLEDLCDTelevis...,SeemoreUltimateEarsBOOM2PhantomPortableSpeaker...,SeemoreSamsungLevelUProActiveNoiseCancellingWi...,SeemoreJBLGX9633-Way6in.x9in.Speaker,BusinessLaptopPCs,SoundReinforcementMixers,USBMixers,DJMixers
0,AVphzgbJLJeJML43fA0o,In Stock,New,USD,2018-05-26 15:00:00,False,Bestbuy.com,unknown,Sanus,"Audio&VideoAccessories,TVMounts,TVAccessories&...",...,0,0,0,0,0,0,0,0,0,0
1,AVpgMuGwLJeJML43KY_c,In Stock,New,USD,2017-12-14 06:00:00,True,Walmart.com,Expedited,Boytone,"Stereos,PortableBluetoothSpeakers,TV,Video&Hom...",...,0,0,0,0,0,0,0,0,0,0
2,AVpgMuGwLJeJML43KY_c,In Stock,New,USD,2017-09-08 05:00:00,False,Walmart.com,Expedited,Boytone,"Stereos,PortableBluetoothSpeakers,TV,Video&Hom...",...,0,0,0,0,0,0,0,0,0,0
3,AVpgMuGwLJeJML43KY_c,In Stock,New,USD,2017-10-10 05:00:00,False,Bestbuy.com,unknown,Boytone,"Stereos,PortableBluetoothSpeakers,TV,Video&Hom...",...,0,0,0,0,0,0,0,0,0,0
4,AVpgMuGwLJeJML43KY_c,In Stock,New,USD,2017-08-28 07:00:00,False,Bestbuy.com,unknown,Boytone,"Stereos,PortableBluetoothSpeakers,TV,Video&Hom...",...,0,0,0,0,0,0,0,0,0,0


In [5]:
#One hot encoding
df_model = pd.get_dummies(df_model, columns=['id', 'prices.availability', 'prices.condition', 'prices.currency',
       'prices.dateSeen', 'prices.merchant', 'prices.shipping', 'brand',
       'categories', 'dateAdded', 'dateUpdated', 'name', 'primaryCategories'])


In [6]:
df_model.head(5)

Unnamed: 0,prices.isSale,weight_pounds,prices.amountAvg,Audio&VideoAccessories,TVMounts,TVAccessories&Parts,Electronics,A/VPresentation,Accessories&Supplies,TVCeiling&WallMounts,...,name_mophie powerstation Plus Mini External Battery with Built in Cables for Smartphones and Tablets (4,name_myCharge - RAZOR PLUS USB Portable Power Bank - Silver,name_naxa - 2.1-Channel Soundbar with 50-Watt Digital Amplifier - Black,name_sub6 100W 6 Active Subwoofer (Matte Gray),primaryCategories_ Apple CarPlay,primaryCategories_ Intel Celeron,primaryCategories_ Siri Eyes Free,primaryCategories_Electronics,"primaryCategories_Electronics,Furniture","primaryCategories_Electronics,Media"
0,False,32.8,104.99,1,1,1,1,1,1,1,...,False,False,False,False,False,False,False,True,False,False
1,True,14.0,66.995,0,0,0,1,0,0,0,...,False,False,False,False,False,False,False,True,False,False
2,False,14.0,69.0,0,0,0,1,0,0,0,...,False,False,False,False,False,False,False,True,False,False
3,False,14.0,69.99,0,0,0,1,0,0,0,...,False,False,False,False,False,False,False,True,False,False
4,False,14.0,66.99,0,0,0,1,0,0,0,...,False,False,False,False,False,False,False,True,False,False


In [7]:
#Feature generation of time difference between dateAdded and dateUpdated


In [8]:
#Target and features
target = 'prices.amountAvg'
features = df_model.columns.drop(target)

In [9]:
#Split the data into train and test 80/20
X = df_model[features]
y = df_model[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
#Normalization
normalizer = MinMaxScaler()
normalizer.fit(X_train)


In [11]:
#Standardization
scaler = StandardScaler()
scaler.fit(X_train)

In [12]:
#apply PCA to reduce the number of features
pca = PCA(n_components=10)
pca.fit(X_train)


In [13]:
#Explained variance ratio of the PCA components 
explained_variance = pca.explained_variance_ratio_
explained_variance 

array([0.92394657, 0.00599878, 0.00385122, 0.00249657, 0.00205518,
       0.00175194, 0.00143397, 0.0012728 , 0.00114149, 0.00108141])

In [14]:
cumulative_variance = np.cumsum(explained_variance)
cumulative_variance  



array([0.92394657, 0.92994535, 0.93379656, 0.93629314, 0.93834832,
       0.94010026, 0.94153423, 0.94280703, 0.94394851, 0.94502992])

In [15]:
# Identifying the number of components that explain at least 95% of the variance
num_components = np.argmax(cumulative_variance >= 0.95) + 1

In [16]:
#Transforming our training and testing data
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [17]:
X_train_norm

array([[0.        , 0.04297975, 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.01314879, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.02491349, 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.03051903, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.0650519 , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.11211073, 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [18]:
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
X_train_norm.head()

Unnamed: 0,prices.isSale,weight_pounds,Audio&VideoAccessories,TVMounts,TVAccessories&Parts,Electronics,A/VPresentation,Accessories&Supplies,TVCeiling&WallMounts,Stereos,...,name_mophie powerstation Plus Mini External Battery with Built in Cables for Smartphones and Tablets (4,name_myCharge - RAZOR PLUS USB Portable Power Bank - Silver,name_naxa - 2.1-Channel Soundbar with 50-Watt Digital Amplifier - Black,name_sub6 100W 6 Active Subwoofer (Matte Gray),primaryCategories_ Apple CarPlay,primaryCategories_ Intel Celeron,primaryCategories_ Siri Eyes Free,primaryCategories_Electronics,"primaryCategories_Electronics,Furniture","primaryCategories_Electronics,Media"
0,0.0,0.04298,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.013149,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.024913,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.052941,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.002111,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
#Liner regression
#df_model = LinearRegression()
#df_model.fit(X_train_norm, y_train)

# Predictions
#y_pred = df_model.predict(X_test)

# Evaluation
#print('MSE:', mean_squared_error(y_test, y_pred))
#print('R-squared:', r2_score(y_test, y_pred))


MSE: 3.3219540959633825e+31
R-squared: -4.752327710346715e+25


In [20]:
#decision tree
# Initialize and train the model
#df_model = DecisionTreeRegressor(random_state=42)
#df_model.fit(X_train, y_train)

# Predictions and evaluation
#y_pred = df_model.predict(X_test)
#print('MSE:', mean_squared_error(y_test, y_pred))
#print('R-squared:', r2_score(y_test, y_pred))

MSE: 124941.56918796523
R-squared: 0.8212608409768003


In [25]:
#random forest
# Initialize and train the model
df_model = RandomForestRegressor(n_estimators=10, random_state=42)
df_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = df_model.predict(X_test)
print('MSE:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

# Define a reduced parameter grid
#param_grid_reduced = {
    'n_estimators': [int(x) for x in range(100, 600, 100)],  # Fewer values to consider
    'max_features': ['auto', 'sqrt'],  # Keep this the same
    'max_depth': [int(x) for x in range(10, 50, 10)],  # Smaller range
    'min_samples_split': [2, 5],  # Fewer options
    'min_samples_leaf': [1, 2],  # Fewer options
    'bootstrap': [True, False]  # Keep this the same
}

# Setup the randomized search with reduced iterations
#random_search_reduced = RandomizedSearchCV(
    estimator=df_model,
    param_distributions=param_grid_reduced,
    n_iter=10,  # Reduced number of iterations
    cv=3,  # Keep the same cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Perform the randomized search
#random_search.fit(X_train, y_train)

# Print the best parameters found by RandomizedSearchCV
#print(f'Best parameters found: {random_search.best_params_}')

IndentationError: unexpected indent (212557562.py, line 13)

In [None]:
#GBM
df_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
df_model.fit(X_train_norm, y_train)

# Predictions and evaluation
y_pred = df_model.predict(X_test)
print('MSE:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

MSE: 567392.6383147371
R-squared: 0.18829830882179066


In [None]:
#SVR
df_model = SVR()
df_model.fit(X_train_norm, y_train)

# Predictions and evaluation
y_pred = df_model.predict(X_test)
print('MSE:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

MSE: 794928.6394119529
R-squared: -0.13721059704470928


In [None]:
#KNN
df_model = KNeighborsRegressor()
df_model.fit(X_train_norm, y_train)

# Predictions and evaluation
y_pred = df_model.predict(X_test)
print('MSE:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

MSE: 3224980.8177797445
R-squared: -3.61359948480162
