# Approach 2: XGBoost Regressor

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle

# Silence deprecation noise and pandas' chained-assignment warning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None  # default='warn'


# Load the listings, discard the location columns, turn the 'missing'
# placeholder into a real NA, and require a condition and a title status
df = (
    pd.read_pickle('pickle_files/01_df.pkl')
    .drop(columns=['region', 'state'])
    .replace({'missing': pd.NA})
    .dropna(subset=['condition', 'title_status'])
)

# Keep listings priced at $1k or more; the odometer reading becomes 'miles'
df = df[df['price'] >= 1000].rename(columns={'odometer': 'miles'})

# Approximate age of the car in years: time between the parsed model year
# and the posting date, floored at zero, expressed in 365-day years
model_year_start = pd.to_datetime(df['year'], format='%Y')
age = (df['posting_date'] - model_year_start).clip(pd.Timedelta('0 days'))
df['car_age_years'] = age / pd.Timedelta('365 days')
df = df.drop(columns=['posting_date', 'year'])

df['car_age_years'].head()

def get_manufacturer(s: str) -> str:
    """Return the manufacturer name derived from a raw model string.

    Strings containing 'Mazda', 'Alfa Romeo' or 'Range Rover' (checked in
    that order) map to a fixed manufacturer name. Otherwise the first
    whitespace-separated word is used, with 'Mercur' expanded to 'Mercury'.
    """
    # Substring marker -> manufacturer, evaluated in order
    special_cases = (
        ('Mazda', 'Mazda'),
        ('Alfa Romeo', 'Alfa-Romeo'),
        ('Range Rover', 'Land Rover'),
    )
    for marker, manufacturer in special_cases:
        if marker in s:
            return manufacturer

    first_word = s.split()[0]
    if first_word == 'Mercur':
        return 'Mercury'
    return first_word

# Derive the manufacturer column from the raw model string of each listing
df['manufacturer'] = df['model'].apply(get_manufacturer)
df['manufacturer'].value_counts().head()

def get_model(s: str) -> str:
    """Return the model name derived from a raw model string.

    Strings containing 'Mazda' are returned unchanged. Strings containing
    'Alfa Romeo' or 'Range Rover' map to a single fixed model name each.
    Any other string has its first word (the manufacturer) removed and the
    remaining words joined with single spaces.
    """
    if 'Mazda' in s:
        return s

    # NOTE(review): 'Stevio' looks like a misspelling of 'Stelvio'; it is
    # kept as-is because it is the label downstream encoders are fitted on.
    fixed_names = {
        'Alfa Romeo': 'Stevio',
        'Range Rover': 'Range Rover Sport',
    }
    for marker, model_name in fixed_names.items():
        if marker in s:
            return model_name

    words = s.split()
    return ' '.join(words[1:])

# Replace the raw string with the bare model name.
# Note no model name exists with 2 manufacturers
df['model'] = df['model'].apply(get_model)
df['model'].value_counts().head()


# For each model, find its most common type and assign that type to every
# listing of the model, so each model ends up with exactly one type.
# A single grouped aggregation replaces rescanning the whole frame once
# per model.
dominant_type = df.groupby('model')['type'].agg(
    # value_counts() ignores NA and orders by frequency; a model with no
    # recorded type at all keeps NA instead of raising IndexError
    lambda types: types.value_counts().index[0] if types.notna().any() else pd.NA
)
df['type'] = df['model'].map(dominant_type)


print(
    'number of model-type unique pairs: '
    f'{df[["model", "type"]].value_counts().shape[0]}'
)

# Exclude the single observation with 12 cylinders
df = df.loc[df['cylinders'].ne('12 cylinders')]

# Exclude the <50 observations with a 'parts only' title
df = df.loc[df['title_status'].ne('parts only')]

# Represent the remaining missing values with the literal string 'nan'
df = df.fillna('nan')

# Keep a reference to the cleaned frame before it is narrowed and encoded
df_raw = df

# Categorical predictors
factor_vars = ['model', 'manufacturer', 'cylinders']

# Target first, then the numeric predictor, then the categorical predictors
df = df.loc[:, ['miles', 'car_age_years', *factor_vars]]


# Encode each categorical variable as integer codes, keeping the fitted
# encoder per column so the same mapping can be reused at prediction time
label_encoders = {}
for var in factor_vars:
    le = LabelEncoder().fit(df[var])
    df[var] = le.transform(df[var])
    label_encoders[var] = le

# Persist the encoders; the context manager guarantees the file is flushed
# and closed, which a bare open() passed to pickle.dump does not
with open('pickle_files/ad_label_encoders.pkl', 'wb') as encoders_file:
    pickle.dump(label_encoders, encoders_file)

# Hold out 20% of the listings for testing; the seed fixes the split
train, test = train_test_split(df, test_size=0.20, random_state=0)

# Predictors are every column except the target 'miles'
X_train = train.drop('miles', axis=1)
X_test = test.drop('miles', axis=1)

y_train = train['miles']
y_test = test['miles']

# NOTE: these are aliases, not copies, so the prediction column added
# below also appears on X_train / X_test
xgb_X_train = X_train
xgb_X_test = X_test

xgb_reg = XGBRegressor(
    n_estimators=300,
    max_depth=7,
    random_state=0,
    # The estimator's parameter is `eval_metric` (singular); the previous
    # spelling `eval_metrics` is not a recognised parameter
    eval_metric=mean_squared_error,
    verbosity=0
)
xgb_reg.fit(xgb_X_train, y_train)

# Store the predictions next to the features; each right-hand side is
# evaluated before its column is added, so predict() sees features only
xgb_X_train['predicted_mileage'] = xgb_reg.predict(xgb_X_train)
xgb_X_test['predicted_mileage'] = xgb_reg.predict(xgb_X_test)

# Stack train and test predictions back together, keeping the original
# row index so they can be matched to the listings
df_pred = pd.concat([xgb_X_train, xgb_X_test], ignore_index=False)

df_pred = df_pred[['predicted_mileage']]

# Attach the predicted mileage to each listing by index
final_df = df.merge(df_pred, left_index=True, right_index=True)

# A listing is anomalous when the predicted mileage exceeds the reported
# mileage by more than 20%. Both columns are read from final_df so the
# comparison does not depend on df and final_df having identical indexes.
# NOTE(review): is_anomaly() uses a 1.4 multiplier - confirm which is intended.
final_df['anomaly'] = final_df['predicted_mileage'] > final_df['miles'] * 1.2

number of model-type unique pairs: 200


In [None]:
import pickle

In [None]:
# Persist the fitted regressor; the context manager closes the file handle,
# which a bare open() passed to pickle.dump does not
with open('pickle_files/ad_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_reg, model_file)

In [None]:
# Reload the persisted regressor from disk
with open('pickle_files/ad_model.pkl', 'rb') as model_file:
    ad_model = pickle.load(model_file)

# Reload the per-column label encoders from disk
with open('pickle_files/ad_label_encoders.pkl', 'rb') as encoders_file:
    ad_label_encoders = pickle.load(encoders_file)

In [None]:
def is_anomaly(x_input, encoders, ad_model, threshold=1.4):
    """Flag listings whose reported mileage is well below the predicted one.

    Args:
        x_input: DataFrame with the columns ``miles``, ``car_age_years``,
            ``model``, ``manufacturer`` and ``cylinders``. Columns are
            selected by name, so their order does not matter, and the
            frame itself is not modified.
        encoders: Mapping from each categorical column name (``model``,
            ``manufacturer``, ``cylinders``) to a fitted encoder exposing
            ``transform``.
        ad_model: Fitted regressor exposing ``predict``.
        threshold: Multiplier applied to the reported mileage; a listing is
            anomalous when the prediction exceeds ``miles * threshold``.
            Defaults to 1.4.

    Returns:
        Element-wise boolean result of
        ``predicted_mileage > x_input['miles'] * threshold``.
    """
    factor_vars = ['model', 'manufacturer', 'cylinders']

    actual = x_input['miles']

    # Select the predictors by name, in the column order used for training,
    # and copy so that encoding never writes into the caller's frame
    features = x_input[['car_age_years'] + factor_vars].copy()

    for var in factor_vars:
        features[var] = encoders[var].transform(features[var])

    pred_mileage = ad_model.predict(features)
    print(pred_mileage)

    return pred_mileage > actual * threshold

# Example invocation
# A single listing expressed as column name -> list of values
data = {'miles': [32485.0], 'car_age_years': [2.459], 
        'model': ['Tundra'], 'manufacturer': ['Toyota'], 
        'cylinders': ['4 cylinders']}

# Create a dataframe from the dictionary
x_input = pd.DataFrame(data)

# Score it with the encoders and model reloaded from disk, so the example
# exercises the persisted artefacts rather than the in-memory encoders
is_anomaly(x_input, ad_label_encoders, ad_model)