In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import IsolationForest

# stops the warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# Approach 1: Isolation Forest

In [2]:
# Load the input DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('pickle_files/01_df.pkl')

In [3]:
# Specify columns to use for anomaly detection
columns = ['condition', 'model', 'odometer', 'title_status', 'transmission', 'type', 'year']

# Work on an explicit copy so the encoding below never writes through to
# (or raises chained-assignment warnings about) the original DataFrame
df_subset = df[columns].copy()

# Convert the string columns to integer codes using a dense rank of their
# values (reverse sort order). The codes are identifiers only; their numeric
# order carries no meaning.
for col in ['condition', 'model', 'title_status', 'transmission', 'type']:
    df_subset[col] = df_subset[col].rank(method='dense', ascending=False).astype(int)

# Instantiate the IsolationForest with default hyper-parameters. The seed is
# fixed so that re-running the notebook reproduces the same scores.
model = IsolationForest(random_state=0)

# Fit the model to the subset DataFrame
model.fit(df_subset)

# Score every row. For decision_function, LOWER means more abnormal:
# negative scores are outliers, positive scores are inliers.
anomaly_scores = model.decision_function(df_subset)

# Add the anomaly scores to the original DataFrame
df['anomaly_score'] = anomaly_scores

# Rows with a negative score are the potential anomalies
anomalies = df[df['anomaly_score'] < 0]

# 1 = flagged as anomaly, 0 = inlier
df['anomaly'] = (df['anomaly_score'] < 0).astype(int)

In [4]:
# Display the frame, which now includes the anomaly_score and anomaly columns
df

Unnamed: 0,price,condition,cylinders,drive,fuel,lat,long,manufacturer,model,odometer,paint_color,posting_date,region,state,title_status,transmission,type,year,anomaly_score,anomaly
27,33590,good,8 cylinders,missing,gas,32.590000,-85.480000,gmc,GMC Sierra,57923.0,white,2021-05-04,auburn,al,clean,other,pickup,2014.0,0.015333,0
28,22590,good,8 cylinders,missing,gas,32.590000,-85.480000,chevrolet,Chevrolet Silverado,71229.0,blue,2021-05-04,auburn,al,clean,other,pickup,2010.0,-0.025646,1
29,39590,good,8 cylinders,missing,gas,32.590000,-85.480000,chevrolet,Chevrolet Silverado,19160.0,red,2021-05-04,auburn,al,clean,other,pickup,2020.0,0.007101,0
30,30990,good,8 cylinders,missing,gas,32.590000,-85.480000,toyota,Toyota Tundra,41124.0,red,2021-05-04,auburn,al,clean,other,pickup,2017.0,-0.008955,1
31,15000,excellent,6 cylinders,rwd,gas,32.592000,-85.518900,ford,Ford F-150,128000.0,black,2021-05-03,auburn,al,clean,automatic,truck,2013.0,0.071968,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426875,23590,good,6 cylinders,fwd,gas,33.786500,-84.445400,nissan,Nissan Maxima,32226.0,missing,2021-04-04,wyoming,wy,clean,other,sedan,2019.0,0.010819,0
426876,30590,good,missing,fwd,gas,33.786500,-84.445400,volvo,Volvo S60,12029.0,red,2021-04-04,wyoming,wy,clean,other,sedan,2020.0,-0.045623,1
426877,34990,good,missing,missing,diesel,33.779214,-84.411811,cadillac,Cadillac XT4,4174.0,white,2021-04-04,wyoming,wy,clean,other,hatchback,2020.0,-0.023379,1
426878,28990,good,6 cylinders,fwd,gas,33.786500,-84.445400,lexus,Lexus ES 350,30112.0,silver,2021-04-04,wyoming,wy,clean,other,sedan,2018.0,0.019488,0


In [None]:
# The Isolation Forest results aren't great. Instead, build a regressor that predicts mileage and flag listings whose actual mileage is well below the prediction. See Approach 2 below.

# Approach 2: XGBoost Regressor

In [5]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# stops the warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None  # default='warn'


# Reload the data, discard the location columns that are not modelled,
# turn the 'missing' placeholder into a real NA, and drop rows lacking
# either of the two modelled columns
df = (
    pd.read_pickle('pickle_files/01_df.pkl')
    .drop(columns=['region', 'state'])
    .replace({'missing': pd.NA})
    .dropna(subset=['condition', 'title_status'])
)

# Keep only cars priced at $1k or more, and call the odometer reading 'miles'
df = df.loc[df['price'] >= 1000].rename(columns={'odometer': 'miles'})

# Approximate age of the car in years: time between 1 January of the model
# year and the posting date, floored at zero and divided by 365 days
df['year'] = pd.to_datetime(df['year'], format='%Y')
age = (df['posting_date'] - df['year']).clip(pd.Timedelta('0 days'))
df['car_age_years'] = age / pd.Timedelta('365 days')
df = df.drop(columns=['posting_date', 'year'])

df['car_age_years'].head()

def get_manufacturer(s: str) -> str:
    """Derive the manufacturer name from a full model string.

    Strings containing 'Mazda', 'Alfa Romeo' or 'Range Rover' map to a fixed
    manufacturer name (checked in that order). Otherwise the first
    whitespace-separated word is the manufacturer, with the truncated
    'Mercur' spelled out as 'Mercury'.
    """
    special_cases = (
        ('Mazda', 'Mazda'),
        ('Alfa Romeo', 'Alfa-Romeo'),
        ('Range Rover', 'Land Rover'),
    )
    for needle, brand in special_cases:
        if needle in s:
            return brand

    first_word = s.split()[0]
    if first_word == 'Mercur':
        return 'Mercury'
    return first_word

# Overwrite the manufacturer column with the name derived from each model string
df['manufacturer'] = df['model'].apply(get_manufacturer)
df['manufacturer'].value_counts().head()

def get_model(s: str) -> str:
    """Derive the model name (without manufacturer) from a full model string.

    - Strings containing 'Mazda' are returned unchanged.
    - Strings containing 'Alfa Romeo' all map to the single model 'Stelvio'.
    - Strings containing 'Range Rover' all map to 'Range Rover Sport'.
    - Otherwise the first whitespace-separated word (the manufacturer) is
      dropped and the remaining words are joined with single spaces; a
      one-word string therefore yields ''.
    """
    if 'Mazda' in s:
        return s
    if 'Alfa Romeo' in s:
        # Fixed spelling: the model is the "Stelvio" (was misspelled 'Stevio')
        return 'Stelvio'
    if 'Range Rover' in s:
        return 'Range Rover Sport'
    return ' '.join(s.split()[1:])

# Replace the full model string with the bare model name.
# Note no model name exists with 2 manufacturers
df['model'] = df['model'].apply(get_model)
df['model'].value_counts().head()


# For each model, get most common type
# and change all other types to that type
def _most_common(values):
    """Return the most frequent non-null value, or pd.NA when there is none."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else pd.NA


# One grouped pass instead of re-scanning the whole frame once per model.
# This also avoids a loop variable named `model`, which would overwrite any
# earlier object bound to that name.
most_common_type_by_model = df.groupby('model')['type'].agg(_most_common)
df['type'] = df['model'].map(most_common_type_by_model)


print(
    'number of model-type unique pairs: '
    f'{df[["model", "type"]].value_counts().shape[0]}'
)

# Drop single observation with 12 cylinders
df = df[df['cylinders'] != '12 cylinders']

# Drop <50 observations with 'parts only' title
df = df[df['title_status'] != 'parts only']

# Categorical feature columns
factor_vars = [
    'condition',
    'title_status',
    'fuel',
    'type',
    'model',
    'manufacturer',
    'cylinders',
    'drive',
    'transmission',
    'paint_color'
]

# Keep only the modelled columns, in the desired order. The explicit copy makes
# the assignments below operate on an independent frame.
df = df[['miles', 'car_age_years']+factor_vars+['long', 'lat']].copy()

# Change na to 'nan' so a missing category becomes a level of its own.
# Only the categorical columns are filled: writing the string 'nan' into a
# numeric column would turn it into an object column.
df[factor_vars] = df[factor_vars].fillna('nan')

# Encode categorical variables as integer codes; the fitted encoders are kept
# so the codes can be mapped back to the original labels later
label_encoders = {}
for var in factor_vars:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    label_encoders[var] = le

# Hold out 20% of the rows; the seed keeps the split reproducible
train, test = train_test_split(df, test_size=0.20, random_state=0)

# 'miles' is the regression target; everything else is a feature
X_train = train.drop('miles', axis=1)
X_test = test.drop('miles', axis=1)

y_train = train['miles']
y_test = test['miles']

# One-hot encode the categorical features. The test matrix is aligned to the
# training columns: a level present only in the training split is added as an
# all-zero column, and a level present only in the test split is dropped.
# Without this, the two matrices can end up with different columns and
# predict() fails.
xgb_X_train = pd.get_dummies(X_train, columns=factor_vars)
xgb_X_test = (
    pd.get_dummies(X_test, columns=factor_vars)
    .reindex(columns=xgb_X_train.columns, fill_value=0)
)

xgb_reg = XGBRegressor(
    n_estimators=300,
    max_depth=7,
    random_state=0,
    # The keyword is `eval_metric` (singular); the misspelled `eval_metrics`
    # is not a recognised parameter.
    eval_metric='rmse',
    verbosity=0
)
xgb_reg.fit(xgb_X_train, y_train)

# Predict mileage for every row (train and test), keyed by the original row
# index. The feature matrices themselves are left untouched so they can be
# passed to predict() again.
df_pred = pd.concat(
    [
        pd.Series(xgb_reg.predict(xgb_X_train), index=xgb_X_train.index),
        pd.Series(xgb_reg.predict(xgb_X_test), index=xgb_X_test.index),
    ]
).to_frame('predicted_mileage')

final_df = df.merge(df_pred, left_index=True, right_index=True)

# Flag a row when the predicted mileage is more than 20% above the recorded
# mileage; tune the ratio to make the flag stricter or looser
ANOMALY_RATIO = 1.2
final_df['anomaly'] = final_df['predicted_mileage'] > final_df['miles'] * ANOMALY_RATIO

number of model-type unique pairs: 200


In [6]:
# Show only the rows flagged as anomalies (the column is already boolean)
final_df[final_df['anomaly']]

Unnamed: 0,miles,car_age_years,condition,title_status,fuel,type,model,manufacturer,cylinders,drive,transmission,paint_color,long,lat,predicted_mileage,anomaly
35,17302.0,5.339726,2,0,2,6,36,6,4,0,2,9,-85.480000,32.590000,22905.876953,True
41,17805.0,7.336986,2,0,4,6,168,31,6,2,2,9,-85.480000,32.590000,24256.939453,True
44,1834.0,2.331507,2,0,4,6,141,9,6,2,2,0,-85.480000,32.590000,9435.153320,True
47,20856.0,3.328767,2,0,2,6,67,9,4,2,2,11,-85.480000,32.590000,32962.175781,True
55,88000.0,17.336986,2,0,0,8,68,9,5,0,0,1,-85.468200,32.547500,198061.656250,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426864,53475.0,5.263014,2,0,2,0,8,31,4,2,0,0,-84.445400,33.786500,75182.601562,True
426866,55612.0,5.263014,2,0,2,0,136,13,5,3,0,0,-84.411811,33.779214,71157.273438,True
426870,3066.0,1.257534,2,0,2,7,157,12,6,1,2,1,-84.445400,33.786500,7295.062012,True
426873,15080.0,3.257534,2,0,2,7,4,7,6,3,0,11,-84.411811,33.779214,22244.552734,True


In [7]:
# A listing is flagged as an anomaly when its predicted mileage is more than 20% greater than its actual mileage. This threshold can be tuned.