In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import statsmodels.api as sm
from datetime import timedelta, datetime

In [2]:
directory = 'C:\\Users\\Emma Hegermiller\\Git\\price-prediction\\data'


df_train = pd.read_csv(directory + '\\train_initial.csv').drop(columns=['Unnamed: 0'])
df_test = pd.read_csv(directory + '\\test_initial.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Calculate data values for imputation

imputation_dict = {}

# Get median values for imputation of numeric variables
numeric_cols = ['mileage', 'accident_count',
       'fuel_economy_city', 'fuel_economy_highway', 'msrp']
for col in numeric_cols:
    median = df_train[col].median()
    imputation_dict[col] = median
    
# Get imputation values for boolean variables
boolean_cols = ['is_cpo', 'seller_is_franchise_dealer', 'seller_is_online_only', 'seller_ships_nationwide']
boolean_dict = {}
for col in boolean_cols:
    if len(df_train[df_train[col]==True])==0:
        imputation_dict[col] = True
    if len(df_train[df_train[col]==False])==0:
        imputation_dict[col] = False
    else:
        pass

# Get imputation for categorical variables with majority categies with subcategory counts < 0.1%
categorical_cols = ["trim", "transmission", "exterior_color"]
for col in categorical_cols:
    imputation_dict[col] = 'Other'

print(imputation_dict)

{'mileage': 6054.0, 'accident_count': 0.0, 'fuel_economy_city': 30.0, 'fuel_economy_highway': 38.0, 'msrp': 25075.0, 'is_cpo': False, 'seller_is_franchise_dealer': False, 'seller_is_online_only': True, 'seller_ships_nationwide': False, 'trim': 'Other', 'transmission': 'Other', 'exterior_color': 'Other'}


In [4]:
# Group by observations to summarize 

def summarize_vin(group1):
    """Summarizes observations using mode for categorical variables and median for numeric variables"""
    if len(group1) > 1:
        for col in group1.columns:
            if group1[col].count() > 1:
                if np.issubdtype(group1[col], np.number):
                    group1[col] = group1[col].median()
                else:
                    group1[col] = group1[col].value_counts().index[0]
    return group1

def groupby_vehicle(data):
    """Groups by vin and summarizes categorical and numberic variables that are considered static vehicle attributes"""
    static_cols = ['vin',
                   'year', 
                   'make', 
                   'model', 
                   'trim', 
                   'body_style', 
                   'transmission', 
                   'fuel_economy_city', 
                   'fuel_economy_highway', 
                   'exterior_color']
    df_out = data.groupby('vin')[static_cols].apply(summarize_vin).drop_duplicates()
    return df_out

def groupby_listing(data):
    """Groups by vin and listing_date_begin. 
    Summarizes categorical and numberic variables that are considered static vehicle attributes"""
    dynamic_cols = ['vin', 
                'listing_date_begin',
                'mileage', 
                'accident_count',
                'is_cpo',
                'seller_city', 
                'seller_state', 
                'seller_type',
                'seller_is_franchise_dealer', 
                'seller_is_online_only',
                'seller_ships_nationwide', 
                'msrp',
                'list_price']
    df_out = data.groupby(['vin', 'listing_date_begin'])[dynamic_cols].apply(summarize_vin).drop_duplicates()
    return df_out
    
def summarize_obs(data):
    df_vehicle = groupby_vehicle(data)
    df_listing = groupby_listing(data)
    df_out = df_listing.join(df_vehicle.set_index("vin"), on="vin")
    return df_out
    
df_tidy = summarize_obs(df_train)

print("Columns: n/{}".format(df_tidy.columns))
print("Length: {}".format(len(df_tidy)))
print(df_tidy.sort_values(["vin", "listing_date_begin"]).head(100))

# Test
df_tidy_test = summarize_obs(df_test)


Columns: n/Index(['vin', 'listing_date_begin', 'mileage', 'accident_count', 'is_cpo',
       'seller_city', 'seller_state', 'seller_type',
       'seller_is_franchise_dealer', 'seller_is_online_only',
       'seller_ships_nationwide', 'msrp', 'list_price', 'year', 'make',
       'model', 'trim', 'body_style', 'transmission', 'fuel_economy_city',
       'fuel_economy_highway', 'exterior_color'],
      dtype='object')
Length: 887
                   vin listing_date_begin  mileage  accident_count is_cpo  \
140  19XFC1F30LE004389         2021-04-23   6874.0             0.0   True   
87   19XFC1F30LE004859         2021-04-01   5601.0             0.0   True   
0    19XFC1F30LE007261         2021-02-10   1924.0             0.0    NaN   
14   19XFC1F30LE014498         2021-03-26  14397.0             0.0    NaN   
285  19XFC1F30LE015490         2021-01-15   2769.0             0.0    NaN   
..                 ...                ...      ...             ...    ...   
147  19XFC1F38LE006505       

In [5]:
# Impute missing values

def impute_variables(data):
    """
    Use dataset values for imputation of missing variables
    """
    return data.fillna(value=imputation_dict).replace({True:1, False:0})

df_impute = impute_variables(df_tidy)

print("Columns: n/{}".format(df_impute.columns))
print("Length: {}".format(len(df_impute)))
print(df_impute.sort_values(["vin", "listing_date_begin"]).head(100))

# Test
df_impute_test = impute_variables(df_tidy_test)

Columns: n/Index(['vin', 'listing_date_begin', 'mileage', 'accident_count', 'is_cpo',
       'seller_city', 'seller_state', 'seller_type',
       'seller_is_franchise_dealer', 'seller_is_online_only',
       'seller_ships_nationwide', 'msrp', 'list_price', 'year', 'make',
       'model', 'trim', 'body_style', 'transmission', 'fuel_economy_city',
       'fuel_economy_highway', 'exterior_color'],
      dtype='object')
Length: 887
                   vin listing_date_begin  mileage  accident_count  is_cpo  \
140  19XFC1F30LE004389         2021-04-23   6874.0             0.0       1   
87   19XFC1F30LE004859         2021-04-01   5601.0             0.0       1   
0    19XFC1F30LE007261         2021-02-10   1924.0             0.0       0   
14   19XFC1F30LE014498         2021-03-26  14397.0             0.0       0   
285  19XFC1F30LE015490         2021-01-15   2769.0             0.0       0   
..                 ...                ...      ...             ...     ...   
147  19XFC1F38LE006505

In [6]:
# Date Handling

def handle_date(data):
    #Extracting Yea
    data['listing_year'] = pd.to_datetime(data['listing_date_begin']).dt.year
    #Extracting Month
    map_month = {1:'Jan', 2:'Feb', 3:'Mar', 
                 4:'Apr', 5:'May', 6:'Jun', 
                 7:'Jul', 8:'Aug', 9:'Sep', 
                 10:'Oct', 11:'Nov', 12:'Dec'}
    data['listing_month'] = pd.to_datetime(data['listing_date_begin']).dt.month.replace(map_month)
    return data

df_date = handle_date(df_impute)

print("Columns: n/{}".format(df_date.columns))
print("Length: {}".format(len(df_date)))
print(df_date.sort_values(["vin", "listing_date_begin"]).head(100))

# Test
df_date_test = handle_date(df_impute_test)

Columns: n/Index(['vin', 'listing_date_begin', 'mileage', 'accident_count', 'is_cpo',
       'seller_city', 'seller_state', 'seller_type',
       'seller_is_franchise_dealer', 'seller_is_online_only',
       'seller_ships_nationwide', 'msrp', 'list_price', 'year', 'make',
       'model', 'trim', 'body_style', 'transmission', 'fuel_economy_city',
       'fuel_economy_highway', 'exterior_color', 'listing_year',
       'listing_month'],
      dtype='object')
Length: 887
                   vin listing_date_begin  mileage  accident_count  is_cpo  \
140  19XFC1F30LE004389         2021-04-23   6874.0             0.0       1   
87   19XFC1F30LE004859         2021-04-01   5601.0             0.0       1   
0    19XFC1F30LE007261         2021-02-10   1924.0             0.0       0   
14   19XFC1F30LE014498         2021-03-26  14397.0             0.0       0   
285  19XFC1F30LE015490         2021-01-15   2769.0             0.0       0   
..                 ...                ...      ...          

In [7]:
# Split features with where it makes sense
df_date['seller_city'] = df_date.seller_city.str.split(",").str[0]

# Test
df_date_test['seller_city'] = df_date_test.seller_city.str.split(",").str[0]

In [8]:
# Handling Outliers

def handle_outliers(data):
    numeric_cols = ['mileage', 'accident_count',
       'fuel_economy_city', 'fuel_economy_highway', 'msrp']
    print(data.describe())
    for col in numeric_cols:
        upper_lim = data[col].quantile(.95)
        lower_lim = data[col].quantile(.05)
        data_out = data[(data[col] < upper_lim) & (data[col] > lower_lim)]
    return data_out

df_outlier = handle_outliers(df_date)

print("Columns: n/{}".format(df_outlier.columns))
print("Length: {}".format(len(df_outlier)))
print(df_outlier.describe())
print(df_outlier.sort_values(["vin", "listing_date_begin"]).head(100))

# Test
df_outlier_test = handle_outliers(df_date_test)

            mileage  accident_count      is_cpo  seller_is_franchise_dealer  \
count    887.000000      887.000000  887.000000                  887.000000   
mean    7923.157835        0.032694    0.301015                    0.830891   
std     7334.233183        0.177936    0.458958                    0.375060   
min       56.000000        0.000000    0.000000                    0.000000   
25%     2740.000000        0.000000    0.000000                    1.000000   
50%     6054.000000        0.000000    0.000000                    1.000000   
75%    10902.500000        0.000000    1.000000                    1.000000   
max    66099.500000        1.000000    1.000000                    1.000000   

       seller_is_online_only  seller_ships_nationwide           msrp  \
count             887.000000               887.000000     887.000000   
mean                0.958286                 0.094701   27555.648782   
std                 0.200047                 0.292967   10377.321878   


In [9]:
# Normalization
def normalize(data):
    numeric_cols = ['mileage', 'accident_count','fuel_economy_city', 'fuel_economy_highway', 'msrp']
    for col in numeric_cols:
        data[col+'_normalized'] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
    return data

df_norm = normalize(df_outlier)

print("Columns: n/{}".format(df_norm.columns))
print("Length: {}".format(len(df_norm)))
print(df_norm.describe())
print(df_norm.sort_values(["vin", "listing_date_begin"]).head(100))

# Test
df_norm_test = normalize(df_outlier_test)

Columns: n/Index(['vin', 'listing_date_begin', 'mileage', 'accident_count', 'is_cpo',
       'seller_city', 'seller_state', 'seller_type',
       'seller_is_franchise_dealer', 'seller_is_online_only',
       'seller_ships_nationwide', 'msrp', 'list_price', 'year', 'make',
       'model', 'trim', 'body_style', 'transmission', 'fuel_economy_city',
       'fuel_economy_highway', 'exterior_color', 'listing_year',
       'listing_month', 'mileage_normalized', 'accident_count_normalized',
       'fuel_economy_city_normalized', 'fuel_economy_highway_normalized',
       'msrp_normalized'],
      dtype='object')
Length: 797
            mileage  accident_count      is_cpo  seller_is_franchise_dealer  \
count    797.000000      797.000000  797.000000                  797.000000   
mean    7998.853827        0.035132    0.299875                    0.814304   
std     7276.293155        0.184228    0.458491                    0.389106   
min       56.000000        0.000000    0.000000              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col+'_normalized'] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col+'_normalized'] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())


In [10]:
# One hot encoding
def onehot(data):
    onehot_cols = ['year', 'make', 'model', 'body_style', 'seller_city', 
                   'seller_state', 'seller_type', 'listing_year', 'listing_month']
    df_out = data
    for col in onehot_cols:
        encoded_columns = pd.get_dummies(data[col], prefix=col)
        df_out = df_out.join(encoded_columns).drop(col, axis=1)
    return df_out

df_onehot = onehot(df_norm)

print("Columns: n/{}".format(df_onehot.columns))
print("Length: {}".format(len(df_onehot)))
print(df_onehot.sort_values(["vin", "listing_date_begin"]).head(100))

# Test
df_onehot_test = onehot(df_norm_test)

Columns: n/Index(['vin', 'listing_date_begin', 'mileage', 'accident_count', 'is_cpo',
       'seller_is_franchise_dealer', 'seller_is_online_only',
       'seller_ships_nationwide', 'msrp', 'list_price',
       ...
       'listing_month_Dec', 'listing_month_Feb', 'listing_month_Jan',
       'listing_month_Jul', 'listing_month_Jun', 'listing_month_Mar',
       'listing_month_May', 'listing_month_Nov', 'listing_month_Oct',
       'listing_month_Sep'],
      dtype='object', length=496)
Length: 22259
                   vin listing_date_begin  mileage  accident_count  is_cpo  \
140  19XFC1F30LE004389         2021-04-23   6874.0             0.0       1   
87   19XFC1F30LE004859         2021-04-01   5601.0             0.0       1   
0    19XFC1F30LE007261         2021-02-10   1924.0             0.0       0   
14   19XFC1F30LE014498         2021-03-26  14397.0             0.0       0   
285  19XFC1F30LE015490         2021-01-15   2769.0             0.0       0   
..                 ...        

In [11]:
# Make, model, body_style price

In [12]:
# Text feature extraction
# # vectorizing trim
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_features=250000)

# train_trim_tfidf = vectorizer.fit_transform(['trim'])
# # test_trim_tfidf = vectorizer.transform(test['trim'])


# # vectorixing transmission
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=500000)

# train_transmission_tfidf = vectorizer.fit_transform(train_df['transmission'])
# # test_transmission_tfidf = vectorizer.transform(test['transmission'])

# # vectorixing exterior_color
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=500000)

# train_exterior_color_tfidf = vectorizer.fit_transform(train_df['exterior_color'])
# # test_exterior_color_tfidf = vectorizer.transform(test['exterior_color'])

In [13]:
df_train_final = df_onehot.drop(columns=['vin','listing_date_begin', 'trim', 'transmission', 'exterior_color'])
df_test_final = df_onehot_test.drop(columns=['vin','listing_date_begin', 'trim', 'transmission', 'exterior_color'])

In [14]:
# Confirm no missing values 
# Train
row_total = len(df_train_final)
for col in df_train_final.columns:
    if df_train_final[col].isna().sum()/row_total > 0:
        print(df_train_final[col].isna().sum()/row_total)
    else:
        pass

# Test
row_total_test = len(df_test_final)
for col in df_test_final.columns:
    if df_test_final[col].isna().sum()/row_total_test > 0:
        print(df_test_final[col].isna().sum()/row_total_test)
    else:
        pass

In [15]:
# Confirm no categorical variables
# Train
pd.set_option('display.max_rows', 500)
df_train_final.dtypes

# Test
print(df_test_final.dtypes)

mileage                             float64
accident_count                      float64
is_cpo                                int64
seller_is_franchise_dealer            int64
seller_is_online_only                 int64
seller_ships_nationwide               int64
msrp                                float64
list_price                          float64
fuel_economy_city                   float64
fuel_economy_highway                float64
mileage_normalized                  float64
accident_count_normalized           float64
fuel_economy_city_normalized        float64
fuel_economy_highway_normalized     float64
msrp_normalized                     float64
year_2020.0                           uint8
year_2021.0                           uint8
make_Ford                             uint8
make_Honda                            uint8
model_Civic                           uint8
model_Explorer                        uint8
model_Insight                         uint8
model_Mustang                   

In [16]:
# Export data 
df_train_final.to_csv(directory + '\\train.csv')
df_test_final.to_csv(directory + '\\test.csv')