In [72]:
import pandas as pd
from os import listdir
import re

import plotly.express as px

##### The goal of cleaning and preparing the data is so that I can perform more useful analsis. Such as counting which car models are the most popular. The data that I have scraped includes a Model + (Some Add-on or package) in the name. So My first goal is to separate the model name from the extra add-ons

### Reading in Listing Data

In [59]:
def capitalize(make):
    
    make = re.sub("_", " ", make)
    make = make.split(" ")
    
    make = [i.capitalize() for i in make]
    
    make = " ".join(make)
    
    return make

def extract_year(name):
    return int(re.search(r"(\d{4})", name).group(1))

def remove_date_from_name(name):
    year = re.search(r"(\d{4})", name).group(1)
    return re.sub(year, '', name)

def clean_mileage(miles):
    
    mileage = re.sub(',', '', miles)
    mileage = re.sub(' mi.', '', mileage)
    
    return int(mileage)

def clean_rating_count(rating_count):
    
    words_to_remove = ['\(', '\)', ' reviews', ',', ' review']
    
    for i in words_to_remove:
        rating_count = re.sub(i, '', rating_count)
    
    return int(rating_count)

def clean_price(price):
    
    price = re.sub(',', '', price)
    price = price.strip('$')
    
    if price == 'Not Priced':
        return None
    
    return int(price)

In [60]:
files = listdir('../data/scraped_car_listings_data/')

data_all = []

for i in files:

    if '.png' in i or '.txt' in i or 'all_makes' in i:
        continue

    file_path = "../data/scraped_car_listings_data/{}".format(i)
    
    df = pd.read_csv(file_path, on_bad_lines = 'skip')
    df['Make'] = i[:-4]
    df = df.drop(columns=["Unnamed: 0"])
    
    data_all.append(df)
    
df_all = pd.concat(data_all, axis = 0, ignore_index=True)

df_transform = df_all.copy()
df_transform = df_transform.dropna()

df_transform['Make'] = df_all['Make'].apply(lambda x: capitalize(x))
df_transform['Name'] = df_transform['Name'].apply(lambda x: remove_date_from_name(x))
df_transform['Year'] = df_all['Name'].apply(lambda x: extract_year(x))
df_transform['Mileage'] = df_transform['Mileage'].apply(lambda x: clean_mileage(x))
df_transform['Rating Count'] = df_transform['Rating Count'].apply(lambda x: clean_rating_count(x))
df_transform['Price'] = df_transform['Price'].apply(lambda x: clean_price(x))

df_transform = df_transform.dropna()


df_transform.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119643 entries, 0 to 122244
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Name          119643 non-null  object 
 1   Mileage       119643 non-null  int64  
 2   Dealer Name   119643 non-null  object 
 3   Rating        119643 non-null  float64
 4   Rating Count  119643 non-null  int64  
 5   Price         119643 non-null  float64
 6   Make          119643 non-null  object 
 7   Year          119643 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.2+ MB


In [61]:
df_transform.head()

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Rating Count,Price,Make,Year
0,Acura TLX FWD,20051,Niello Acura,4.6,132,32568.0,Acura,2020
1,Acura MDX Technology,19061,Ed Voyles Acura,3.9,86,58991.0,Acura,2022
2,Acura TLX FWD,30131,Hiley Acura,3.7,61,31445.0,Acura,2020
3,Acura RDX,12097,Acura of Boston,4.7,600,40595.0,Acura,2019
4,Acura RDX A-Spec,16250,Fountain Acura,4.4,314,43997.0,Acura,2021


##### This Dataset contains all Car Models as well as some specs about each car. My goal is eventually join this data set with the data I have scraped.

In [62]:
df_makes_models_orig = pd.read_csv('../data/all_makes.csv')
df_makes_models = df_makes_models_orig
df_makes_models.head()

Unnamed: 0,Year,Make,Model,Category,Make-Model
0,2020,Acura,RLX,Sedan,Acura RLX
1,2020,Acura,TLX,Sedan,Acura TLX
2,2020,Acura,MDX,SUV,Acura MDX
3,2020,Acura,RDX,SUV,Acura RDX
4,2019,Acura,MDX Sport Hybrid,SUV,Acura MDX Sport Hybrid


##### This cell filters the make-model dataset to only include the car Makes that I have scraped

In [63]:
df_makes_models_filtered = pd.DataFrame()

capitalized_makes = [
    "Acura",
    "Buick",
    "Cadillac",
    "Chevrolet",
    "Chrysler",
    "GMC",
    "Ford",
    "Honda",
    "INFINITI",
    "Jeep",
    "Kia",
    "Mitsubishi",
    "Nissan",
    "Porsche",
    "Ram",
    "Subaru",
    "Toyota",
    "Volkswagen",
    "Volvo",
    "Alfa Romeo",
    "Rolls-Royce",
    "MINI",
    "FIAT",
    "Aston Martin",
    "Maserati",
    "BMW",
    "Mercedes-Benz"
]

makes = [
    "acura",
    "buick",
    "cadillac",
    "chevrolet",
    "chrysler",
    "gmc",
    "ford",
    "honda",
    "infiniti",
    "jeep",
    "kia",
    "mitsubishi",
    "nissan",
    "porsche",
    "ram",
    "subaru",
    "toyota",
    "volkswagen",
    "volvo",
    "alfa_romeo",
    "rolls_royce",
    "mini",
    "fiat",
    "aston_martin",
    "maserati",
    "bmw",
    "mercedes_benz"
]


original_names = [capitalize(i) for i in makes]

conversion = {k: v for k, v in zip(capitalized_makes, original_names)}


for i in capitalized_makes:
    temp_df = df_makes_models[df_makes_models['Make'] == i]
    temp_df['Make'] = conversion[i]
    
    if len(temp_df) == 0:
        print("{} is not found in the dataframe.".format(i))
        
    else:
        
        df_makes_models_filtered = pd.concat([df_makes_models_filtered, temp_df])

df_makes_models = df_makes_models_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Make'] = conversion[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Make'] = conversion[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Make'] = conversion[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [55]:
# df_makes_models.to_csv('../data/makes_models.csv', index=False)

In [64]:
df_makes_models.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6940 entries, 0 to 6939
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        6940 non-null   int64 
 1   Make        6940 non-null   object
 2   Model       6940 non-null   object
 3   Category    6940 non-null   object
 4   Make-Model  6940 non-null   object
dtypes: int64(1), object(4)
memory usage: 325.3+ KB


##### This loop checks if there are exact substring matches of a car model. 
-  This makes it so that cars like 'BMW 650 Gran Coupe i xDrive' can be grouped into '6 series' vehicles.
- We can also ignore technology packages for certain queries. Overall, it will help with grouping and aggregating data

In [66]:
makes_models = df_makes_models['Model'].unique().tolist()

RAM_models = ['RAM 1500', 'RAM 2500', 'RAM 3500', 'RAM ProMaster']
Rolls_Royce_models = ['Rolls-Royce Cullinan']
Aston_Martin_models = ['DBX', 'Vantage']
Nissan_models = ['Frontier', 'Titan']
Toyota_models = ['Tacoma', 'Tundra', 'GR86']
GMC_models = ['Canyon']
Chevrolet_models = ['Colorado']
Volkswagen_models = ['ID.4', 'Taos']
Ford_models = ['Ranger', 'Maverick', 'Transit']



makes_models.extend(RAM_models)
makes_models.extend(Rolls_Royce_models)
makes_models.extend(Aston_Martin_models)
makes_models.extend(Nissan_models)
makes_models.extend(Toyota_models)
makes_models.extend(GMC_models)
makes_models.extend(Chevrolet_models)
makes_models.extend(Volkswagen_models)
makes_models.extend(Ford_models)

def string_together_amg(name):
    name = re.sub('Benz AMG', 'Benz-AMG', name)
    return name

def partial_match(name):

    for i in makes_models:
        
        lower = i.lower()
        
        
        # checking for substring matches
        if lower in name.lower():
            return i
        
        # matching BMW's
        elif 'bmw' in lower:
            res = re.search(r"(\d{3})", name)
            if res:
                return "{} Series".format(res.group(1)[0])
            
df_transform['Name'] = df_transform['Name'].apply(lambda x: string_together_amg(x))
df_transform['Make-Model'] = df_transform['Name'].apply(lambda x: partial_match(x))

In [67]:
df_missing = df_transform[df_transform['Make-Model'].isna()]
df_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1835 entries, 27860 to 37958
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          1835 non-null   object 
 1   Mileage       1835 non-null   int64  
 2   Dealer Name   1835 non-null   object 
 3   Rating        1835 non-null   float64
 4   Rating Count  1835 non-null   int64  
 5   Price         1835 non-null   float64
 6   Make          1835 non-null   object 
 7   Year          1835 non-null   int64  
 8   Make-Model    0 non-null      object 
dtypes: float64(2), int64(3), object(4)
memory usage: 143.4+ KB


In [71]:
df_makes_models[df_makes_models['Make'] == 'Ford']['Model'].value_counts()

Mustang                        30
Explorer                       30
F150 Regular Cab               28
Taurus                         28
F150 Super Cab                 28
                               ..
Festiva                         2
F150 (Heritage) Regular Cab     1
F150 (Heritage) Super Cab       1
Bronco Sport                    1
Transit Connect Cargo Van       1
Name: Model, Length: 87, dtype: int64

In [68]:
'''
% Non-matching before adding the changes
Ram              99.254948
Bmw              39.001122
Gmc              35.266221
Ford             34.450283
Chevrolet        26.356108
Rolls Royce      21.290323
Toyota           18.180016
Mercedes Benz    13.743729
Aston Martin     12.790698
Nissan            9.816741
Volkswagen        3.923297
Mini              2.904564
Kia               1.768415
Jeep              0.652277
'''

# Returns a series indicating what % of each Make is not yet matches
def percent_non_match(df_percent_non_match):
    percent_null = df_percent_non_match[df_percent_non_match['Make-Model'].isna()]['Make'].value_counts() * 100 / df_percent_non_match['Make'].value_counts()
    return percent_null[~percent_null.isna()].sort_values(ascending=False)


# New Missing percentages
percent_non_match(df_transform)

Ford    18.191732
Name: Make, dtype: float64

In [79]:
data = pd.DataFrame(df_transform[df_transform['Make'] == 'Acura']['Make-Model'].value_counts())
data = data.rename(columns = {'Make-Model': 'Count'})
data

Unnamed: 0,Count
MDX,1064
RDX,748
TLX,573
ILX,112
RLX,20
NSX,8
