In [1]:
import pandas as pd
from os import listdir
import re

##### The goal of cleaning and preparing the data is so that I can perform more useful analsis. Such as counting which car models are the most popular. The data that I have scraped includes a Model + (Some Add-on or package) in the name. So My first goal is to separate the model name from the extra add-ons

### Reading in Listing Data

In [2]:
def capitalize(make):
    
    make = re.sub("_", " ", make)
    make = make.split(" ")
    
    make = [i.capitalize() for i in make]
    
    make = " ".join(make)
    
    return make

def extract_year(name):
    return int(re.search(r"(\d{4})", name).group(1))

def remove_date_from_name(name):
    year = re.search(r"(\d{4})", name).group(1)
    return re.sub(year, '', name)

def clean_mileage(miles):
    
    mileage = re.sub(',', '', miles)
    mileage = re.sub(' mi.', '', mileage)
    
    return int(mileage)

def clean_rating_count(rating_count):
    
    words_to_remove = ['\(', '\)', ' reviews', ',', ' review']
    
    for i in words_to_remove:
        rating_count = re.sub(i, '', rating_count)
    
    return int(rating_count)

def clean_price(price):
    
    price = re.sub(',', '', price)
    price = price.strip('$')
    
    if price == 'Not Priced':
        return None
    
    return int(price)

In [4]:
files = listdir('../data/')

data_all = []

for i in files:

    if '.png' in i or '.txt' in i or 'all_makes' in i:
        continue

    file_path = "../data/{}".format(i)
    
    df = pd.read_csv(file_path, on_bad_lines = 'skip')
    df['Make'] = i[:-4]
    df = df.drop(columns=["Unnamed: 0"])
    
    data_all.append(df)
    
df_all = pd.concat(data_all, axis = 0, ignore_index=True)

df_transform = df_all.copy()
df_transform = df_transform.dropna()

df_transform['Make'] = df_all['Make'].apply(lambda x: capitalize(x))
df_transform['Name'] = df_transform['Name'].apply(lambda x: remove_date_from_name(x))
df_transform['Year'] = df_all['Name'].apply(lambda x: extract_year(x))
df_transform['Mileage'] = df_transform['Mileage'].apply(lambda x: clean_mileage(x))
df_transform['Rating Count'] = df_transform['Rating Count'].apply(lambda x: clean_rating_count(x))
df_transform['Price'] = df_transform['Price'].apply(lambda x: clean_price(x))

df_transform = df_transform.dropna()


df_transform.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119643 entries, 0 to 122244
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Name          119643 non-null  object 
 1   Mileage       119643 non-null  int64  
 2   Dealer Name   119643 non-null  object 
 3   Rating        119643 non-null  float64
 4   Rating Count  119643 non-null  int64  
 5   Price         119643 non-null  float64
 6   Make          119643 non-null  object 
 7   Year          119643 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.2+ MB


In [21]:
df_transform.head()

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Rating Count,Price,Make,Year
0,Acura TLX FWD,20051,Niello Acura,4.6,132,32568.0,Acura,2020
1,Acura MDX Technology,19061,Ed Voyles Acura,3.9,86,58991.0,Acura,2022
2,Acura TLX FWD,30131,Hiley Acura,3.7,61,31445.0,Acura,2020
3,Acura RDX,12097,Acura of Boston,4.7,600,40595.0,Acura,2019
4,Acura RDX A-Spec,16250,Fountain Acura,4.4,314,43997.0,Acura,2021


##### This Dataset contains all Car Models as well as some specs about each car. My goal is eventually join this data set with the data I have scraped.

In [11]:
df_makes_models_orig = pd.read_csv('../data/all_makes.csv')
df_makes_models = df_makes_models_orig
df_makes_models.head()

Unnamed: 0.1,Unnamed: 0,objectId,Year,Make,Model,Category,createdAt,updatedAt
0,0,ZRgPP9dBMm,2020,Audi,Q3,SUV,2020-01-27T20:44:17.665Z,2020-01-27T20:44:17.665Z
1,1,cptB1C1NSL,2020,Chevrolet,Malibu,Sedan,2020-01-27T20:44:17.665Z,2020-01-27T20:44:17.665Z
2,2,ElhqsRZDnP,2020,Cadillac,Escalade ESV,SUV,2020-01-27T20:44:17.665Z,2020-01-27T20:44:17.665Z
3,3,LUzyWMYJpW,2020,Chevrolet,Corvette,"Coupe, Convertible",2020-01-27T20:44:17.665Z,2020-01-27T20:44:17.665Z
4,4,rDkHakOBKP,2020,Acura,RLX,Sedan,2020-01-27T20:44:17.665Z,2020-01-27T20:44:17.665Z


##### This cell filters the make-model dataset to only include the car Makes that I have scraped

In [37]:
df_makes_models_filtered = pd.DataFrame()

capitalized_makes = [
    "Acura",
    "Buick",
    "Cadillac",
    "Chevrolet",
    "Chrysler",
    "GMC",
    "Ford",
    "Honda",
    "INFINITI",
    "Jeep",
    "Kia",
    "Mitsubishi",
    "Nissan",
    "Porsche",
    "Ram",
    "Subaru",
    "Toyota",
    "Volkswagen",
    "Volvo",
    "Alfa Romeo",
    "Rolls-Royce",
    "MINI",
    "FIAT",
    "Aston Martin",
    "Maserati",
    "BMW",
    "Mercedes-Benz"
]

for i in capitalized_makes:
    temp_df = df_makes_models[df_makes_models['Make'] == i]
    if len(temp_df) == 0:
        print("{} is not found in the dataframe.".format(i))
        
    else:
        
        df_makes_models_filtered = pd.concat([df_makes_models_filtered, temp_df])

df_makes_models = df_makes_models_filtered


In [44]:
df_makes_models.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6940 entries, 4 to 9830
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6940 non-null   int64 
 1   objectId    6940 non-null   object
 2   Year        6940 non-null   int64 
 3   Make        6940 non-null   object
 4   Model       6940 non-null   object
 5   Category    6940 non-null   object
 6   createdAt   6940 non-null   object
 7   updatedAt   6940 non-null   object
dtypes: int64(2), object(6)
memory usage: 488.0+ KB


#### Concatenating Together the Name and Make to search for the specific models in the scraped car listings


- Now, we start the process of cleaning data and looking for mismatches

In [47]:
df_makes_models['Name'] = df_makes_models['Make'] + ' ' + df_makes_models['Model']

In [86]:
makes_models = df_makes_models['Name'].unique().tolist()


# The datasets have a different naming convention for RAM trucks. This addition matches 60% of RAM listings
RAM_models = ['RAM 1500', 'RAM 2500', 'RAM 3500', 'RAM ProMaster']

#

makes_models.extend(RAM_models)

def partial_match(name):

    for i in makes_models:
        if i.lower() in name.lower():
            return i
        

df_transform['Make-Model'] = df_transform['Name'].apply(lambda x: partial_match(x))

In [87]:
df_missing = df_transform[df_transform['Make-Model'].isna()]
df_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14994 entries, 2686 to 118994
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          14994 non-null  object 
 1   Mileage       14994 non-null  int64  
 2   Dealer Name   14994 non-null  object 
 3   Rating        14994 non-null  float64
 4   Rating Count  14994 non-null  int64  
 5   Price         14994 non-null  float64
 6   Make          14994 non-null  object 
 7   Year          14994 non-null  int64  
 8   Join          14994 non-null  int64  
 9   join          14994 non-null  int64  
 10  Make-Model    0 non-null      object 
dtypes: float64(2), int64(5), object(4)
memory usage: 1.4+ MB


In [88]:
percent_null = df_transform[df_transform['Make-Model'].isna()]['Make'].value_counts() * 100 / df_transform['Make'].value_counts()
percent_null[~percent_null.isna()].sort_values(ascending=False)

Bmw              39.001122
Gmc              35.266221
Ford             34.450283
Chevrolet        26.356108
Rolls Royce      21.290323
Toyota           18.180016
Mercedes Benz    13.743729
Aston Martin     12.790698
Nissan            9.816741
Volkswagen        3.923297
Mini              2.904564
Kia               1.768415
Jeep              0.652277
Name: Make, dtype: float64

In [89]:
df_missing[df_missing['Make'] == 'Bmw']['Name'].value_counts()

 BMW 330                        393
 BMW 330 i                      279
 BMW 330 i xDrive               260
 BMW 530                        159
 BMW 530 i xDrive               138
                               ... 
 BMW 650 Gran Coupe i xDrive      1
 BMW 640 Gran Coupe i xDrive      1
 BMW ALPINA XB7                   1
 BMW 650 i                        1
 BMW 650 i xDrive                 1
Name: Name, Length: 78, dtype: int64

In [94]:
df_makes_models[df_makes_models['Make'] == 'BMW']['Model'].value_counts()

3 Series     30
7 Series     29
5 Series     27
X5           22
M3           21
X3           18
M5           17
Z4           15
6 Series     15
M6           13
X6           12
X6 M         10
X5 M          8
X1            8
8 Series      7
Z3            7
i3            6
1 Series      6
4 Series      6
2 Series      6
X4            5
i8            5
M4            5
M2            4
Z8            4
X7            3
Z4 M          3
M             3
X2            2
Alpina B7     2
Name: Model, dtype: int64

In [78]:
df_makes_models[df_makes_models['Make'] == 'Ram']['Model'].value_counts()

1500 Crew Cab               11
2500 Crew Cab               11
3500 Crew Cab               10
2500 Mega Cab               10
3500 Regular Cab             9
1500 Quad Cab                9
2500 Regular Cab             9
3500 Mega Cab                9
1500 Regular Cab             8
ProMaster Window Van         6
ProMaster City               6
ProMaster Cargo Van          6
C/V Tradesman                3
1500 Classic Regular Cab     1
1500 Classic Crew Cab        1
1500 Classic Quad Cab        1
ProMaster 3500 Cargo         1
ProMaster 2500 Cargo         1
ProMaster 1500 Cargo         1
C/V                          1
Dakota Crew Cab              1
Dakota Extended Cab          1
Name: Model, dtype: int64

In [10]:
df_transform[df_transform['Name'].str.contains('Acura RLX')]

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Rating Count,Price,Make,Year,Join,join
604,Acura RLX Sport Hybrid Advance Package,28549,Gary Force Acura,4.5,105,46995.0,Acura,2019,1,1
619,Acura RLX Sport Hybrid Advance Package,43187,McGrath Acura in Morton Grove,4.6,686,44000.0,Acura,2020,1,1
685,Acura RLX Sport Hybrid Advance Package,16881,Columbia Acura,3.9,39,48995.0,Acura,2020,1,1
725,Acura RLX w/Technology Pkg,34876,Hertrich Acura,4.5,343,40998.0,Acura,2020,1,1
865,Acura RLX Sport Hybrid Advance Package,22917,Ed Napleton Acura Kia,4.1,572,44495.0,Acura,2020,1,1
935,Acura RLX FWD w/Technology Package,18885,Southern Motors Acura,3.3,24,41654.0,Acura,2020,1,1
1115,Acura RLX FWD w/Technology Package,49825,Precision Acura,4.8,1477,37995.0,Acura,2019,1,1
1220,Acura RLX Technology Package,34976,Southern Motors Acura,3.3,24,36562.0,Acura,2018,1,1
1360,Acura RLX Sport Hybrid,21590,Hinshaw's Acura,4.7,20,51999.0,Acura,2019,1,1
1457,Acura RLX Sport Hybrid,19285,Acura of Chattanooga,4.8,14,49900.0,Acura,2020,1,1
