In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the raw listings and give a few ambiguous columns more descriptive names.
# Note that attribute access `df.size` returns pandas' element count, not a column,
# so a column called "size" could never be reached that way.
column_renames = {
    "year": "entry_year",
    "title_status": "vehicle_status",
    "size": "vehicle_size",
    "type": "vehicle_type",
}
cars = pd.read_csv("./vehicles.csv").rename(columns=column_renames)
cars.head()

## Cleaning the data
### Removing duplicates and irrelevant columns

In [None]:
# Columns discarded before de-duplicating. Rows that differ only in these
# columns become exact duplicates once they are gone and are collapsed below.
irrelevant_columns = [
    "id", "url", "region", "region_url", "VIN", "image_url",
    "description", "county", "state", "lat", "long", "posting_date",
]

print(f"Length before removing duplicates: {len(cars)}")

clean_cars = cars.drop(columns=irrelevant_columns).drop_duplicates()

print(f"Length after removing duplicates: {len(clean_cars)}")

### Dealing with missing values

In [None]:
def print_null_values_count_per_column(dataframe):
    """Print the number of null values in every column of ``dataframe``.

    One line is printed per column in the form ``nulls in <column>: <count>``;
    lines are separated by a comma and a newline, with no trailing comma.

    Args:
        dataframe: the pandas DataFrame to inspect.
    """
    # A single vectorised pass counts the nulls of all columns at once instead of
    # filtering the whole frame once per column.
    null_counts = dataframe.isnull().sum()
    report_lines = [f"nulls in {column}: {count}" for column, count in null_counts.items()]
    print(",\n".join(report_lines))

# Null counts per column after de-duplication, to decide how each column is handled.
print_null_values_count_per_column(clean_cars)

In [None]:
print(f"Length before removing rows with missing values: {len(clean_cars)}")

# Car model is essential for predicting price, thus null values are dropped.
# Year and odometer nulls are difficult to fill; since there are few of them
# they are dropped as well.
no_nulls = clean_cars.dropna(subset=["model", "entry_year", "odometer"]).copy()

# For columns with few null values, merge them into the most common category;
# otherwise place them in their own "unknown" group.
fill_values = {
    "manufacturer": "unknown",
    "condition": "unknown",
    "cylinders": "unknown",
    "fuel": "gas",
    "vehicle_status": "clean",
    "transmission": "automatic",
    "drive": "unknown",
    "vehicle_size": "unknown",
    "vehicle_type": "unknown",
    "paint_color": "unknown",
}
no_nulls = no_nulls.fillna(fill_values)

print(f"Length after removing rows with missing values: {len(no_nulls)}")

In [None]:
# Re-check the null counts after the rows were dropped and the columns filled.
print_null_values_count_per_column(no_nulls)

### Removing all rows that describe the same car but with a different price

In [None]:
print(f"Length before removing same cars different price: {len(no_nulls)}")

# A row is ambiguous when at least one other row matches it on every column except
# price. keep=False marks every member of such a group, so all of them are removed
# rather than one being kept.
is_ambiguous = no_nulls.drop(columns="price").duplicated(keep=False)
no_nulls = no_nulls.loc[~is_ambiguous]

print(f"Length after removing same cars different price: {len(no_nulls)}")

In [None]:
# Peek at listings priced above 1M$ before treating them as outliers.
no_nulls.query("price > 1000000").head()

## Removing outliers

In [None]:
# Keep only listings priced under 1M$ because big prices mess with the histogram below.
# The rows are filtered out instead of assigning a filtered price Series back to the
# column: index alignment would turn the excluded prices into NaN, keep those rows in
# the frame and silently convert the price column to float.
no_outliers = no_nulls[no_nulls.price < 1000000].copy()

In [None]:
# Create a histogram of every column that could have outliers to see which ones have outliers.
# Two vertical lines per subplot mark mean ± 3 standard deviations, the bounds used
# for eliminating outliers.
# Note: these statistics are computed on the current frame for every column, whereas the
# removal cell filters column by column, so the bounds it uses for later columns can
# differ slightly from the ones drawn here.
columns_used_for_checking_outliers = ["price", "entry_year", "odometer"]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))
fig.subplots_adjust(hspace=0.9, wspace=0.2)

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    column_values = no_outliers[column_name]
    ax.hist(column_values, bins=75, rwidth=0.8)

    mean = column_values.mean()
    standard_deviation = column_values.std()
    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    ax.axvline(x=lower_bound, color='b')
    ax.axvline(x=upper_bound, color='b')

    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    if column_name == "entry_year":
        ax.set_title(f"Distribution of {column_name}")
    else:
        # price and odometer are heavily skewed, so their counts are shown on a log scale
        ax.set_yscale("log")
        ax.set_title(f"Distribution of {column_name} (logarithmic scale)")

    # Applied per axes: plt.ticklabel_format only touches the current (last) subplot,
    # which left the other x axes in offset / scientific notation.
    ax.ticklabel_format(style='plain', axis='x')

plt.show()

In [None]:
print(f"Length before removing outliers: {len(no_nulls)}\n")

columns_used_for_removing_outliers = ["price", "entry_year", "odometer"]

# Columns are filtered one after another, so each column's mean and standard deviation
# are computed on the rows that survived the previous columns' filters.
for column_name in columns_used_for_removing_outliers:
    column_values = no_outliers[column_name]

    mean = column_values.mean()
    standard_deviation = column_values.std()
    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    is_outlier = column_values.lt(lower_bound) | column_values.gt(upper_bound)
    percentage_removed = round((is_outlier.sum() / len(no_outliers)) * 100, 2)

    print(f"For column {column_name}, removing a percentage of {percentage_removed}% values.")
    # between() is inclusive on both ends and False for NaN, like the explicit comparison
    no_outliers = no_outliers[column_values.between(lower_bound, upper_bound)]

print(f"\nLength after removing outliers: {len(no_outliers)}")

In [None]:
# Re-plot the same three distributions after outlier removal (linear scale, fewer bins).
columns_used_for_checking_outliers = ["price", "entry_year", "odometer"]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))
fig.subplots_adjust(hspace=0.9, wspace=0.2)

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    ax.hist(no_outliers[column_name], bins=25, rwidth=0.8)

    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    ax.set_title(f"Distribution of {column_name}")
    # Applied per axes: plt.ticklabel_format only touches the current (last) subplot.
    ax.ticklabel_format(style='plain', axis='x')

plt.show()

## Feature engineering

### Changing string columns to numerical columns where possible

In [None]:
# Ordered categories are mapped to integers that follow their natural order;
# "unknown" (the placeholder used for missing values) is encoded as -1.
# Series.map yields NaN for any value that is not listed in the mapping.
ordinal_encodings = {
    "condition": {
        "unknown": -1,
        "salvage": 0,
        "fair": 1,
        "good": 2,
        "excellent": 3,
        "like new": 4,
        "new": 5,
    },
    "cylinders": {
        "unknown": -1,
        "other": 0,
        "3 cylinders": 3,
        "4 cylinders": 4,
        "5 cylinders": 5,
        "6 cylinders": 6,
        "8 cylinders": 8,
        "10 cylinders": 10,
        "12 cylinders": 12,
    },
    "vehicle_size": {
        "unknown": -1,
        "sub-compact": 0,
        "compact": 1,
        "mid-size": 2,
        "full-size": 3,
    },
}

final_df = no_outliers.copy()
for encoded_column, encoding in ordinal_encodings.items():
    final_df[encoded_column] = final_df[encoded_column].map(encoding)

# Whole-number quantities are stored as integers.
final_df = final_df.astype({"price": int, "entry_year": int, "odometer": int})

final_df.head()

### Erasing models that don't appear often

In [None]:
# Keep only the rows whose model appears at least 10 times in the data.
model_counts = final_df["model"].value_counts()
values_to_keep = model_counts.index[model_counts.ge(10)]
final_df = final_df.loc[final_df["model"].isin(values_to_keep)]

final_df["model"].value_counts()

In [None]:
# Rows whose vehicle_size is null, i.e. values the ordinal mapping did not cover.
final_df.loc[final_df["vehicle_size"].isnull(), "vehicle_size"]

In [None]:
# Columns available after outlier removal.
no_outliers.columns

In [None]:
# Number of rows left after feature engineering.
final_df.shape[0]

In [None]:
# Model frequencies before the rare-model filter, for comparison.
no_outliers["model"].value_counts()

In [None]:
# Manufacturer frequencies in the engineered frame.
final_df["manufacturer"].value_counts()

In [None]:
# NOTE(review): this cell repeats the previous manufacturer count — candidate for removal.
final_df.manufacturer.value_counts()

In [None]:
# Models with fewer than 1000 listings are blanked out so that one-hot encoding only
# creates columns for the frequent models.
# NOTE(review): this starts again from `no_nulls`, not from the outlier-filtered frame —
# confirm that skipping the outlier removal is intended.
remove_useless = no_nulls.copy()
model_counts = remove_useless.model.value_counts()
values_to_replace = model_counts[model_counts < 1000].index
remove_useless.loc[remove_useless.model.isin(values_to_replace), 'model'] = np.nan
# The former rename of 'size' to 'size1' was dropped: the column is already called
# 'vehicle_size' after loading, so the rename never matched anything.
remove_useless.model.value_counts()

In [None]:
# One-hot encode every categorical column, dropping the first level of each to avoid
# redundant columns. The names used are the ones assigned when the data was loaded
# (vehicle_status / vehicle_size / vehicle_type); the previous title_status / size1 /
# type references no longer exist and raised AttributeError.
# Passing `columns=` prefixes each dummy with its source column, which keeps names unique
# (several features share levels such as "unknown" and "other"), and removes the
# original categorical columns in the same step.
categorical_columns = [
    "manufacturer", "model", "condition", "cylinders", "fuel", "vehicle_status",
    "transmission", "drive", "vehicle_size", "vehicle_type", "paint_color",
]
final_df = pd.get_dummies(remove_useless, columns=categorical_columns, drop_first=True)
final_df.head()

In [None]:
# Work on the first 5000 rows only (positional slice, not a random sample).
final_df = final_df.iloc[:5000]

In [None]:
# Persist the encoded sample to disk; with the default arguments the index is written
# as the first CSV column.
final_df.to_csv("please_god.csv")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.svm import SVR

# Features are every column except the target price.
X = final_df.drop("price", axis=1)
y = final_df.price

# Each candidate is a scaling + regressor pipeline plus the hyper-parameter grid to search.
models = {
  'linear_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', linear_model.LinearRegression())
    ],
    'params': {}
  },
  # key spelling fixed (was 'suppor_vector_regression'); it is the label shown in the results
  'support_vector_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', SVR())
    ],
    'params': {
        'regressor__kernel': ['linear'],
        'regressor__C': [10],
        'regressor__epsilon': [0.1],
        'regressor__gamma': ['auto']
    }
  }
}

scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    # 5-fold cross-validated search over the grid
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)

    grid_search.fit(X, y)

    scores.append({
        'model': model_name,
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_
    })

# One row per candidate with its best cross-validated score and parameters.
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

## Feature engineering

### Extracting meaningful words from name column

In [None]:
# NOTE(review): `no_outliers_df` is not defined in any cell visible above (the earlier
# cells build `no_outliers`) — confirm where this frame comes from.
name_meaningful_words_df = no_outliers_df.copy()
name_meaningful_words_df["name"] = name_meaningful_words_df["name"].str.lower()

# Extract number of bedrooms in a separate column: digits followed by br / bd / bed,
# optionally separated by whitespace, "-" or "+". The pattern is a raw string so the
# backslash escapes reach the regex engine without invalid-escape warnings.
bedroom_counts = name_meaningful_words_df["name"].str.extract(r'(\d+)\s*-?\+?(?:br|bd|bed)', expand=False)

# Store the count as an integer (0 when the name does not mention it) instead of a
# mix of digit strings and the integer fill value.
name_meaningful_words_df["nr_of_bedrooms"] = pd.to_numeric(bedroom_counts).fillna(0).astype(int)
name_meaningful_words_df["has_nr_of_bedrooms"] = bedroom_counts.notnull()

def most_frequent_words_in_name_column(dataframe, min_frequency=1000):
    """Count the informative words that occur often in the ``name`` column.

    Names are split on runs of non-word characters. A word is kept when it occurs at
    least ``min_frequency`` times, is purely alphabetic and is not in the stop list
    below.

    Args:
        dataframe: DataFrame with a string ``name`` column.
        min_frequency: minimum number of occurrences for a word to be reported.
            Defaults to 1000, the threshold that used to be hard-coded.

    Returns:
        A Series indexed by word with the occurrence counts, most frequent first.
    """
    # filter anything that is not helpful, or any words that are related to location
    filtered_strings = {"in", "private", "to", "bedroom", "the", "room", "home", "with", "apartment", "near", "and", "house", "of", 
    "a", "from", "bed", "apt", "bath", "close", "view", "suite", "on", "walk", "location", "for", "east", "by", "at", "hollywood", 
    "city", "one", "austin", "unit", "br", "nr", "west", "s", "d", "dt", "no", "bd", "b", "la", "full", "brooklyn", "san", "brand", 
    "hill", "steps", "minutes", "bathroom", "strip", "vegas", "manhattan", "prime", "las", "hills", "south", "mins", "hot", "guest", 
    "two", "free", "min", "hotel", "nyc", "place", "w", "floor", "stay", "nashville", "space", "shared", "away", "north", "entire", 
    "beds", "sleeps", "bay", "williamsburg", "side", "living", "kitchen", "rental", "square", "district", "located", "street", "area", "your", 
    "upper", "neighborhood", "pet", "st", "bdrm", "venice", "los", "style", "santa", "bedrooms", "blocks", "heights", "diego", 
    "block", "next", "miles", "capitol", "long", "centre", "dtla", "top", "broadway", "seattle", "mission", "all"}

    # explode() yields one row per word without first building the wide, mostly empty
    # frame that expand=True + stack() would; missing names are ignored by value_counts().
    word_frequencies = dataframe.name.str.split('\\W+').explode().value_counts()

    is_frequent = word_frequencies >= min_frequency
    is_alphabetic = word_frequencies.index.str.isalpha()
    is_informative = ~word_frequencies.index.isin(filtered_strings)

    return word_frequencies[is_frequent & is_alphabetic & is_informative]

# Candidate keywords, used to hand-pick the groups defined in the next cell.
most_frequent_words_in_name_column(name_meaningful_words_df)

In [None]:
# Frequent words have to be manually extracted because similar words were grouped together
# to reduce the number of columns, and some were removed due to not being useful.
# Each entry is [output column name, [words that set the column to True]].
# Spelling fixes: "downton" could never match "downtown", and the "luxurios" column
# name was misspelled.
meaningful_words_with_column_name = [
    ["peaceful", ["quiet", "oasis", "retreat", "getaway", "peaceful"]],
    ["luxurious", ["luxury", "luxurious", "resort", "deluxe"]],
    ["lovely", ["lovely", "charming", "cute", "nice"]],
    ["spacious", ["spacious", "large", "huge", "big"]],
    ["comfortable", ["cozy", "comfy", "comfortable"]],
    ["central", ["downtown", "heart", "central", "midtown"]],
    ["amazing", ["great", "amazing", "best", "perfect", "paradise"]],
    ["beautiful", ["beautiful", "gorgeous", "stunning"]],
    ["bright", ["sunny", "bright", "cheerful"]],
    ["renovated", ["renovated", "remodeled", "newly"]],
    ["backyard", ["garden", "backyard", "yard"]],
    ["beach", ["beach", "ocean"]],
    ["pool", ["pool", "tub"]],
    ["patio", ["deck", "patio"]],
    ["stylish", ["stylish", "chic"]],
    ["convenient", ["convenient", "everything"]],
    ["townhouse", ["townhouse", "townhome"]],
    ["urban", ["urban", "town"]],
    ["modern", ["modern", "new"]],
    ["village", ["village"]],
    ["cottage", ["cottage"]],
    ["studio", ["studio"]],
    ["condo", ["condo"]],
    ["bungalow", ["bungalow"]],
    ["villa", ["villa"]],
    ["penthouse", ["penthouse"]],
    ["duplex", ["duplex"]],
    ["loft", ["loft"]],
    ["king", ["king"]],
    ["queen", ["queen"]],
    ["master", ["master"]],
    ["gym", ["gym"]],
    ["entrance", ["entrance"]],
    ["balcony", ["balcony"]],
    ["rooftop", ["rooftop"]],
    ["mid", ["mid"]],
    ["victorian", ["victorian"]],
    ["historic", ["historic"]],
    ["gem", ["gem"]],
    ["clean", ["clean"]],
    ["furnished", ["furnished"]],
    ["family", ["family"]],
    ["friendly", ["friendly"]],
    ["views", ["views"]],
    ["valley", ["valley"]],
    ["park", ["park"]],
    ["lake", ["lake"]],
    ["waterfront", ["waterfront"]],
    ["vacation", ["vacation"]],
    ["spa", ["spa"]],
    ["heated", ["heated"]],
    ["airport", ["airport"]],
    ["parking", ["parking"]],
    ["wifi", ["wifi"]]
]

# For each meaningful word group (column_name) create a boolean column in the dataframe.
# Every name is split once, with delimiters being whitespace or special characters, into a
# set of words; the column is True when any word of the group (values_to_search) is in
# that set, otherwise False.
# The split is hoisted out of the loop because it does not depend on the group, and a
# missing name is treated as containing no words instead of raising a TypeError.
name_word_sets = name_meaningful_words_df.name.fillna("").str.split('\\W+').apply(set)

for column_name, values_to_search in meaningful_words_with_column_name:
    name_meaningful_words_df[column_name] = name_word_sets.apply(
        lambda words_in_name: not words_in_name.isdisjoint(values_to_search)
    )

# The raw name is no longer needed once the keyword flags exist.
name_meaningful_words_df = name_meaningful_words_df.drop("name", axis=1)
name_meaningful_words_df.head()

### Checking if city column needs any modifications

In [None]:
# There aren't too many cities, so the values will remain unmodified until they are one-hot encoded
# NOTE(review): the following cell nevertheless groups cities with fewer than 5000 rows
# into 'Other' — confirm which of the two is intended.
name_meaningful_words_df.city.value_counts()

In [None]:
# Cities with fewer than 5000 rows are merged into a single 'Other' category.
reduced_frequency_values_removed_df = name_meaningful_words_df.copy()
city_counts = reduced_frequency_values_removed_df["city"].value_counts()
values_to_replace = city_counts.index[city_counts < 5000]
is_rare_city = reduced_frequency_values_removed_df["city"].isin(values_to_replace)
reduced_frequency_values_removed_df["city"] = reduced_frequency_values_removed_df["city"].mask(is_rare_city, 'Other')
reduced_frequency_values_removed_df["city"].value_counts()

In [None]:
# There are many neighbourhoods with a frequency of one; every neighbourhood with a
# frequency < 1000 will be converted to 'Other' in the next cell.
name_meaningful_words_df.neighbourhood.value_counts()

In [None]:
# Neighbourhoods with fewer than 1000 rows are merged into a single 'Other' category.
neighbourhood_counts = reduced_frequency_values_removed_df["neighbourhood"].value_counts()
values_to_replace = neighbourhood_counts.index[neighbourhood_counts < 1000]
is_rare_neighbourhood = reduced_frequency_values_removed_df["neighbourhood"].isin(values_to_replace)
reduced_frequency_values_removed_df["neighbourhood"] = reduced_frequency_values_removed_df["neighbourhood"].mask(is_rare_neighbourhood, 'Other')
reduced_frequency_values_removed_df["neighbourhood"].value_counts()

### One-hot encoding

In [None]:
# One-hot encoding the room_type, city and neighbourhood columns, dropping the first
# level of each. Passing `columns=` prefixes every dummy with its source column: city and
# neighbourhood both contain an 'Other' level, which previously produced two columns with
# the same name. The encoded source columns are removed in the same step.
final_df = pd.get_dummies(
    reduced_frequency_values_removed_df,
    columns=["room_type", "neighbourhood", "city"],
    drop_first=True,
)
final_df.head()

In [None]:
# Work on the first 10000 rows only (positional slice, not a random sample).
final_df = final_df.iloc[:10000]

In [None]:
# Number of rows in the truncated frame.
final_df.shape[0]

In [None]:
# Replace the nightly price with one of four ordered classes:
# (-1, 90] -> 0, (90, 150] -> 1, (150, 257] -> 2, (257, inf) -> 3.
# The column is overwritten in place, so this cell should be run only once per frame.
price_bin_edges = [-1, 90, 150, 257, float('inf')]
price_bin_labels = [0, 1, 2, 3]
final_df['price_per_night'] = pd.cut(
    final_df['price_per_night'],
    bins=price_bin_edges,
    labels=price_bin_labels,
)
final_df.head()

In [None]:
# Persist the binned frame; with the default arguments the index is written as the
# first CSV column.
final_df.to_csv("df_full_size.csv")

In [None]:
# How many listings mention a bedroom count in their name.
int(final_df["has_nr_of_bedrooms"].sum())

In [None]:
# Second export of the same frame under a different file name.
final_df.to_csv("loosing_my_neurons.csv")

In [None]:
# Rows whose price fell outside every bin (pd.cut leaves those as NaN).
# The column is `price_per_night`; no `price_per_night1` column is created in the
# visible cells, so the previous attribute access raised AttributeError.
final_df[final_df.price_per_night.isnull()]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.svm import SVC

# Features are every column except the binned price class, which is the target.
X = final_df.drop("price_per_night", axis=1)
y = final_df.price_per_night

# The candidate is a classifier (LogisticRegression), so it is reported as
# 'logistic_regression'; it was previously mislabelled 'linear_regression'.
models = {
  'logistic_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', linear_model.LogisticRegression(max_iter=10000))
    ],
    'params': {}
  }
}

scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    # 5-fold cross-validated search over the (empty) grid
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)

    grid_search.fit(X, y)

    scores.append({
        'model': model_name,
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_
    })

# One row per candidate with its best cross-validated score and parameters.
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.svm import SVR

# Target is price_per_night; every other column is a feature.
X = final_df.drop(columns="price_per_night")
y = final_df["price_per_night"]

# A single candidate: min-max scaling followed by ordinary linear regression.
models = {
    'linear_regression': {
        'steps': [
            ('scaler', MinMaxScaler()),
            ('regressor', linear_model.LinearRegression()),
        ],
        'params': {},
    },
}

scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(
        pipeline,
        options["params"],
        cv=5,
        return_train_score=False,
        verbose=4,
    )
    grid_search.fit(X, y)

    scores.append(dict(
        model=model_name,
        best_score=grid_search.best_score_,
        best_params=grid_search.best_params_,
    ))

# One row per candidate with its best cross-validated score and parameters.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

## Testing ML models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.svm import SVR

# Features are every column except the target price_per_night.
X = final_df.drop("price_per_night", axis=1)
y = final_df.price_per_night

# Each candidate is a scaling + regressor pipeline plus the hyper-parameter grid to search.
models = {
  'linear_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', linear_model.LinearRegression())
    ],
    'params': {}
  },
  # key spelling fixed (was 'suppor_vector_regression'); it is the label shown in the results
  'support_vector_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', SVR())
    ],
    'params': {
        'regressor__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'regressor__C': [0.1, 10],
        'regressor__epsilon': [0.1, 0.001],
        'regressor__gamma': ['scale', 'auto']
    }
  }
}

scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    # 5-fold cross-validated search over every combination in the grid
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)

    grid_search.fit(X, y)

    scores.append({
        'model': model_name,
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_
    })

# One row per candidate with its best cross-validated score and parameters.
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df