In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)


In [2]:
# Data Augmentation

import random
import datetime

def augment_data(df, num_augmented_samples=1000, random_state=None):
    """Return ``df`` plus ``num_augmented_samples`` jittered copies of its rows.

    The month column is first normalised: month names (any letter case) become
    1-12, numeric strings become numbers, and everything else becomes NaN.
    Rows are then drawn with replacement and perturbed:

    * ``odometer_reading``             scaled by a factor drawn from [0.9, 1.1)
    * ``month_of_vehicle_manufacture`` shifted by -1, 0 or +1, clamped to 1..12
    * ``year_of_vehicle_manufacture``  shifted by -1, 0 or +1, clamped to
      1980..current calendar year
    * ``car_valuation``                scaled by a factor drawn from [0.95, 1.05)

    Missing values stay missing. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns listed above.
    num_augmented_samples : int, default 1000
        Number of synthetic rows to append; values <= 0 append nothing.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for reproducible output; None draws fresh entropy.

    Returns
    -------
    pandas.DataFrame
        The normalised original rows followed by the synthetic rows, with a
        fresh 0..n-1 index.
    """
    month_to_number = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
        'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
    }

    # Work on a copy so the caller's frame keeps its original month values
    base = df.copy()
    month_values = base['month_of_vehicle_manufacture'].apply(
        lambda x: month_to_number.get(x.lower(), x) if isinstance(x, str) else x
    )
    base['month_of_vehicle_manufacture'] = pd.to_numeric(month_values, errors='coerce')

    # Nothing to sample from, or nothing requested: return the normalised rows
    if num_augmented_samples <= 0 or base.empty:
        return base.reset_index(drop=True)

    rng = np.random.default_rng(random_state)
    current_year = datetime.datetime.now().year

    # Draw all source rows at once (with replacement); taking them by position
    # keeps each column's dtype, unlike building the frame row by row
    picks = rng.integers(0, len(base), size=num_augmented_samples)
    augmented = base.iloc[picks].reset_index(drop=True)
    n_rows = len(augmented)

    # NaN propagates through the arithmetic and clip, so missing values stay missing
    augmented['odometer_reading'] = (
        augmented['odometer_reading'] * rng.uniform(0.9, 1.1, size=n_rows)
    )
    augmented['month_of_vehicle_manufacture'] = (
        augmented['month_of_vehicle_manufacture'] + rng.integers(-1, 2, size=n_rows)
    ).clip(lower=1, upper=12)
    augmented['year_of_vehicle_manufacture'] = (
        augmented['year_of_vehicle_manufacture'] + rng.integers(-1, 2, size=n_rows)
    ).clip(lower=1980, upper=current_year)

    # Add some noise to the 'car_valuation'
    augmented['car_valuation'] = (
        augmented['car_valuation'] * rng.uniform(0.95, 1.05, size=n_rows)
    )

    return pd.concat([base, augmented], ignore_index=True)

# Read the raw training data and lower-case its column names
train_df = pd.read_csv('train.csv').rename(str.lower, axis='columns')

# Append 5000 synthetic rows, then persist the combined frame for later cells
augmented_train_df = augment_data(train_df, 5000)
augmented_train_df.to_csv('augmented_train1.csv', index=False)


In [3]:

# Training data comes from the augmented CSV; the raw file is the alternative:
# train_df = pd.read_csv('train.csv').rename(columns=str.lower)
train_df, test_df = (
    pd.read_csv(csv_path).rename(columns=str.lower)
    for csv_path in ('augmented_train1.csv', 'test.csv')
)


In [4]:
# Define the month to number dictionary
month_to_number = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
    'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
}

# Apply the same clean-up to both frames. Each column is assigned back to its
# frame instead of calling `fillna(..., inplace=True)` on a column selection:
# that chained in-place form is deprecated in pandas and, with Copy-on-Write
# enabled, leaves the frame itself unchanged.
for frame in (train_df, test_df):
    # Month names become 1-12; other values pass through and are then coerced,
    # so anything that is not numeric ends up as NaN
    month_values = frame['month_of_vehicle_manufacture'].apply(
        lambda x: month_to_number.get(x.lower(), x) if isinstance(x, str) else x
    )
    frame['month_of_vehicle_manufacture'] = pd.to_numeric(month_values, errors='coerce')

    # Missing colour defaults to 'White'; missing accident flag defaults to 0
    frame['registered_color'] = frame['registered_color'].fillna('White')
    frame['accidental_vehicle'] = frame['accidental_vehicle'].fillna(0)



In [5]:
# Missing odometer readings become 0. Assign the result back to the frame:
# `fillna(..., inplace=True)` on a column selection is deprecated chained
# assignment and does not update the frame when Copy-on-Write is enabled.
# NOTE(review): after this fill no reading is missing, so the per-model median
# imputation in the feature-engineering cell has nothing left to fill --
# confirm which of the two strategies is intended.
train_df['odometer_reading'] = train_df['odometer_reading'].fillna(0)
test_df['odometer_reading'] = test_df['odometer_reading'].fillna(0)

In [6]:

# Define lists for luxury car makes, variants, and models
# NOTE(review): variants/models are matched by substring, so the one-character
# entries 'S' and '7' in luxury_models match any model name containing those
# characters -- confirm this is intended.
luxury_makes = ['Mercedes-Benz', 'BMW', 'Audi', 'Lexus', 'Jaguar', 'Porsche', 'Land Rover']
luxury_variants = ['S-Class', '7-Series', 'A8', 'LS', 'XJ', 'Panamera', 'Range Rover']
luxury_models = ['S', '7', 'A8', 'LS', 'XJ', 'Panamera', 'Range Rover']

# Define lists for commercial vehicle makes and models
commercial_makes = ['Tata', 'Mahindra', 'Ashok Leyland', 'Force Motors']
commercial_models = ['Eeco', 'Bolero', 'Innova', 'Tavera', 'Sumo']


def _contains_any(value, fragments):
    """Return True when ``value`` is a string containing any of ``fragments``.

    Non-string values (e.g. NaN for a missing cell) never match, instead of
    raising ``TypeError`` from the ``in`` test.
    """
    return isinstance(value, str) and any(fragment in value for fragment in fragments)


def _flag_vehicles(frame, makes, model_fragments, variant_fragments=()):
    """Return a 0/1 Series marking rows that match a vehicle category.

    A row matches when its ``vehicle_make`` is exactly one of ``makes``, its
    ``vehicle_model`` contains one of ``model_fragments``, or (only when
    ``variant_fragments`` is non-empty) its ``car_variant`` contains one of
    ``variant_fragments``.
    """
    matched = frame['vehicle_make'].isin(makes)
    matched = matched | frame['vehicle_model'].map(
        lambda value: _contains_any(value, model_fragments)
    ).astype(bool)
    if variant_fragments:
        matched = matched | frame['car_variant'].map(
            lambda value: _contains_any(value, variant_fragments)
        ).astype(bool)
    return matched.astype(int)


# Create the new features 'is_luxury' and 'is_commercial' on both frames
for frame in (train_df, test_df):
    frame['is_luxury'] = _flag_vehicles(frame, luxury_makes, luxury_models, luxury_variants)
    frame['is_commercial'] = _flag_vehicles(frame, commercial_makes, commercial_models)

# Reference date used for all age calculations
current_year = 2024
current_month = 7  # Assuming current month is July for this example

# Define vehicle age thresholds for scrappage
private_vehicle_age_threshold = 20
commercial_vehicle_age_threshold = 15

for frame in (train_df, test_df):
    # Handle missing odometer readings by imputing with the per-model median.
    # Only the missing cells are filled: assigning the groupby-transform result
    # directly would overwrite the reading with NaN for every row whose
    # 'vehicle_model' is missing, because groupby drops NA keys by default.
    model_median = frame.groupby('vehicle_model')['odometer_reading'].transform('median')
    frame['odometer_reading'] = frame['odometer_reading'].fillna(model_median)

    # Calculate vehicle age in (fractional) years relative to the reference date
    frame['vehicle_age'] = (
        current_year - frame['year_of_vehicle_manufacture']
        + (current_month - frame['month_of_vehicle_manufacture']) / 12
    )

    # Calculate mileage per year. A vehicle_age of exactly 0 makes this
    # inf (or NaN for 0/0); nothing in this cell guards against that.
    frame['mileage_per_year'] = frame['odometer_reading'] / frame['vehicle_age']

    # Age at sale (whole calendar years)
    frame['age_at_sale'] = current_year - frame['year_of_vehicle_manufacture']

    # Is the car new?
    frame['is_new'] = (frame['age_at_sale'] < 3).astype(int)

    # Create a new feature 'years_until_scrappage': the private threshold
    # applies when is_commercial == 0, the commercial threshold otherwise
    scrappage_threshold = np.where(
        frame['is_commercial'] == 0,
        private_vehicle_age_threshold,
        commercial_vehicle_age_threshold,
    )
    frame['years_until_scrappage'] = scrappage_threshold - frame['vehicle_age']

    # Create a new feature 'scrappage_risk' (1 once the threshold is reached;
    # a NaN value compares False and therefore maps to 0)
    frame['scrappage_risk'] = (frame['years_until_scrappage'] <= 0).astype(int)

# City tier membership lists; a city missing from every list counts as 'other'
city_tiers = {
    'metro': [
        'Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Ahmedabad', 'Chennai', 'Kolkata',
    ],
    'tier_1': [
        'Pune', 'Jaipur', 'Surat', 'Lucknow', 'Kanpur', 'Nagpur', 'Indore', 'Thane',
        'Bhopal', 'Visakhapatnam', 'Patna', 'Vadodara', 'Ghaziabad',
    ],
    'tier_2': [
        'Ludhiana', 'Agra', 'Nashik', 'Faridabad', 'Meerut', 'Rajkot', 'Kalyan-Dombivli',
        'Vasai-Virar', 'Varanasi', 'Srinagar', 'Aurangabad', 'Dhanbad', 'Amritsar',
    ],
    'other': [],  # All other cities not listed above
}

def categorize_city(city):
    """Return the name of the first tier in ``city_tiers`` that lists ``city``.

    The comparison is an exact membership test; 'other' is returned when no
    tier lists the city.
    """
    return next(
        (tier for tier, members in city_tiers.items() if city in members),
        'other',
    )

# Label every row of both frames with the tier of its 'city'
for frame in (train_df, test_df):
    frame['city_category'] = frame['city'].apply(categorize_city)

# Map a city tier label onto an ordinal rank
def city_category_to_ordinal(city_category):
    """Return 3 for 'metro', 2 for 'tier_1', 1 for 'tier_2' and 0 for anything else."""
    # Labels listed from lowest to highest rank, so enumerate yields 1, 2, 3
    for rank, label in enumerate(('tier_2', 'tier_1', 'metro'), start=1):
        if city_category == label:
            return rank
    return 0  # 'other' and any unrecognised label

# Replace the tier labels stored in 'city_category' with their ordinal ranks.
# The column is overwritten; no separate ordinal column is created.
for frame in (train_df, test_df):
    frame['city_category'] = frame['city_category'].apply(city_category_to_ordinal)


In [7]:
def round_floats(df, decimals=2):
    """Round every float64 column of ``df`` to ``decimals`` places.

    The frame is modified in place and also returned; columns of any other
    dtype are left as they are.
    """
    # select_dtypes hands back just the float64 columns as a separate frame
    float_part = df.select_dtypes(include=['float64'])
    df[float_part.columns] = float_part.round(decimals)
    return df

In [8]:
# Round the float64 columns of both frames to two decimal places
train_df, test_df = round_floats(train_df, 2), round_floats(test_df, 2)

In [9]:
def clean_df(df):
    """Replace infinite values in the float64 columns of ``df`` with the column mean.

    For each float64 column containing +inf or -inf, a message naming the
    column is printed, the infinities are turned into NaN, and every NaN in
    that column (including any that were already present) is filled with the
    mean of the remaining values. Columns without infinities are untouched.

    The frame is modified in place and also returned.
    """
    # Identify floating-point columns
    float_columns = df.select_dtypes(include=['float64']).columns

    # Check for infinite values and handle them
    for column in float_columns:
        values = df[column]
        if not np.isinf(values).any():
            continue
        print(f"Column '{column}' contains infinite values.")
        # Replace infinite values with NaN, then fill NaN with the column mean.
        # The cleaned column is assigned back to the frame: calling
        # replace/fillna with inplace=True on `df[column]` is chained
        # assignment and does not update `df` when Copy-on-Write is enabled.
        without_inf = values.replace([np.inf, -np.inf], np.nan)
        df[column] = without_inf.fillna(without_inf.mean())
    return df

# Replace infinite values in both frames (clean_df prints each affected column)
train_df, test_df = clean_df(train_df), clean_df(test_df)

Column 'mileage_per_year' contains infinite values.


In [10]:

# Select features for the model
features = [
    'car_variant', 'year_of_vehicle_manufacture', 'month_of_vehicle_manufacture',
    'odometer_reading', 'odometer_reading_present', 'vehicle_fuel_type', 'registered_color',
    'vehicle_make', 'vehicle_model', 'accidental_vehicle', 'city', 'is_luxury', 'is_commercial',
    'vehicle_age', 'mileage_per_year', 'age_at_sale', 'is_new', 'years_until_scrappage',
    'scrappage_risk', 'city_category'
]
target = 'car_valuation'

# Split features and target
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]

# Preprocessing for numerical data: median imputation followed by scaling.
# 'is_luxury' and 'is_commercial' are listed here so they actually reach the
# model: ColumnTransformer drops any column not named in one of its
# transformers (remainder='drop' is the default).
numerical_features = ['year_of_vehicle_manufacture', 'month_of_vehicle_manufacture', 'odometer_reading',
                      'odometer_reading_present', 'vehicle_age', 'mileage_per_year', 'age_at_sale',
                      'is_new', 'years_until_scrappage', 'scrappage_risk', 'city_category',
                      'is_luxury', 'is_commercial']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: most-frequent imputation, then one-hot
# encoding that ignores categories not seen during fit
categorical_features = ['car_variant', 'vehicle_fuel_type', 'registered_color', 'vehicle_make', 'vehicle_model', 'accidental_vehicle', 'city']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Guard against a selected feature being silently dropped by the ColumnTransformer
unrouted_features = [
    name for name in features
    if name not in numerical_features and name not in categorical_features
]
assert not unrouted_features, f"Features not routed to any transformer: {unrouted_features}"

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model (the final step keeps its original name 'classifier',
# although the estimator is a regressor)
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestRegressor(n_estimators=500, random_state=0))])


In [11]:

# Train the model: fit the preprocessing + random-forest pipeline on the
# training features and target. As the cell's last expression, the fitted
# pipeline's repr is shown as the cell output.
model.fit(X_train, y_train)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year_of_vehicle_manufacture',
                                                   'month_of_vehicle_manufacture',
                                                   'odometer_reading',
                                                   'odometer_reading_present',
                                                   'vehicle_age',
                                                   'mileage_per_year',
                                                   'age_at_sale', 'is_new',
                                     

In [12]:

# Score the test features with the fitted pipeline
predictions = model.predict(X_test)

# Pair each test 'id' with its predicted valuation
predictions_df = test_df[['id']].assign(car_valuation=predictions)

# Write the submission file
predictions_df.to_csv('test_predictions_500_new.csv', index=False)

In [None]:
# # Preprocessing for categorical data
# categorical_features = ['car_variant', 'vehicle_fuel_type', 'registered_color', 'vehicle_make', 'vehicle_model', 'accidental_vehicle', 'city']
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ])

# # Define the model
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('classifier', RandomForestRegressor(random_state=0))])

# # Define parameter grid for GridSearchCV
# param_grid = {
#     'classifier__n_estimators': [100, 200, 300, 400, 500],
#     'classifier__max_depth': [None, 10, 20, 30, 40, 50, 60]
# }

# # Initialize GridSearchCV
# grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)

# # Fit GridSearchCV to the training data
# grid_search.fit(X_train, y_train)

# # Print best parameters and best score
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best score: {grid_search.best_score_}")

# # Make predictions on the test set using the best estimator
# best_model = grid_search.best_estimator_
# predictions = best_model.predict(X_test)

# # Create a DataFrame to store the predictions
# predictions_df = test_df[['id']].copy()
# predictions_df['car_valuation'] = predictions


In [None]:

# # Save the predictions to a CSV file
# predictions_df.to_csv('test_predictions_best_new.csv', index=False)

In [None]:

# # Make predictions on the test set
# predictions_best = best_model.predict(X_test)

# # Create a DataFrame to store the predictions
# predictions_best_df = test_df[['id']].copy()
# predictions_best_df['car_valuation'] = predictions_best

# # Save the predictions to a CSV file
# # predictions_df.to_csv('test_predictions.csv', index=False)

In [None]:
# Bare expression: display the predictions frame as this cell's output
predictions_df