In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 1: Load the Dataset ---
# Read the rental records from the CSV file into a pandas DataFrame
# (an in-memory table holding the file's rows and columns).
df = pd.read_csv('rental_data.csv')

# --- Step 2: Convert Date Columns ---
# Run the three date columns through pd.to_datetime so that later steps can do
# date arithmetic on them (e.g., calculate rental durations).
for date_column in ('CheckOut_Date', 'Planned_Return_Date', 'CheckIn_Date'):
    df[date_column] = pd.to_datetime(df[date_column])

# --- Step 3: Initial Data Inspection ---
# .info() summarises the columns, their data types, and the non-null counts.
# It writes that report to stdout itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line.
print("--- Data Info ---")
df.info()

# .describe() reports summary statistics (count, mean, standard deviation,
# min/max, quartiles) that help spot unusual ranges or outliers.
print("\n--- Numerical Data Summary ---")
print(df.describe())


# --- Step 4: Visualize Numerical Data ---
# One histogram (30 bins, with a KDE overlay) per numerical column, each saved
# to its own PNG file in the current working directory.
sns.set_style("whitegrid")

# Each entry: (column, plot title, x-axis label, output file, bar colour).
# A colour of None leaves seaborn's default colour in place, which is what the
# Operating Hours plot uses.
histogram_specs = [
    ('Operating_Hours', 'Distribution of Operating Hours',
     'Operating Hours', 'operating_hours_distribution.png', None),
    ('Fuel_Consumed_Liters', 'Distribution of Fuel Consumed (Liters)',
     'Fuel Consumed (Liters)', 'fuel_consumed_distribution.png', 'orange'),
    ('Rental_Cost_USD', 'Distribution of Rental Cost (USD)',
     'Rental Cost (USD)', 'rental_cost_distribution.png', 'green'),
]

# The three plots differ only in the values listed above, so they share one
# plotting routine instead of three copy-pasted stanzas.
for column, title, xlabel, filename, color in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[column], kde=True, bins=30, color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    fig.savefig(filename)
    plt.close(fig)  # Close the figure to free up memory

# --- Step 5: Visualize Categorical Data ---
# Count plots for the non-numerical columns. In every plot the bars follow the
# order returned by value_counts() for that column.

# Bar Chart for Equipment Types (categories on the y-axis)
type_order = df['Type'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=df['Type'], order=type_order, ax=ax)
ax.set_title('Count of Equipment by Type')
ax.set_xlabel('Count')
ax.set_ylabel('Equipment Type')
fig.tight_layout()
fig.savefig('equipment_type_count.png')
plt.close(fig)

# Bar Chart for Rental Status (categories on the x-axis)
status_order = df['Rental_Status'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df['Rental_Status'], order=status_order, ax=ax)
ax.set_title('Count of Rentals by Status')
ax.set_xlabel('Rental Status')
ax.set_ylabel('Count')
fig.savefig('rental_status_count.png')
plt.close(fig)

# Bar Chart for GPS Location (categories on the y-axis)
location_order = df['GPS_Location'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=df['GPS_Location'], order=location_order, ax=ax)
ax.set_title('Count of Equipment by GPS Location')
ax.set_xlabel('Count')
ax.set_ylabel('GPS Location')
fig.tight_layout()
fig.savefig('equipment_location_count.png')
plt.close(fig)

# Tell the reader which image files this cell and the histogram step produced.
print("\nEDA complete! I have generated several plots as PNG files.")
print("The files are: operating_hours_distribution.png, fuel_consumed_distribution.png, rental_cost_distribution.png, equipment_type_count.png, rental_status_count.png, equipment_location_count.png")

In [2]:
import pandas as pd
import numpy as np
import os

# --- Step 1: Load and Prepare the Data ---
# The CSV is read again here so this cell does not rely on an earlier cell
# having been run; the three date columns are then converted to datetimes.
df = pd.read_csv('rental_data.csv')
for date_column in ('CheckOut_Date', 'Planned_Return_Date', 'CheckIn_Date'):
    df[date_column] = pd.to_datetime(df[date_column])


# --- Step 2: Create New Features (Feature Engineering) ---
# Actual and planned rental lengths in whole days, both measured from the
# check-out date.
df['Rental_Duration_Days'] = (df['CheckIn_Date'] - df['CheckOut_Date']).dt.days
df['Planned_Duration_Days'] = (df['Planned_Return_Date'] - df['CheckOut_Date']).dt.days

# Days the rental ran past its planned length. Rentals that ended early or on
# time give a negative or zero difference, which is floored at 0. clip() does
# this in one vectorised call instead of a Python lambda applied row by row.
df['Overdue_Days'] = (df['Rental_Duration_Days'] - df['Planned_Duration_Days']).clip(lower=0)

# Equipment age at check-out, using calendar years only (month and day are ignored).
df['Equipment_Age_Years'] = df['CheckOut_Date'].dt.year - df['Manufacture_Year']

# Share of the logged hours (operating + idle) that were spent operating.
# The 1e-6 term keeps the denominator non-zero when both hour counts are 0,
# so such rows get a rate of 0 rather than a division-by-zero result.
df['Utilization_Rate'] = df['Operating_Hours'] / (df['Operating_Hours'] + df['Idle_Hours'] + 1e-6)


# --- Step 3: Validate the New Features ---
# Print the equipment identifier next to the five engineered columns for the
# first five rows as a quick visual check.
feature_preview_columns = [
    'Equipment_ID',
    'Rental_Duration_Days',
    'Planned_Duration_Days',
    'Overdue_Days',
    'Equipment_Age_Years',
    'Utilization_Rate',
]
print("--- DataFrame with New Features (First 5 Rows) ---")
print(df[feature_preview_columns].head())


# --- Step 4: Save the Processed Data ---
# Create the destination folder if it is missing, then write the DataFrame
# there as CSV without the index column.
processed_file_path = '../data/processed/rental_data_clean.csv'
output_dir = os.path.dirname(processed_file_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(processed_file_path, index=False)

print("\nSuccessfully created 5 new features!")
print(f"The processed data has been saved to: {processed_file_path}")

--- DataFrame with New Features (First 5 Rows) ---
  Equipment_ID  Rental_Duration_Days  Planned_Duration_Days  Overdue_Days  \
0       EQ0001                     4                      6             0   
1       EQ0002                    15                     13             2   
2       EQ0003                    14                     14             0   
3       EQ0004                    14                     14             0   
4       EQ0005                    11                     11             0   

   Equipment_Age_Years  Utilization_Rate  
0                    2          0.716418  
1                    3          0.785185  
2                    7          0.847458  
3                    6          0.811594  
4                    7          0.738636  

Successfully created 5 new features!
The processed data has been saved to: ../data/processed/rental_data_clean.csv


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 1: Load the Dataset ---
# Read the rental records from the CSV file into a pandas DataFrame
# (an in-memory table holding the file's rows and columns).
df = pd.read_csv('rental_data.csv')

# --- Step 2: Convert Date Columns ---
# Run the three date columns through pd.to_datetime so that later steps can do
# date arithmetic on them (e.g., calculate rental durations).
for date_column in ('CheckOut_Date', 'Planned_Return_Date', 'CheckIn_Date'):
    df[date_column] = pd.to_datetime(df[date_column])

# --- Step 3: Initial Data Inspection ---
# .info() summarises the columns, their data types, and the non-null counts.
# It writes that report to stdout itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line.
print("--- Data Info ---")
df.info()

# .describe() reports summary statistics (count, mean, standard deviation,
# min/max, quartiles) that help spot unusual ranges or outliers.
print("\n--- Numerical Data Summary ---")
print(df.describe())


# --- Step 4: Visualize Numerical Data ---
# One histogram (30 bins, with a KDE overlay) per numerical column, each saved
# to its own PNG file in the current working directory.
sns.set_style("whitegrid")

# Each entry: (column, plot title, x-axis label, output file, bar colour).
# A colour of None leaves seaborn's default colour in place, which is what the
# Operating Hours plot uses.
histogram_specs = [
    ('Operating_Hours', 'Distribution of Operating Hours',
     'Operating Hours', 'operating_hours_distribution.png', None),
    ('Fuel_Consumed_Liters', 'Distribution of Fuel Consumed (Liters)',
     'Fuel Consumed (Liters)', 'fuel_consumed_distribution.png', 'orange'),
    ('Rental_Cost_USD', 'Distribution of Rental Cost (USD)',
     'Rental Cost (USD)', 'rental_cost_distribution.png', 'green'),
]

# The three plots differ only in the values listed above, so they share one
# plotting routine instead of three copy-pasted stanzas.
for column, title, xlabel, filename, color in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[column], kde=True, bins=30, color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    fig.savefig(filename)
    plt.close(fig)  # Close the figure to free up memory

# --- Step 5: Visualize Categorical Data ---
# Count plots for the non-numerical columns. In every plot the bars follow the
# order returned by value_counts() for that column.

# Bar Chart for Equipment Types (categories on the y-axis)
type_order = df['Type'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=df['Type'], order=type_order, ax=ax)
ax.set_title('Count of Equipment by Type')
ax.set_xlabel('Count')
ax.set_ylabel('Equipment Type')
fig.tight_layout()
fig.savefig('equipment_type_count.png')
plt.close(fig)

# Bar Chart for Rental Status (categories on the x-axis)
status_order = df['Rental_Status'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df['Rental_Status'], order=status_order, ax=ax)
ax.set_title('Count of Rentals by Status')
ax.set_xlabel('Rental Status')
ax.set_ylabel('Count')
fig.savefig('rental_status_count.png')
plt.close(fig)

# Bar Chart for GPS Location (categories on the y-axis)
location_order = df['GPS_Location'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=df['GPS_Location'], order=location_order, ax=ax)
ax.set_title('Count of Equipment by GPS Location')
ax.set_xlabel('Count')
ax.set_ylabel('GPS Location')
fig.tight_layout()
fig.savefig('equipment_location_count.png')
plt.close(fig)

# Tell the reader which image files this cell and the histogram step produced.
print("\nEDA complete! I have generated several plots as PNG files.")
print("The files are: operating_hours_distribution.png, fuel_consumed_distribution.png, rental_cost_distribution.png, equipment_type_count.png, rental_status_count.png, equipment_location_count.png")

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Equipment_ID              5000 non-null   object        
 1   Type                      5000 non-null   object        
 2   Model                     5000 non-null   object        
 3   Manufacture_Year          5000 non-null   int64         
 4   Customer_ID               5000 non-null   object        
 5   CheckOut_Date             5000 non-null   datetime64[ns]
 6   Planned_Return_Date       5000 non-null   datetime64[ns]
 7   CheckIn_Date              5000 non-null   datetime64[ns]
 8   Rental_Status             5000 non-null   object        
 9   Operating_Hours           5000 non-null   int64         
 10  Idle_Hours                5000 non-null   int64         
 11  Fuel_Consumed_Liters      5000 non-null   float64       
 12  Fu