In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy import stats
import statsmodels.api as sm

# Data Description

In [None]:
# Load the raw rate table from 'new_rate.csv' (path is relative to the working directory)
rate_df = pd.read_csv('new_rate.csv')

In [None]:
# Display basic information: column names, dtypes, non-null counts and memory usage
rate_df.info()

In [None]:
# (rows, columns) of the frame exactly as loaded, before sampling or cleaning
rate_df.shape

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the frame's columns
rate_df.describe()

In [None]:
# Count the missing entries in every column (isna is the canonical name of isnull)
missing_values = rate_df.isna().sum()
missing_values

In [None]:
# Report how many rows the loaded frame holds
total_rows = len(rate_df)
print(f"Total number of rows: {total_rows}")

In [None]:
# Down-sample to at most 200,000 rows so the later steps stay fast.
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# without replacement), so cap n at the frame's length; random_state keeps the
# draw reproducible.
rate_df = rate_df.sample(n=min(200000, len(rate_df)), random_state=42)

In [None]:
# (rows, columns) after the random down-sampling step
rate_df.shape

In [None]:
# Share of missing values per column, as a percentage.
# The mean of the boolean null mask is the fraction of nulls in each column.
missing_percentage = rate_df.isna().mean() * 100
missing_percentage

# Data Cleaning 

In [None]:
# Plot missing values: each heatmap cell is one DataFrame entry, coloured by whether it is null
plt.figure(figsize=(12, 6))
sns.heatmap(
    rate_df.isna(),
    cmap='viridis',
    cbar=False,
)
plt.title('Missing Values Heatmap')
plt.show()

In [None]:
# Drop columns with more than 70% missing values (the threshold applied by the `> 70` filter).
# missing_percentage holds per-column percentages computed in an earlier cell.
columns_to_drop = missing_percentage[missing_percentage > 70].index
rate_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Columns removed explicitly in the next cell (this rebinds columns_to_drop)
columns_to_drop = ['ImportDate', 'FederalTIN']

In [None]:
# Drop the columns named in columns_to_drop (rebinding instead of mutating in place)
rate_df = rate_df.drop(columns=columns_to_drop)

In [None]:
# Re-check the percentage of missing values per column after the column drops
missing_percentage = rate_df.isna().mean().mul(100)
missing_percentage

# Data Preprocessing 

# Business Year Handling

In [None]:
# Shift the business years forward: 2014, 2015, 2016 become 2021, 2022, 2023
business_year_mapping = {2014: 2021, 2015: 2022, 2016: 2023}
rate_df['BusinessYear'] = rate_df['BusinessYear'].replace(business_year_mapping)

# Age Data Handling

In [None]:
# 1. Remove rows where Age is "Family Option"
# .copy() makes the filtered frame an independent object, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
rate_df = rate_df[rate_df['Age'] != 'Family Option'].copy()

In [None]:
# Map the '0-20' age band to the single representative value 18
rate_df['Age'] = rate_df['Age'].replace({'0-20': 18})

In [None]:
# Replace '65 and over' with a value drawn uniformly from 66 to 70.
# The draw uses a seeded generator so the imputed ages (and every result derived
# from them) are identical on each run; the unseeded global np.random.choice
# produced a different dataset on every execution.
age_rng = np.random.default_rng(42)
rate_df['Age'] = rate_df['Age'].apply(
    lambda x: age_rng.choice([66, 67, 68, 69, 70]) if x == '65 and over' else x
)
# Convert the remaining age strings to numbers; anything unparseable becomes NaN
rate_df['Age'] = pd.to_numeric(rate_df['Age'], errors='coerce')

# Handling the IndividualTobaccoRate

In [None]:
# Fill missing values in the 'IndividualTobaccoRate' column with 0.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that form is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves rate_df unchanged under Copy-on-Write (pandas 3.0 default).
rate_df['IndividualTobaccoRate'] = rate_df['IndividualTobaccoRate'].fillna(0)

In [None]:
# Verify that missing values have been filled: the printed count should be 0
remaining_tobacco_nulls = rate_df['IndividualTobaccoRate'].isna().sum()
print(remaining_tobacco_nulls)

In [None]:
# Percentage of missing values per column after the Age and tobacco-rate handling
missing_percentage = 100 * rate_df.isna().mean()
missing_percentage

# Handling RatingAreaId

In [None]:
# Strip the literal 'Rating Area ' prefix from RatingAreaId, then convert what is
# left to a number (values that do not parse become NaN)
rate_df['RatingAreaId'] = pd.to_numeric(
    rate_df['RatingAreaId'].str.replace('Rating Area ', '', regex=False),
    errors='coerce',
)

# Find the Outliers in IndividualRate, Then Add a Premium Column (IndividualRate + IndividualTobaccoRate)

In [None]:
# Quartiles of IndividualRate: Q1 is the 25th percentile, Q3 the 75th
Q1 = rate_df['IndividualRate'].quantile(0.25)
Q3 = rate_df['IndividualRate'].quantile(0.75)

# Interquartile range and the 1.5 * IQR fences built from it
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose IndividualRate falls strictly outside either fence
below_lower_fence = rate_df['IndividualRate'] < lower_bound
above_upper_fence = rate_df['IndividualRate'] > upper_bound
outliers = rate_df[below_lower_fence | above_upper_fence]

for report_line in (
    f"Number of outliers detected: {len(outliers)}",
    f"Lower bound for outliers: {lower_bound}",
    f"Upper bound for outliers: {upper_bound}",
):
    print(report_line)

In [None]:
# Remove outliers: keep rows whose IndividualRate lies within [lower_bound, upper_bound].
# between() is inclusive on both ends, matching the original `>=` / `<=` pair
# (rows with a NaN rate fail the test and are dropped, as before).
# .copy() detaches the result so the new 'Premium' column added in the next cell
# does not trigger SettingWithCopyWarning.
rate_df = rate_df[rate_df['IndividualRate'].between(lower_bound, upper_bound)].copy()

In [None]:
# Premium = IndividualRate + IndividualTobaccoRate (element-wise sum of the two columns)
rate_df['Premium'] = rate_df['IndividualRate'].add(rate_df['IndividualTobaccoRate'])

In [None]:
# The two source columns are now folded into 'Premium'; remove them
rate_columns_to_remove = ['IndividualRate', 'IndividualTobaccoRate']
rate_df = rate_df.drop(columns=rate_columns_to_remove)

# Date Data Handling for RateEffectiveDate and RateExpirationDate (ImportDate Was Dropped Earlier)

In [None]:
# Date Handling: parse both rate-date columns from 'YYYY-MM-DD' text into datetimes
for date_column in ('RateEffectiveDate', 'RateExpirationDate'):
    rate_df[date_column] = pd.to_datetime(rate_df[date_column], format='%Y-%m-%d')

In [None]:
# Update the year in the dates
def update_year(date):
    """Shift a date from the original 2013-2016 range forward by seven years.

    The mapping is 2013 -> 2020, 2014 -> 2021, 2015 -> 2022 and 2016 -> 2023.
    2016 is included so dates stay consistent with the BusinessYear remapping
    (2014/2015/2016 -> 2021/2022/2023); without it a 2015 -> 2016 date pair
    would be shifted on one side only.

    Parameters
    ----------
    date : object with a ``year`` attribute and a ``replace(year=..., day=...)``
        method, e.g. ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    The shifted date, or ``date`` itself when its year is not in the mapping
    (this includes ``pandas.NaT``, whose ``year`` is NaN).
    """
    year_mapping = {2013: 2020, 2014: 2021, 2015: 2022, 2016: 2023}
    new_year = year_mapping.get(date.year)
    if new_year is None:
        return date
    try:
        return date.replace(year=new_year)
    except ValueError:
        # Only 29 February can fail here: 2016 is a leap year but 2023 is not,
        # so fall back to the last valid day of that month.
        return date.replace(year=new_year, day=28)

In [None]:
# Apply the year shift element-wise to both rate-date columns
for date_column in ['RateEffectiveDate', 'RateExpirationDate']:
    rate_df[date_column] = rate_df[date_column].apply(update_year)

In [None]:
# Keep only rows in which both rate dates are present (a missing value in either drops the row)
required_date_columns = ['RateEffectiveDate', 'RateExpirationDate']
rate_df = rate_df.dropna(subset=required_date_columns)

In [None]:
# RateDuration: whole days between the effective date and the expiration date
rate_df['RateDuration'] = (
    rate_df['RateExpirationDate']
    .sub(rate_df['RateEffectiveDate'])
    .dt.days
)

In [None]:
# The raw date columns are summarised by 'RateDuration'; remove them
rate_df = rate_df.drop(columns=['RateEffectiveDate', 'RateExpirationDate'])

In [None]:
# Find the columns in which every single value is NaN, then remove them
all_nan_mask = rate_df.isna().all(axis=0)
columns_with_all_nan = rate_df.columns[all_nan_mask]
rate_df = rate_df.drop(columns=columns_with_all_nan)

In [None]:
# Find the NaN values

In [None]:
# Keep only the rows that have a value in the 'Tobacco' column
rate_df = rate_df[rate_df['Tobacco'].notna()]

In [None]:
# Re-check for columns made up entirely of NaN values and print their names
is_all_nan = rate_df.isna().all()
columns_with_all_nan = rate_df.columns[is_all_nan]
print(columns_with_all_nan)

In [None]:
# Final check of the percentage of missing values per column
null_fraction = rate_df.isna().mean()
missing_percentage = null_fraction * 100
missing_percentage

In [None]:
# Inspect the cleaned frame (the notebook renders it as the cell's output)
rate_df

# Handling Categorical Data

In [None]:
# Distinct StateCode values present after cleaning, and how many there are
unique_state_codes = rate_df['StateCode'].unique()
state_code_count = len(unique_state_codes)
print(f"Unique values in StateCode: {unique_state_codes}")
print(f"Number of unique StateCodes: {state_code_count}")

In [None]:
# Distinct Tobacco values present after cleaning, and how many there are
unique_tobacco = rate_df['Tobacco'].unique()
tobacco_value_count = len(unique_tobacco)
print(f"Unique values in Tobacco: {unique_tobacco}")
print(f"Number of unique Tobacco values: {tobacco_value_count}")

In [None]:
# Distinct PlanId values present after cleaning, and how many there are
unique_plan_ids = rate_df['PlanId'].unique()
plan_id_count = len(unique_plan_ids)
print(f"Unique values in PlanId: {unique_plan_ids}")
print(f"Number of unique PlanIds: {plan_id_count}")

In [None]:
# Distinct SourceName values present after cleaning, and how many there are
unique_source_names = rate_df['SourceName'].unique()
source_name_count = len(unique_source_names)
print(f"Unique values in SourceName: {unique_source_names}")
print(f"Number of unique SourceNames: {source_name_count}")

In [None]:
# One line per categorical column: its name and its number of distinct values
print("Unique value counts for each column:")
for categorical_column in ('StateCode', 'Tobacco', 'PlanId', 'SourceName'):
    print(f"{categorical_column}: {rate_df[categorical_column].nunique()}")

In [None]:
# Handling Categorical Data
# Fit one LabelEncoder per column. Re-fitting a single shared encoder replaces its
# classes_ on every call, so afterwards only the last column (SourceName) could
# be decoded or applied to new data. The per-column names below are also the
# ones the save cell at the end of the notebook pickles.
label_encoder_state = LabelEncoder()
label_encoder_tobacco = LabelEncoder()
label_encoder_plan = LabelEncoder()
label_encoder_source = LabelEncoder()

# Apply Label Encoding to the selected columns
rate_df['StateCode'] = label_encoder_state.fit_transform(rate_df['StateCode'])
rate_df['Tobacco'] = label_encoder_tobacco.fit_transform(rate_df['Tobacco'])
rate_df['PlanId'] = label_encoder_plan.fit_transform(rate_df['PlanId'])
rate_df['SourceName'] = label_encoder_source.fit_transform(rate_df['SourceName'])

# Backward compatibility: `label_encoder` previously ended up fitted on SourceName
label_encoder = label_encoder_source

In [None]:
# Scale numerical features with StandardScaler (fit and transform in one step)
# NOTE(review): 'Premium' is used as the regression target later in the notebook,
# so every downstream metric (MSE/RMSE/MAE) is in scaled units rather than in the
# original premium units. The scaler is also fitted on all rows before the
# train/test split, so test-set statistics influence the training features -
# consider fitting it on the training split only and leaving the target unscaled.
numerical_features = ['Age', 'Premium', 'RateDuration']
scaler = StandardScaler()
rate_df[numerical_features] = scaler.fit_transform(rate_df[numerical_features])

In [None]:
# Interaction feature: element-wise product of the 'Age' and 'RateDuration' columns
rate_df['Age_RateDuration_Interaction'] = rate_df['Age'].mul(rate_df['RateDuration'])

In [None]:
# Final DataFrame check
# A notebook cell only renders its last expression, so the bare `rate_df.head()`
# was evaluated and then silently discarded. display() (made available by
# IPython/Jupyter) renders it explicitly; dtypes is still shown as the cell output.
display(rate_df.head())
rate_df.dtypes

In [None]:
# EDA: Additional Plots

# 1. Histogram of Numerical Features
# One panel per column on a 2 x 3 grid: the scaled numerical columns plus the interaction term
histogram_columns = numerical_features + ['Age_RateDuration_Interaction']
plt.figure(figsize=(12, 6))
for panel_number, feature in enumerate(histogram_columns, start=1):
    plt.subplot(2, 3, panel_number)
    sns.histplot(rate_df[feature], bins=30, kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

In [None]:
# 2. Correlation Heatmap
# Pairwise correlations between the numerical columns and the interaction term
correlation_columns = numerical_features + ['Age_RateDuration_Interaction']
corr = rate_df[correlation_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 3. Pairplot of Selected Features
selected_features = ['Age', 'Premium', 'RateDuration', 'Age_RateDuration_Interaction']
pair_grid = sns.pairplot(rate_df[selected_features])
# pairplot creates its own figure holding a grid of axes. plt.title would label
# only the most recently drawn subplot, so the title goes on the figure itself;
# y > 1 lifts it above the top row of panels.
pair_grid.figure.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

In [None]:
# 4. Boxplot of Premium by StateCode
# StateCode holds the label-encoded integers at this point, not the state abbreviations
plt.figure(figsize=(12, 6))
sns.boxplot(data=rate_df, x='StateCode', y='Premium')
plt.xlabel('State Code')
plt.ylabel('Premium')
plt.xticks(rotation=45)
plt.title('Boxplot of Premium by StateCode')
plt.show()

In [None]:
# 5. Countplot of Tobacco Usage
# Number of rows per (label-encoded) Tobacco category
plt.figure(figsize=(8, 6))
sns.countplot(data=rate_df, x='Tobacco')
plt.xlabel('Tobacco Usage')
plt.ylabel('Count')
plt.title('Countplot of Tobacco Usage')
plt.show()

In [None]:
# 6. Scatterplot of Age vs. Premium
# Both axes show the standardized values produced by the scaling cell
plt.figure(figsize=(10, 6))
sns.scatterplot(data=rate_df, x='Age', y='Premium')
plt.xlabel('Age')
plt.ylabel('Premium')
plt.title('Scatterplot of Age vs. Premium')
plt.show()

# Demo Model 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

In [None]:
# Feature selection: the model uses only these columns; 'Premium' is the target
features = [
    'StateCode',
    'SourceName',
    'Age',
    'Tobacco',
    'RateDuration',
    'Age_RateDuration_Interaction',
]
X, y = rate_df[features], rate_df['Premium']

# Train-test split: 80% train / 20% test, fixed seed for a repeatable partition
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Collects the test-set metrics of every model, keyed by model name
model_scores = {}

# Linear Regression: fit on the training split (fit returns the estimator itself),
# then score the predictions on the held-out split
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = linear_model.predict(X_test)

mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

model_scores['Linear Regression'] = dict(zip(
    ('MSE', 'RMSE', 'MAE', 'R2 Score'),
    (mse_lr, rmse_lr, mae_lr, r2_lr),
))

In [None]:
# Lasso Regression with the estimator's default settings
lasso_model = Lasso()
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Test-set error metrics; RMSE is the square root of the MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)

model_scores.update({
    'Lasso Regression': {
        'MSE': mse_lasso,
        'RMSE': rmse_lasso,
        'MAE': mae_lasso,
        'R2 Score': r2_lasso,
    }
})

In [None]:
# Ridge Regression with the estimator's default settings
ridge_model = Ridge().fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Test-set error metrics
mse_ridge, mae_ridge, r2_ridge = (
    mean_squared_error(y_test, y_pred_ridge),
    mean_absolute_error(y_test, y_pred_ridge),
    r2_score(y_test, y_pred_ridge),
)
rmse_ridge = np.sqrt(mse_ridge)

ridge_scores = {'MSE': mse_ridge, 'RMSE': rmse_ridge, 'MAE': mae_ridge, 'R2 Score': r2_ridge}
model_scores['Ridge Regression'] = ridge_scores

In [None]:
# Random Forest Regression
# random_state fixes the bootstrap and feature sampling so the reported metrics
# and the later feature-importance / residual plots are the same on every run
# (42 matches the seed used elsewhere in this notebook).
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Test-set error metrics; RMSE is the square root of the MSE
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

model_scores['Random Forest'] = {
    'MSE': mse_rf,
    'RMSE': rmse_rf,
    'MAE': mae_rf,
    'R2 Score': r2_rf
}

In [None]:
# Gradient Boosting Regression
# random_state makes the fit repeatable, so the comparison table does not change
# between runs (42 matches the seed used elsewhere in this notebook).
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Test-set error metrics; RMSE is the square root of the MSE
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

model_scores['Gradient Boosting'] = {
    'MSE': mse_gb,
    'RMSE': rmse_gb,
    'MAE': mae_gb,
    'R2 Score': r2_gb
}

In [None]:
# KNN Regression with the estimator's default settings
knn_model = KNeighborsRegressor().fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Test-set error metrics; RMSE is the square root of the MSE
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

model_scores['KNN Regression'] = dict(zip(
    ['MSE', 'RMSE', 'MAE', 'R2 Score'],
    [mse_knn, rmse_knn, mae_knn, r2_knn],
))

In [None]:
# Display the results: one line per model, in insertion order
for model_name in model_scores:
    print(f"{model_name}: {model_scores[model_name]}")

In [None]:
# Hyper-parameter grid for the random forest: 3 x 3 x 3 = 27 combinations
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# Exhaustive search with 5-fold cross-validation on the training split, ranked by R^2
rf_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    scoring='r2',
    cv=5,
)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_


In [None]:
# Hyper-parameter grid for gradient boosting: 3 x 3 x 3 = 27 combinations
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
# Exhaustive search with 5-fold cross-validation on the training split, ranked by R^2
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=gb_param_grid,
    scoring='r2',
    cv=5,
)
gb_grid_search.fit(X_train, y_train)
best_gb_model = gb_grid_search.best_estimator_


In [None]:
# The tuned gradient-boosting estimator is the final model.
# GridSearchCV was created without a `refit` argument, so (refit defaults to True)
# best_estimator_ has already been refitted on the whole training split with the
# best parameters; fitting it again here would only repeat that training.
final_model = best_gb_model
y_final_pred = final_model.predict(X_test)

# Held-out performance of the final model
final_rmse = np.sqrt(mean_squared_error(y_test, y_final_pred))
final_r2 = r2_score(y_test, y_final_pred)

print("\nFinal Model Performance:")
print(f"RMSE: {final_rmse:.2f}")
print(f"R²: {final_r2:.2f}")


In [None]:
# Plot model performance: one group of bars per model, one bar per metric
# (transposing puts the model names on the index, i.e. on the x axis)
results_df = pd.DataFrame(model_scores).transpose()
results_df.plot.bar(figsize=(12, 6))
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.title('Model Performance Comparison')
plt.show()

In [None]:
# Feature Importance using Random Forest
# Bars are ordered from the most to the least important feature of rf_model
importances = rf_model.feature_importances_
feature_names = features
sorted_indices = np.argsort(importances)[::-1]

bar_positions = range(X.shape[1])
ordered_labels = np.array(feature_names)[sorted_indices]

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, importances[sorted_indices], align='center')
plt.xticks(bar_positions, ordered_labels, rotation=45)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.show()

In [None]:
# Residual Plot for Random Forest
# Residual = actual - predicted; the dashed red line marks a residual of zero
residuals = y_test.sub(y_pred_rf)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred_rf, y=residuals)
plt.axhline(y=0, linestyle='--', color='r')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot for Random Forest')
plt.show()

In [None]:
# Save the model and label encoders
# The name `model` is never defined in this notebook, so pickling it raised
# NameError. The tuned estimator chosen for deployment is bound to `final_model`,
# which is what gets persisted here.
with open('model.pkl', 'wb') as file:
    pickle.dump(final_model, file)

# NOTE(review): label_encoder_state / label_encoder_source / label_encoder_tobacco
# must be the LabelEncoder instances fitted on StateCode / SourceName / Tobacco
# in the categorical-encoding cell - confirm they are defined there.
with open('label_encoder_state.pkl', 'wb') as file:
    pickle.dump(label_encoder_state, file)

with open('label_encoder_source.pkl', 'wb') as file:
    pickle.dump(label_encoder_source, file)

with open('label_encoder_tobacco.pkl', 'wb') as file:
    pickle.dump(label_encoder_tobacco, file)

In [None]:
from sklearn.model_selection import train_test_split

# Define features and target: this time every column except 'Premium' is a feature
# (this rebinds the X / y and split variables created by the earlier modelling cells)
y = rate_df['Premium']
X = rate_df.drop(columns=['Premium'])

# Split the data: 80% train / 20% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Linear regression on the full feature set (fit returns the fitted estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and compute the error metrics
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print(f"Linear Regression - RMSE: {rmse_lr}, MAE: {mae_lr}, R2: {r2_lr}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Random forest (100 trees, fixed seed) on the full feature set
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out split and compute the error metrics
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print(f"Random Forest Regression - RMSE: {rmse_rf}, MAE: {mae_rf}, R2: {r2_rf}")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Gradient boosting (100 stages, fixed seed) on the full feature set
gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out split and compute the error metrics
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)

print(f"Gradient Boosting Regression - RMSE: {rmse_gb}, MAE: {mae_gb}, R2: {r2_gb}")

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# K-nearest-neighbours regression (k = 5) on the full feature set
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Predict on the held-out split and compute the error metrics
y_pred_knn = knn_model.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

print(f"K-Nearest Neighbors Regression - RMSE: {rmse_knn}, MAE: {mae_knn}, R2: {r2_knn}")