In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Read the three tab-separated input files; they are parsed without a header row.
tsv_options = {'sep': '\t', 'header': None}

train_data = pd.read_csv('../data/raw/train/ticdata2000.txt', **tsv_options)
eval_data = pd.read_csv('../data/raw/eval/ticeval2000.txt', **tsv_options)
# The targets file is given an explicit column name.
target_data = pd.read_csv('../data/raw/eval/tictgts2000.txt', names=['Target'], **tsv_options)


In [None]:
# Per-column count of missing entries in the training data
missing_values = train_data.isna().sum()
missing_values

In [None]:

import re

# Read the raw text of dictionary.txt
dictionary_path = '../docs/insurance+company+benchmark+coil+2000/dictionary.txt'
with open(dictionary_path, 'r', encoding='ISO-8859-1') as file:
    file_content = file.read()

# Each entry is captured as (number, upper-case name, description). The lazy
# description group stops right before the next "<digits><whitespace>" run or "L0:".
pattern = re.compile(r"(\d+)\s+([A-Z]+[A-Z0-9]*)\s+(.+?)(?=\d+\s+|L0:)", re.DOTALL)
matches = pattern.findall(file_content)

# Build the table, flatten line breaks inside descriptions, and store 'Nr' as integers
df = pd.DataFrame(matches, columns=['Nr', 'Name', 'Description']).assign(
    Description=lambda table: table['Description'].str.replace('\n', ' ').str.strip(),
    Nr=lambda table: table['Nr'].astype(int),
)

# Show the parsed data dictionary
df


In [5]:

# Extract every lookup table (L0, L1, ...) from the dictionary text.
# A table is matched as "L<n>:" + blank line + rows + terminating blank line.
l_tables_pattern = re.compile(r"(L\d+):\n\n(.*?)\n\n", re.DOTALL)
l_tables_matches = l_tables_pattern.findall(file_content)

# Maps table name (e.g. 'L0') -> DataFrame with 'Value' and 'Label' columns
l_tables_dict = {}

for table_name, table_content in l_tables_matches:
    lines = table_content.strip().split('\n')

    # Split each row once into (value, label); rows without a label part are skipped
    split_rows = (line.split(maxsplit=1) for line in lines)
    data = [parts for parts in split_rows if len(parts) == 2]

    df1 = pd.DataFrame(data, columns=['Value', 'Label'])
    l_tables_dict[table_name] = df1

# Drop the first captured row of L0 (presumably the table's "Value Label"
# header line -- verify against dictionary.txt).
l_tables_dict['L0'] = l_tables_dict['L0'].iloc[1:].reset_index(drop=True)

# Kept as the LAST expression so the notebook actually displays the extracted
# table names; in the middle of the cell the value was silently discarded.
l_tables_dict.keys()


In [None]:
# Use the 'Name' entries of the data dictionary as the column labels of train_data
new_column_names = list(df['Name'])
train_data.columns = new_column_names

# Preview the relabelled frame
train_data.head()


In [None]:
# Step 1: Build a {value: label} lookup for each L table.
# The loop variable is deliberately not called `df`: that name holds the data
# dictionary used by the column-renaming cell, and rebinding it here would make
# that cell fail on a re-run.
l_mapping_dict = {}

for l_name, l_table_df in l_tables_dict.items():
    # Keys are stored as strings because the data columns are converted to
    # strings before the lookup below.
    l_mapping_dict[l_name] = {
        str(value): label
        for value, label in zip(l_table_df['Value'], l_table_df['Label'])
    }

# Step 2: Define a function to replace values using the appropriate mapping
def replace_values(df, column, mapping_dict):
    """
    Replace values in a DataFrame column using a mapping dictionary.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to replace.
        column (str): The column name to replace values for.
        mapping_dict (dict): The dictionary to use for replacement.

    Returns:
        pd.Series: The column with replaced values. Values that have no key
        in mapping_dict become NaN (behaviour of Series.map).
    """
    return df[column].map(mapping_dict)

# Which L table decodes which column
column_to_mapping = {
    'MOSTYPE': 'L0',
    'MGEMLEEF': 'L1',
    'MOSHOOFD': 'L2',
    'MGODRK': 'L3',
    'MGODPR': 'L3',
    'MGODOV': 'L3',
    'MGODGE': 'L3',
    'PWAPART': 'L4',
    'PWABEDR': 'L4',
    'PWALAND': 'L4',
}
categorical_columns = list(column_to_mapping.keys())

# Step 3: Replace the numeric codes with their labels.
for column, l_table in column_to_mapping.items():
    if column not in train_data.columns:  # Ensure the column exists in the dataset
        continue
    if not pd.api.types.is_numeric_dtype(train_data[column]):
        # Column no longer holds numeric codes (e.g. this cell already ran).
        # Mapping it again would turn every label into NaN, so leave it alone.
        continue
    # The lookup keys are strings, so compare against the string form of the codes
    train_data[column] = train_data[column].astype(str)
    train_data[column] = replace_values(train_data, column, l_mapping_dict[l_table])

# Step 4: Verify the replacement
train_data.head()


In [None]:
# Dimensions of the training data as (rows, columns)
train_data.shape

In [None]:
# Number of rows for each distinct value of the CARAVAN column
train_data['CARAVAN'].value_counts()

In [None]:
# EDA - Distribution of the target variable
plt.figure(figsize=(8, 4))
# Pass the column by keyword: the first positional parameter of countplot is
# `data` in current seaborn releases, so a positional Series is not treated as
# the x variable. This also matches the categorical frequency plots below.
sns.countplot(x=train_data['CARAVAN'])
plt.title('Distribution of Caravan Insurance Policy (Target Variable)')
plt.xlabel('Has Caravan Insurance')
plt.ylabel('Count')
plt.show()

In [None]:
# Correlation analysis
# Checking the correlation between features and the target.
# Only numeric columns are used: the columns decoded to text labels earlier
# cannot be correlated, and DataFrame.corr() raises on them in pandas >= 2.0.
numeric_data = train_data.select_dtypes(include='number')
correlation = numeric_data.corr()['CARAVAN'].sort_values(ascending=False)

# Undefined correlations (NaN, e.g. constant columns) sort last and would
# otherwise fill the "negative" listing.
ranked_correlation = correlation.dropna()

print("\nTop 10 features positively correlated with CARAVAN:")
print(ranked_correlation.head(11))  # 11 rows: 'CARAVAN' itself plus the top 10

print("\nTop 10 features negatively correlated with CARAVAN:")
print(ranked_correlation.tail(10))

In [None]:
# Correlation matrix for the first 43 columns and the CARAVAN column
plt.figure(figsize=(18, 14))

first_block = train_data.iloc[:, :43].join(train_data['CARAVAN'])
# Columns decoded to text labels are left out: they cannot be correlated and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
subset_corr = first_block.select_dtypes(include='number').corr()

sns.heatmap(subset_corr, cmap='coolwarm', annot=False, fmt=".2f")
plt.title('Correlation Matrix of the First 43 Attributes and CARAVAN')
plt.show()

In [None]:
# Correlation matrix for the last 43 columns and the CARAVAN column
plt.figure(figsize=(18, 14))

last_block = train_data.iloc[:, -44:]  # Includes the last 43 columns and CARAVAN
# Columns decoded to text labels are left out: they cannot be correlated and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
subset_corr_last = last_block.select_dtypes(include='number').corr()

sns.heatmap(subset_corr_last, cmap='coolwarm', annot=False, fmt=".2f")
plt.title('Correlation Matrix of the Last 43 Attributes and CARAVAN')
plt.show()


In [None]:
# Descriptive statistics as computed by DataFrame.describe(), printed under a heading
summary_stats = train_data.describe()
print("Statistical Summary:", summary_stats, sep="\n")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grid of histograms, one panel per column that DataFrame.hist can plot
train_data.hist(bins=20, figsize=(20, 15))
plt.suptitle('Histograms of Numerical Features')
plt.show()

# One stacked histogram per feature, split by the CARAVAN value
feature_columns = [name for name in train_data.columns if name != 'CARAVAN']
for feature in feature_columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(data=train_data, x=feature, hue='CARAVAN', multiple='stack', bins=20)
    plt.title(f'Distribution of {feature} for Caravan and Non-Caravan Customers')
    plt.show()


In [None]:
# One boxplot per non-object column (CARAVAN excluded) to eyeball outliers
for col in train_data.columns:
    is_object_column = train_data[col].dtype == 'object'
    if is_object_column or col == 'CARAVAN':
        continue

    plt.figure(figsize=(10, 6))
    sns.boxplot(x=train_data[col])
    plt.title(f'Boxplot of {col}')
    plt.show()


In [None]:
# Collect the object-dtype columns and draw a count plot for each of them
categorical_columns = []
for name in train_data.columns:
    if train_data[name].dtype == 'object':
        categorical_columns.append(name)

for col in categorical_columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=train_data[col])
    plt.title(f'Frequency of {col}')
    plt.xticks(rotation=45)  # rotate the tick labels
    plt.show()

from IPython.display import display

# For every categorical column, tabulate its values against CARAVAN
for col in categorical_columns:
    cross_tab = pd.crosstab(index=train_data[col], columns=train_data['CARAVAN'])
    heading = f'Cross-tabulation for {col} with CARAVAN:'
    print(heading)
    display(cross_tab)  # rendered as a table by the notebook


In [None]:
# Display train_data in its current state (after the column renaming / label mapping cells, if they ran)
train_data

In [None]:
# Report the (rows, columns) shape of the training data
print(f"Data shape: {train_data.shape}")


In [35]:
# Re-read the training file from disk. With header=None the columns get default
# integer labels, which the preprocessing cell below relies on (it selects column 85).
# NOTE(review): this path differs from the one used in the first loading cell
# ('../data/raw/train/ticdata2000.txt') -- confirm both point at the same data.
train_file_path = '../docs/insurance+company+benchmark+coil+2000/ticdata2000.txt'
train_data = pd.read_csv(train_file_path, sep='\t', header=None)

In [32]:
# Name of the target column.
# NOTE(review): not referenced by any other cell visible in this notebook.
target_column = 'CARAVAN' 

In [None]:
# TODO: Decide whether this preprocessing step is really needed.
# TODO: If it is, repeat it for eval_data and target_data.

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Step 1: Separate features and target
X = train_data.drop(columns=[85])  # Drop the target column
y = train_data[85]  # Column 85 represents the "CARAVAN" target


# Step 2: Identify categorical and numerical features
categorical_features = list(range(43))  # Columns 0-42 are considered categorical
numerical_features = list(range(43, 85))  # Columns 43-84 are considered numerical

# Step 3: Preprocessing pipelines for numerical and categorical data.
# No imputation step is included: per the author's note there are no missing values.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())  # Normalize numerical features
])

categorical_pipeline = Pipeline([
    # Categories unseen during fit are encoded as all zeros instead of raising
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Step 4: Combine preprocessing steps
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Step 5: Dimensionality reduction using TruncatedSVD.
# TruncatedSVD's default solver is randomized; a fixed random_state makes the
# reduced features reproducible across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=42)

# Step 6: Complete pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('reduce_dim', svd)
])

# Apply the pipeline to the training data
X_processed = pipeline.fit_transform(X)

# Display the shape of the processed data
print("Shape of the processed data:", X_processed.shape)
