<a href="https://colab.research.google.com/github/fjadidi2001/Artificial_Intelligence_Learning/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bring the dataset into the session. `google.colab` only exists inside
# Colab, so guard the import: elsewhere the CSV is expected to already sit in
# the working directory and no upload dialog is shown.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Upload the file (opens Colab's file picker; returns {filename: bytes})
    uploaded = files.upload()
else:
    # Keep `uploaded` defined so later cells can reference it either way.
    uploaded = {}
    print("google.colab is not available; expecting the CSV in the working directory.")

In [None]:
import pandas as pd

# Load the CSV into the DataFrame that every later cell works on.
DATASET_PATH = 'telematics_syn.csv'
df = pd.read_csv(DATASET_PATH)

# Report the (rows, columns) size of the loaded data.
print("Shape of the dataset:", df.shape)

In [None]:
# Widen pandas' console output so wide frames print in full.
display_settings = {
    'display.max_columns': None,         # no cap on the number of columns shown
    'display.expand_frame_repr': False,  # keep each row on a single line
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Preview the first rows of the dataframe.
print(df.head())

In [None]:
# Descriptive statistics as computed by DataFrame.describe().
summary_statistics = df.describe()
print("Summary statistics:\n", summary_statistics)

In [None]:
# Count the null entries in each column.
missing_counts = df.isna().sum()
print("Missing values:\n", missing_counts)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per-column count of null entries in df.
missing_values = df.isna().sum()

# One bar per column; the bar height is that column's null count.
ax = missing_values.plot.bar(figsize=(15, 6))
ax.set_title('Missing Values in Dataset')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Columns explored below: every int64/float64 column, plus a fixed list of
# categorical columns.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = ['Insured.sex', 'Marital', 'Car.use', 'Region']

# Plot from a reproducible 10% random sample to keep rendering fast.
sample_size = int(len(df) * 0.1)
df_sample = df.sample(n=sample_size, random_state=42)

# Only the first five numerical columns are plotted; every figure is 10x4.
plotted_numeric = numerical_columns[:5]
figure_size = (10, 4)

# Distribution of each plotted numerical column (histogram with a KDE overlay).
for numeric_name in plotted_numeric:
    fig, ax = plt.subplots(figsize=figure_size)
    sns.histplot(data=df_sample, x=numeric_name, kde=True, ax=ax)
    ax.set(title=f'Histogram of {numeric_name}', xlabel=numeric_name, ylabel='Frequency')
    plt.show()

# Spread and outliers of each plotted numerical column.
for numeric_name in plotted_numeric:
    fig, ax = plt.subplots(figsize=figure_size)
    sns.boxplot(data=df_sample, x=numeric_name, ax=ax)
    ax.set(title=f'Box Plot of {numeric_name}', xlabel=numeric_name)
    plt.show()

# Level counts of each categorical column, most frequent level first.
for category_name in categorical_columns:
    level_order = df_sample[category_name].value_counts().index
    fig, ax = plt.subplots(figsize=figure_size)
    sns.countplot(data=df_sample, y=category_name, order=level_order, ax=ax)
    ax.set(title=f'Bar Plot of {category_name}', xlabel='Frequency', ylabel=category_name)
    plt.show()

# Each plotted numerical column broken down by each categorical column.
for numeric_name in plotted_numeric:
    for category_name in categorical_columns:
        fig, ax = plt.subplots(figsize=figure_size)
        sns.boxplot(data=df_sample, x=category_name, y=numeric_name, ax=ax)
        ax.set(
            title=f'Box Plot of {numeric_name} by {category_name}',
            xlabel=category_name,
            ylabel=numeric_name,
        )
        plt.show()



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Restrict the heatmap to the first ten int64/float64 columns so the
# annotated cells stay readable.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
subset_columns = numerical_columns[:10]

# Pairwise correlations between the selected columns.
correlation_matrix = df.loc[:, subset_columns].corr()

# Annotated heatmap, one cell per column pair, values shown to two decimals.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Install pytorch_tabnet, which provides the TabNetClassifier imported in the
# next cell. The leading "!" runs the command in a shell (IPython/Colab syntax).
!pip install pytorch_tabnet

In [None]:
# Necessary imports
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    matthews_corrcoef, log_loss, confusion_matrix
)
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the ML libraries.
warnings.simplefilter('ignore')


In [None]:
# Separate the prediction target ('NB_Claim') from the feature matrix.
# NOTE(review): any other claim-derived column left in X would leak the
# target into the features - confirm against the dataset schema.
target_column = 'NB_Claim'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Identify categorical and numerical columns by dtype
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Numerical features: standardize with StandardScaler
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical features: one-hot encode to a dense array, ignoring categories
# unseen during fit. scikit-learn 1.2 renamed `sparse` to `sparse_output` and
# 1.4 removed the old keyword, so try the new spelling first and fall back
# for older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = Pipeline(steps=[
    ('onehot', onehot_encoder)
])

# Combine preprocessing steps: each transformer is applied to its own columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Candidate classifiers, all with default hyperparameters, keyed by the
# display name used in the result dictionaries.
models = {
    'Logistic Regression': LogisticRegression(),
    'XGBoost': XGBClassifier(),
    'TabNet': TabNetClassifier()
}

# X_train / X_test / y_train / y_test are produced by the split cell directly
# above (test_size=0.2, random_state=42). The identical second split that used
# to live here recomputed the same partition and has been removed.

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and score it on the test split.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training features passed to ``fit``.
        X_test: Test features passed to ``predict``.
        y_train: Training labels passed to ``fit``.
        y_test: True labels the predictions are compared against.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    # fit() is called for its side effect only; its return value is not used.
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

# Baseline run: every model is wrapped in a pipeline behind `preprocessor`
# and scored by test-set accuracy.
results = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    accuracy = evaluate_model(pipeline, X_train, X_test, y_train, y_test)
    results.update({name: accuracy})

print("Initial Model Performance with Raw Data and Default Hyperparameters:")
print(results)

# Second run: same models and default hyperparameters, but numerical features
# are rescaled with MinMaxScaler instead of StandardScaler for comparison.
# (No hyperparameter tuning is performed in this cell.)

# MinMaxScaler is not part of the notebook's import cell, so import it here;
# without this the cell fails with NameError.
from sklearn.preprocessing import MinMaxScaler

numerical_transformer_minmax = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

# Combine preprocessing steps with MinMaxScaler; the categorical branch is
# the same one-hot transformer used in the baseline preprocessor.
preprocessor_minmax = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_minmax, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Score each model by test-set accuracy behind the MinMax preprocessor.
results_minmax = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor_minmax),
                               ('model', model)])
    accuracy = evaluate_model(pipeline, X_train, X_test, y_train, y_test)
    results_minmax[name] = accuracy

print("Model Performance with MinMaxScaler and Default Hyperparameters:")
print(results_minmax)