STEP 1 : DATA CLEANING AND PREPROCESSING

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv("Marriage_Divorce_DB.csv")

In [None]:
# Augmentation settings: how many noisy copies of the dataset to create and
# the standard deviation of the Gaussian noise added to each feature value.
num_augmentations = 2
noise_std = 0.05

# Split the raw frame into the prediction target and the remaining columns.
target_col = 'Divorce Probability'
target = df[target_col]
features = df.drop(columns=[target_col])

In [None]:
# Seed the generator so the augmented dataset (and everything derived from
# it) is reproducible, in line with the random_state=42 used elsewhere in
# this notebook.
rng = np.random.default_rng(42)

augmented_data = []
augmented_target = []

for _ in range(num_augmentations):
    # Zero-mean Gaussian jitter, drawn independently for every feature cell.
    noise = rng.normal(loc=0, scale=noise_std, size=features.shape)
    augmented_data.append(features + noise)
    # Each noisy copy keeps the target of the row it was derived from.
    augmented_target.append(target)

In [None]:
# Stack the original rows on top of every noisy copy and renumber the index.
X_augmented = pd.concat([features, *augmented_data], ignore_index=True)
y_augmented = pd.concat([target, *augmented_target], ignore_index=True)

# Final augmented DataFrame: a new frame holding the features with the
# target appended as an additional column.
df_augmented = X_augmented.assign(**{target_col: y_augmented})

In [None]:
# Persist the augmented dataset (without the index column).
df_augmented.to_csv("augmented_marriage_divorce_data.csv", index=False)

# Report the row count before and after augmentation.
for label, frame in (("Original samples:", df), ("Augmented samples:", df_augmented)):
    print(label, len(frame))

In [None]:
# Configure the pandas display options BEFORE printing, so the preview below
# already benefits from them instead of being truncated.

# Show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# To avoid width truncation
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print(df_augmented.head())

In [None]:
# Show the dtype of every column in the augmented frame.
print(df_augmented.dtypes)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df_augmented.info()  # Data types and non-null counts
print(df_augmented.describe())  # Summary statistics for numerical columns

In [None]:
# Boolean mask of missing cells, computed once and used for both the
# per-column counts and the heatmap.
missing_mask = df_augmented.isnull()
print(missing_mask.sum()) # Check missing values

# Visualize missing values
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Value Heatmap')
plt.show()

In [None]:
# Check and remove duplicates
print("Duplicates:", df_augmented.duplicated().sum())
# Drop them from the augmented frame that was just inspected and that the
# following cells work on; the index is renumbered to stay contiguous.
df_augmented = df_augmented.drop_duplicates().reset_index(drop=True)

In [None]:
# Boxplot + Outlier detection
# Labels of every numeric column in the augmented frame.
numeric_cols = df_augmented.select_dtypes(include=np.number).columns

# One thin horizontal boxplot per numeric column.
for column_name in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 1))
    sns.boxplot(data=df_augmented, x=column_name, ax=ax)
    ax.set_title(f'Boxplot: {column_name}')
    plt.show()

# IQR Outlier Detection Function
def detectoutliers_iqr(data, col, *, factor=1.5):
    """Return the rows of ``data`` whose ``col`` value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame to inspect.
        col: Label of the numeric column to test.
        factor: Multiplier applied to the IQR to place the fences. The
            default of 1.5 reproduces the previous hard-coded behaviour.

    Returns:
        The subset of ``data`` (original index preserved) whose ``col`` value
        falls outside the fences; an empty DataFrame when there is none.
    """
    values = data[col]
    q1, q3 = values.quantile([0.25, 0.75])
    reach = factor * (q3 - q1)
    lower, upper = q1 - reach, q3 + reach
    # Explicit comparisons keep NaN rows out of the result: both comparisons
    # evaluate to False for NaN.
    return data[(values < lower) | (values > upper)]

# Apply to numeric columns
# Report, per numeric column, how many rows the IQR rule flags.
for col in numeric_cols:
    flagged_count = len(detectoutliers_iqr(df_augmented, col))
    print(f"{col}: {flagged_count} outliers")

In [None]:
from scipy.stats import zscore

# Absolute z-score of every value in the numeric columns.
numeric_frame = df_augmented[numeric_cols]
z_scores = np.abs(zscore(numeric_frame))

# A value is flagged when it is more than 3 standard deviations from the
# column mean; summing over axis 0 gives the count per column.
outliers_z = z_scores > 3
print("Z-score based outlier counts:")
print(outliers_z.sum(axis=0))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Build the feature matrix: everything except the target. A leftover
# 'outlier' column from a previous run of this cell is removed as well, so
# re-running the cell never feeds old outlier labels back in as a feature.
feature_frame = (
    df_augmented
    .drop(columns='Divorce Probability')
    .drop(columns='outlier', errors='ignore')
)

# Scale numeric data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_frame)

# Isolation Forest; the result is stored per row in the 'outlier' column,
# where the cells below treat -1 as "outlier".
iso = IsolationForest(contamination=0.05, random_state=42)
df_augmented['outlier'] = iso.fit_predict(X_scaled)

In [None]:
# Rows whose 'outlier' column holds -1.
outlier_rows = df_augmented[df_augmented['outlier'] == -1]

print("Number of outliers detected:", len(outlier_rows))
print("Sample of detected outliers:")
print(outlier_rows.head())

In [None]:
# Keep only the rows not marked -1, then discard the helper column.
inlier_mask = df_augmented['outlier'] != -1
df_cleaned = df_augmented.loc[inlier_mask].drop(columns=['outlier'])
df_cleaned.info()

In [None]:
# Compute the Pearson correlation matrix
correlation_matrix = df_cleaned.corr(method='pearson')

# Visualize with a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Pearson Correlation Heatmap (df_cleaned)')
plt.show()

# Absolute correlation of every feature with the target, strongest first.
target_col = 'Divorce Probability'
threshold = 0.5  # Adjust this as needed
top_n = 10
correlation_target = correlation_matrix[target_col].drop(target_col)
correlation_sorted = correlation_target.abs().sort_values(ascending=False)
print("Correlation of features with 'Divorce Probability':")
print(correlation_sorted)

# Informational only: features whose |correlation| reaches the threshold.
# The label is built from `threshold` so it cannot drift from the value used.
features_above_threshold = correlation_sorted[correlation_sorted >= threshold].index.tolist()
print(f"\nFeatures with |correlation| >= {threshold}:")
print(features_above_threshold)

# The features actually kept for the following steps: the top_n by
# absolute correlation.
selected_features = correlation_sorted.head(top_n).index.tolist()
print("Top", top_n, "features most correlated with 'Divorce Probability':")
print(selected_features)

# Take an explicit copy so later column assignments on df_selected do not
# operate on a slice of df_cleaned (avoids SettingWithCopyWarning).
df_selected = df_cleaned[selected_features + [target_col]].copy()

In [None]:
# Print the selected-feature frame; how much of it is shown depends on the
# pandas display options currently in effect.
print(df_selected)

STEP 2 : EXPLORATORY DATA ANALYSIS

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Display the first 5 rows of the preprocessed data (head() defaults to 5);
# the bare expression is what the notebook renders as the cell output.
df_selected.head()

In [None]:
# Summary statistics of the preprocessed data; the bare expression is what
# the notebook renders as the cell output.
df_selected.describe()

In [None]:
# Correlation Analysis

# Pairwise correlation of the selected features and the target
# (DataFrame.corr with its default settings).
correlation_matrix = df_selected.corr()

# Annotated heatmap of the correlation matrix.
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Scatter & Regression Plot
# NOTE(review): assumes 'Age Gap' is one of the columns kept in df_selected
# — confirm, since the columns are chosen by correlation rank.
age_gap = df_selected['Age Gap']
divorce_probability = df_selected['Divorce Probability']

# Scatter plot between Age Gap and Divorce Probability
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x=age_gap, y=divorce_probability, ax=ax)
ax.set_title("Age Gap vs Divorce Probability")
ax.set_xlabel("Age Gap")
ax.set_ylabel("Divorce Probability")
plt.show()

# Regression plot between Age Gap and Divorce Probability
# (orange points, blue fitted line)
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(
    x=age_gap,
    y=divorce_probability,
    ax=ax,
    scatter_kws={'color': 'orange'},
    line_kws={'color': 'blue'},
)
ax.set_title("Age Gap vs Divorce Probability with Regression Line")
ax.set_xlabel("Age Gap")
ax.set_ylabel("Divorce Probability")
plt.show()

In [None]:
# NOTE(review): assumes 'Education' is one of the columns kept in
# df_selected — confirm, since the columns are chosen by correlation rank.
education = df_selected['Education']
divorce_probability = df_selected['Divorce Probability']

# Scatter plot between Education and Divorce Probability
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x=education, y=divorce_probability, ax=ax)
ax.set_title("Education vs Divorce Probability")
ax.set_xlabel("Education")
ax.set_ylabel("Divorce Probability")
plt.show()

# Regression plot between Education and Divorce Probability
# (orange points, blue fitted line)
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(
    x=education,
    y=divorce_probability,
    ax=ax,
    scatter_kws={'color': 'orange'},
    line_kws={'color': 'blue'},
)
ax.set_title("Education vs Divorce Probability with Regression Line")
ax.set_xlabel("Education")
ax.set_ylabel("Divorce Probability")
plt.show()

STEP 3 : MACHINE LEARNING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import make_classification

In [None]:
# Work on an explicit copy so the column assignment below never targets a
# slice of another frame (avoids pandas' SettingWithCopyWarning).
df_selected = df_selected.copy()

# Turn the target into integer class labels with a vectorised cast.
# NOTE(review): the cast truncates toward zero rather than rounding —
# confirm that this is the intended way to bin the target.
df_selected['Divorce Probability'] = df_selected['Divorce Probability'].astype(np.int64)

In [None]:
# Number of rows per distinct target value; the bare expression is what the
# notebook renders as the cell output.
df_selected['Divorce Probability'].value_counts()

In [None]:
# Target vector and feature matrix for the models.
y = df_selected['Divorce Probability']
x = df_selected.drop(columns=['Divorce Probability'])

In [None]:
# Make sure the labels are int64 before splitting.
y = y.apply(np.int64)

# Shuffled 80/20 split with a fixed seed.
# NOTE(review): the data contains noisy copies of every original row, so a
# purely random split can place a row and its augmented copy on opposite
# sides — verify whether that leakage is acceptable for the reported scores.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

print(x_train.shape, x_test.shape)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
stand = StandardScaler()
x_train_stand = stand.fit_transform(x_train)
x_test_stand = stand.transform(x_test)

In [None]:
# Linear-kernel SVM. Standardisation is bundled into the estimator so that
# fit() and every later predict() call apply the same scaling, learned from
# the training data only; callers keep passing the raw x_train / x_test.
classifier = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', svm.SVC(kernel='linear')),
])
classifier.fit(x_train, y_train)

In [None]:
# Predictions of the linear-kernel classifier on both splits.
x_train_prediction = classifier.predict(x_train)
x_test_prediction = classifier.predict(x_test)

# Accuracy, with the arguments in (y_true, y_pred) order; the score itself
# does not depend on the order.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print('Accuracy score of train data is :', training_data_accuracy)
print('Accuracy score of the test data is :', test_data_accuracy)

In [None]:
# Polynomial-kernel SVM. Standardisation is bundled into the estimator so
# that fit() and every later predict() call apply the same scaling, learned
# from the training data only; callers keep passing the raw x_train / x_test.
classifier_poly = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', svm.SVC(kernel='poly')),
])
classifier_poly.fit(x_train, y_train)

In [None]:
# Predictions of the polynomial-kernel classifier on both splits.
x_train_prediction_poly = classifier_poly.predict(x_train)
x_test_prediction_poly = classifier_poly.predict(x_test)

# Accuracy, with the arguments in (y_true, y_pred) order; the score itself
# does not depend on the order.
training_data_accuracy_poly = accuracy_score(y_train, x_train_prediction_poly)
test_data_accuracy_poly = accuracy_score(y_test, x_test_prediction_poly)

print('Accuracy score of train data is :', training_data_accuracy_poly)
print('Accuracy score of the test data is :', test_data_accuracy_poly)

In [None]:
# RBF-kernel SVM. Standardisation is bundled into the estimator so that
# fit() and every later predict() call apply the same scaling, learned from
# the training data only; callers keep passing the raw x_train / x_test.
classifier_rbf = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', svm.SVC(kernel='rbf')),
])
classifier_rbf.fit(x_train, y_train)

In [None]:
# Predictions of the RBF-kernel classifier on both splits.
x_train_prediction_rbf = classifier_rbf.predict(x_train)
x_test_prediction_rbf = classifier_rbf.predict(x_test)

# Accuracy, with the arguments in (y_true, y_pred) order; the score itself
# does not depend on the order.
training_data_accuracy_rbf = accuracy_score(y_train, x_train_prediction_rbf)
test_data_accuracy_rbf = accuracy_score(y_test, x_test_prediction_rbf)

print('Accuracy score of train data is :', training_data_accuracy_rbf)
print('Accuracy score of the test data is :', test_data_accuracy_rbf)

In [None]:
# GridSearchCV
# Define the parameter grid. Keys carry the 'svc__' prefix because the SVC
# is the 'svc' step of a pipeline. gamma is only searched for the rbf/poly
# kernels; the linear kernel ignores it, so varying it there would only
# repeat identical fits.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10],
    },
    {
        'svc__kernel': ['rbf', 'poly'],
        'svc__C': [0.1, 1, 10],
        'svc__gamma': ['scale', 'auto'],
    },
]

# Scaling lives inside the pipeline so it is re-fitted on the training part
# of every CV fold, and best_model can be called on the raw x_test.
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

grid = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
grid.fit(x_train, y_train)

# Best model and its performance
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

STEP 4 : EVALUATION

In [None]:
# Score the tuned model on the held-out split.
y_pred = best_model.predict(x_test)

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Test Accuracy:", test_accuracy)
print("Classification Report:\n", report)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate model performance
# NOTE(review): these are regression metrics computed on y_test / y_pred,
# which the cells above produce as integer class labels — confirm that
# treating the labels as numeric distances is intended.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
# The label previously contained a mis-decoded superscript two.
print(f"R-squared (R²) Score: {r2:.2f}")
